Add scoring code

main
PeterAlbus 9 months ago
parent c06f9a3684
commit 148a9e1de0

@@ -34,7 +34,8 @@ def main(args):
             k=args.ntrain,
             model_name=args.model_name,
             device=device,
-            finetune=fine_tune_model
+            finetune=fine_tune_model,
+            finetune_method=args.finetune_method
         )
     elif "chatglm" in args.model_name:
         if args.cuda_device:
@@ -67,21 +68,22 @@ def main(args):
     subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
     qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
-    for subject_name in subject_list:
-        print("Now testing: " + subject_name)
-        # subject_name=args.subject
-        val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
-        val_df = pd.read_csv(val_file_path)
-        if args.few_shot:
-            dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
-            dev_df = pd.read_csv(dev_file_path)
-            correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
-                                                   save_result_dir=save_result_dir, cot=args.cot)
-        else:
-            correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
-                                                   save_result_dir=save_result_dir)
-        print("Acc:", correct_ratio)
+    # qa_subject_list = ['car_use', 'car_market']
+    # for subject_name in subject_list:
+    #     print("Now testing: " + subject_name)
+    #     # subject_name=args.subject
+    #     val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
+    #     val_df = pd.read_csv(val_file_path)
+    #     if args.few_shot:
+    #         dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
+    #         dev_df = pd.read_csv(dev_file_path)
+    #         correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
+    #                                                save_result_dir=save_result_dir, cot=args.cot)
+    #     else:
+    #         correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
+    #                                                save_result_dir=save_result_dir)
+    #     print("Acc:", correct_ratio)
     for subject_name in qa_subject_list:
         print("Now testing: " + subject_name)
@@ -102,5 +104,6 @@ if __name__ == "__main__":
     # parser.add_argument("--subject","-s",type=str,default="operating_system")
     parser.add_argument("--cuda_device", type=str)
     parser.add_argument("--finetune", type=str)
-    args = parser.parse_args()
-    main(args)
+    parser.add_argument("--finetune_method", type=str)
+    user_args = parser.parse_args()
+    main(user_args)
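For readers tracking the new flag, here is a stand-alone sketch of the argument wiring this hunk introduces. Only the flags visible in the diff are reproduced; the example values "pt1" and "ptuning" are placeholders for illustration ("ptuning" and "lora" are the two methods recognised by ChatGLM_Evaluator later in this diff), not values taken from the commit.

# Stand-alone sketch (not part of the commit): exercises the flags defined above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cuda_device", type=str)
parser.add_argument("--finetune", type=str)          # checkpoint folder name under lora/ or ptuning/
parser.add_argument("--finetune_method", type=str)   # "lora" or "ptuning"
user_args = parser.parse_args(["--finetune", "pt1", "--finetune_method", "ptuning"])
print(user_args.finetune, user_args.finetune_method)  # -> pt1 ptuning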

@@ -8,6 +8,7 @@ from transformers.generation.utils import LogitsProcessorList
 from evaluators.evaluator import Evaluator
 from peft import PeftModel
 class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
@@ -15,57 +16,82 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
             scores[..., 5] = 5e4
         return scores
 class ChatGLM_Evaluator(Evaluator):
-    def __init__(self, choices, k, model_name, device, finetune=None):
+    def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
         super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
         # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
         # or directly clone the model
         self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna")
-        self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna", resume_download=True).half().to(device)
-        if finetune:
+        if finetune_method == "lora":
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
             peft_model_id = "lora/" + finetune
             self.model = PeftModel.from_pretrained(self.model, peft_model_id)
             print("Model loaded! use GLM2" + finetune)
+        elif finetune_method == "ptuning":
+            CHECKPOINT_PATH = "ptuning/" + finetune
+            config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, pre_seq_len=128)
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", config=config, trust_remote_code=True)
+            prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
+            new_prefix_state_dict = {}
+            for k, v in prefix_state_dict.items():
+                if k.startswith("transformer.prefix_encoder."):
+                    new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
+            self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
+            self.model = self.model.half().to(device)
+            self.model.transformer.prefix_encoder.float()
+            print("Model loaded! use GLM2 + " + finetune)
         else:
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
             print("Model loaded!(GLM2)")
         # self.model = self.model.eval()
     def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
         correct_num = 0
-        if save_result_dir:
-            if few_shot:
-                result = []
-                score = []
-                answer_list = []
+        result = []
+        score = []
+        answer_list = []
         if few_shot:
             history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
+            print(history)
         else:
-            history = []
+            # _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False)
+            history = [('接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。', '好的,我会尽力解答。')]
+            # print(history)
         answers = list(test_df['answer'])
         for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
             question = self.format_example(row, include_answer=False, cot=cot)
             if few_shot:
-                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
+                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
+                                              do_sample=False, history=history)
                 response = response.strip()
                 # For ChatGLM, we use answer extraction in answer-only mode too.
                 ans, direct_extract = self.extract_cot_answer(row, response)
             else:  # zero-shot by extracting answer from distribution
-                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048, history=history)
+                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
+                                              do_sample=False, history=history)
+                response = response.strip()
+                ans, direct_extract = self.extract_cot_answer(row, response)
+                print(response, ans)
+                # ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
+                #                          history=history)
             if ans == answers[row_index]:
                 correct_num += 1
                 correct = 1
             else:
                 correct = 0
             if save_result_dir:
-                if few_shot:
-                    result.append(response)
+                # if few_shot:
+                result.append(response)
                 answer_list.append(ans)
                 score.append(correct)
-        correct_ratio = 100*correct_num/len(answers)
+        correct_ratio = 100 * correct_num / len(answers)
         if save_result_dir:
-            if few_shot:
-                test_df['model_output'] = result
+            # if few_shot:
+            test_df['model_output'] = result
             test_df['correctness'] = score
             test_df['model_answer'] = answer_list
             result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
@@ -75,12 +101,33 @@ class ChatGLM_Evaluator(Evaluator):
         return correct_ratio
+    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
+        # history = []
+        history = [('接下来会给你一些一些汽车领域相关问题,请回答。', '好的,我会尽力解答。')]
+        for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
+            question = row['question']
+            response, _ = self.model.chat(self.tokenizer, question, max_length=300, do_sample=False, history=history)
+            # current_length = 0
+            # response = ""
+            # for resp, _ in self.model.stream_chat(self.tokenizer, question, max_length=300,
+            #                                       do_sample=False, history=history):
+            #     print(resp[current_length:], end="", flush=True)
+            #     current_length = len(resp)
+            #     response = resp
+            # print('')
+            response = response.strip()
+            qa_df.loc[row_index, 'model_output'] = response
+        if save_result_dir:
+            result_file_name = f'{subject_name}_qa_test_result.csv'
+            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
     def generate_few_shot_prompt(self, subject, dev_df, cot=False):
         message = []
         k = self.k
         if self.k == -1:
             k = dev_df.shape[0]
-        message.append(self.format_example(dev_df.iloc[0, :], cot=cot, add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
+        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
+                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
         for i in range(1, k):
             message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
         return message
@@ -157,5 +204,6 @@ class ChatGLM_Evaluator(Evaluator):
         score = outputs.scores[0][0].tolist()
         choice_score = [score[167], score[333], score[251], score[416]]
-        ranked_index = [index for index, value in sorted(list(enumerate(choice_score)), key=lambda x:x[1], reverse=True)]
+        ranked_index = [index for index, value in
+                        sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
         return self.choices[ranked_index[0]]
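A minimal driver sketch for the eval_qa method added above, assuming a QA CSV with a 'question' column. The module path, file location and constructor argument values are assumptions for illustration and are not part of the commit.

# Hypothetical usage sketch for ChatGLM_Evaluator.eval_qa (paths and argument values assumed).
import os
import pandas as pd
from evaluators.chatglm2 import ChatGLM_Evaluator   # module path is a guess

evaluator = ChatGLM_Evaluator(choices=["A", "B", "C", "D"], k=5, model_name="chatglm2",
                              device="cuda:0", finetune="pt1", finetune_method="ptuning")
qa_df = pd.read_csv(os.path.join("data/qa", "car_use_qa.csv"))   # needs a 'question' column
evaluator.eval_qa("car_use", qa_df, save_result_dir="logs/chatglm2_run")
# eval_qa fills qa_df['model_output'] and writes car_use_qa_test_result.csv into save_result_dir.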

@@ -141,6 +141,37 @@ class ChatGPT_Evaluator(Evaluator):
             test_df.to_csv(os.path.join(save_result_dir, result_file_name),encoding="utf-8",index=False)
         return correct_ratio
+    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
+        for row_index, row in tqdm(qa_df.iterrows(),total=len(qa_df)):
+            question = [
+                {"role":"user","content":row['question']}
+            ]
+            full_prompt = question
+            response=None
+            timeout_counter=0
+            while response is None and timeout_counter<=30:
+                try:
+                    response = openai.ChatCompletion.create(
+                        model=self.model_name,
+                        messages=full_prompt,
+                        temperature=0.
+                    )
+                except Exception as msg:
+                    if "timeout=600" in str(msg):
+                        timeout_counter+=1
+                    print(msg)
+                    sleep(5)
+                    continue
+            if response==None:
+                response_str=""
+                qa_df.loc[row_index, 'model_output'] = response_str
+            else:
+                response_str = response['choices'][0]['message']['content']
+                qa_df.loc[row_index, 'model_output'] = response_str
+        if save_result_dir:
+            result_file_name = f'{subject_name}_qa_test_result.csv'
+            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
     def extract_ans(self,response_str):
         pattern=[
             r"^选([A-D])",

@@ -1,5 +1,3 @@
-# rogue
-from rogue import get_rouge_score

@@ -0,0 +1,37 @@
+from scoring.gpt_scorer import GPTScorer
+from scoring.rogue_scorer import get_rouge_score
+import pandas as pd
+import time
+from tqdm import tqdm
+
+class AssessmentEngine:
+    def __init__(self, save_result_dir, api_key):
+        self.save_result_dir = save_result_dir
+        self.gpt_scorer = GPTScorer(api_key)
+
+    def eval_subject(self, subject_name, csv_file_name):
+        qa_result_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name)
+        start_time = time.time()
+        row_count = 0
+        rouge_score_sum = 0
+        for row_index, row in tqdm(qa_result_df.iterrows(), total=len(qa_result_df)):
+            row_count += 1
+            test_question = row['question']
+            model_response = row['model_output']
+            reference_answer = row['answer']
+            rouge_score = get_rouge_score(model_response, reference_answer)
+            rouge_1_f_score = rouge_score['rouge-1']['f']
+            rouge_score_sum += rouge_1_f_score
+            qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score
+            self.gpt_scorer.mode("accuracy")
+            gpt_score_acc, gpt_response_acc = self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                 model_response, reference_answer)
+            qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
+            qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")
+        synthesis_score = rouge_score_sum / row_count
+        qa_result_df.to_csv('logs/' + self.save_result_dir + '/' + subject_name + '_qa_test_score_'
+                            + str(synthesis_score) + '.csv', index=False)

@@ -1,25 +1,31 @@
 import openai
 import re
-class ModelScorer:
+class GPTScorer:
     def __init__(self, api_key):
         openai.api_key = api_key
         self.eval_mode = "accuracy"
-    def score_with_chatgpt(self, text):
+    def mode(self, mode):
+        self.eval_mode = mode
+        return self
+
+    def score_with_chatgpt(self, question, model_result, reference):
+        prompt = self.generate_scoring_prompt(question, model_result, reference)
         try:
             # Submit the prompt to ChatGPT to obtain a score
             response = openai.ChatCompletion.create(
                 model="gpt-3.5-turbo",
-                messages=text,
+                messages=prompt,
             )
             # Extract the score
             chatgpt_response = response.choices[0]['message']['content']
             chatgpt_score = self.extract_score(chatgpt_response)
-            return chatgpt_response,chatgpt_score
+            return chatgpt_response, chatgpt_score
         except Exception as e:
             print("An error occurred while scoring with ChatGPT:", e)
-            return None
+            return None, None
     def generate_scoring_prompt(self, question, model_result, reference):
         # Build the scoring prompt
@@ -44,13 +50,13 @@ class ModelScorer:
     def extract_score(self, response_text):
         # Extract the score from the reply
-        pattern=[
+        pattern = [
             r"^评分为([1-5])分",
         ]
-        score_list=[]
+        score_list = []
         for p in pattern:
-            if len(score_list)==0:
-                score_list=re.findall(p,response_text)
+            if len(score_list) == 0:
+                score_list = re.findall(p, response_text)
             else:
                 break
         return score_list[0]
@@ -61,17 +67,19 @@ if __name__ == "__main__":
     my_api_key = "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"
     # Initialise the model scorer
-    scorer = ModelScorer(my_api_key)
+    scorer = GPTScorer(my_api_key)
     # The LLM output to be scored
-    question = "秦Plus-DMi车型的安全气囊有哪些类型"
-    model_result = "截止到我最后更新知识的时候关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。通常来说汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。"
-    reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"
-    prompt = scorer.generate_scoring_prompt(question, model_result, reference)
+    sample_question = "秦Plus-DMi车型的安全气囊有哪些类型"
+    sample_model_result = ("截止到我最后更新知识的时候关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。"
+                           "通常来说,汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。"
+                           "但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。"
+                           "建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。")
+    sample_reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"
     # Get the ChatGPT score
-    response,score = scorer.score_with_chatgpt(prompt)
-    if response is not None:
-        print("ChatGPT评分:", score, "\nChatGPT回复:", response)
+    response_text, score = scorer.mode('accuracy').score_with_chatgpt(sample_question, sample_model_result, sample_reference)
+    if response_text is not None:
+        print("ChatGPT评分:", score, "\nChatGPT回复:", response_text)
     else:
         print("无法获取ChatGPT评分。")

@@ -7,8 +7,8 @@ def get_rouge_score(s1, s2):
     rouge = Rouge()
     s1 = " ".join(jieba.cut(s1))
     s2 = " ".join(jieba.cut(s2))
-    print(s1)
-    print(s2)
+    # print(s1)
+    # print(s2)
     return rouge.get_scores(s1, s2)[0]
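For reference, a small sketch of how the scorer above is consumed: both strings are segmented with jieba before being handed to Rouge, and AssessmentEngine keeps only the ROUGE-1 F1 field. The example strings below are invented for illustration.

# Usage sketch for get_rouge_score (example strings are made up).
from scoring.rogue_scorer import get_rouge_score

model_output = "秦Plus-DMi配备驾驶员安全气囊和侧面安全气囊。"
reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"

scores = get_rouge_score(model_output, reference)
# rouge.get_scores returns a list of dicts like {'rouge-1': {'r', 'p', 'f'}, 'rouge-2': ..., 'rouge-l': ...};
# get_rouge_score unwraps the first element, and AssessmentEngine reads scores['rouge-1']['f'].
print(scores["rouge-1"]["f"])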

@@ -0,0 +1,7 @@
+from scoring.assessment_engine import AssessmentEngine
+
+assessment_engine = AssessmentEngine("chatglm2_glm2_pt1_2024-03-08_11-24-47",
+                                     "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")
+assessment_engine.eval_subject("car_knowledge", "car_knowledge_qa_test_result.csv")
+assessment_engine.eval_subject("car_use", "car_use_qa_test_result.csv")
+assessment_engine.eval_subject("car_market", "car_market_qa_test_result.csv")