Reorganize the scoring module's file structure; add comparative GPT scoring

main
PeterAlbus 8 months ago
parent 148a9e1de0
commit 88a58600f6

.gitignore

@@ -167,3 +167,5 @@ cython_debug/
/ptuning/
/logs/
/data/
/qlora/
/metrics/

@@ -0,0 +1,35 @@
import pandas as pd
# Read the two CSV files (human scores and GPT/machine scores)
human_score_df = pd.read_csv('logs/other/human.csv')
machine_score_df = pd.read_csv('logs/other/result_diff_test_score_53.84256314043283.csv')
result_df = pd.DataFrame(columns=['question', 'answer', 'predict_finetune', 'predict_origin', 'acc_finetune', 'human_acc_finetune', 'acc_origin', 'human_acc_origin', 'fluency_finetune', 'human_fluency_finetune', 'diff_score', 'human_diff_score'])
result_df_row_index = 0
for row_index, row in machine_score_df.iterrows():
acc_finetune_diff = row['acc_finetune'] - human_score_df.loc[row_index, '准确度(微调后']
acc_origin_diff = row['acc_origin'] - human_score_df.loc[row_index, '准确度(微调前']
fluency_finetune_diff = row['fluency_finetune'] - human_score_df.loc[row_index, '流畅度(微调后']
diff_score_diff = row['diff_score'] - human_score_df.loc[row_index, '是否超过原模型']
print("准确度(微调后)差值:", abs(acc_finetune_diff),end=' ')
print("准确度(微调前)差值:", abs(acc_origin_diff),end=' ')
print("流畅度(微调后)差值:", abs(fluency_finetune_diff),end=' ')
print("是否超过原模型差值:", abs(diff_score_diff))
if abs(acc_finetune_diff) >= 2:
result_df.loc[result_df_row_index, 'question'] = machine_score_df.loc[row_index, 'question']
result_df.loc[result_df_row_index, 'answer'] = machine_score_df.loc[row_index, 'answer']
result_df.loc[result_df_row_index, 'predict_finetune'] = machine_score_df.loc[row_index, 'predict_finetune']
result_df.loc[result_df_row_index, 'predict_origin'] = machine_score_df.loc[row_index, 'predict_origin']
result_df.loc[result_df_row_index, 'acc_finetune'] = machine_score_df.loc[row_index, 'acc_finetune']
result_df.loc[result_df_row_index, 'human_acc_finetune'] = human_score_df.loc[row_index, '准确度(微调后']
result_df.loc[result_df_row_index, 'acc_origin'] = machine_score_df.loc[row_index, 'acc_origin']
result_df.loc[result_df_row_index, 'human_acc_origin'] = human_score_df.loc[row_index, '准确度(微调前']
result_df.loc[result_df_row_index, 'fluency_finetune'] = machine_score_df.loc[row_index, 'fluency_finetune']
result_df.loc[result_df_row_index, 'human_fluency_finetune'] = human_score_df.loc[row_index, '流畅度(微调后']
result_df.loc[result_df_row_index, 'diff_score'] = machine_score_df.loc[row_index, 'diff_score']
result_df.loc[result_df_row_index, 'human_diff_score'] = human_score_df.loc[row_index, '是否超过原模型']
result_df_row_index += 1
result_df.to_csv('logs/other/diff.csv', index=False)
# Accuracy of the information should be the primary consideration; extra information of unknown truthfulness should not earn extra points.
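A useful aggregate on top of the per-row printouts would be an overall agreement figure between the GPT scores and the human scores. A minimal sketch, reusing the file paths and column names above and assuming both CSVs share the same row order (scipy is an extra dependency not used elsewhere in this commit):

import pandas as pd
from scipy.stats import pearsonr

human_df = pd.read_csv('logs/other/human.csv')
machine_df = pd.read_csv('logs/other/result_diff_test_score_53.84256314043283.csv')

# Mean absolute difference and linear correlation for the post-finetune accuracy scores
mae = (machine_df['acc_finetune'] - human_df['准确度(微调后']).abs().mean()
r, p_value = pearsonr(machine_df['acc_finetune'], human_df['准确度(微调后'])
print(f"MAE={mae:.3f}, Pearson r={r:.3f} (p={p_value:.3g})")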

@@ -5,26 +5,40 @@ import torch
from evaluators.chatgpt import ChatGPT_Evaluator
from evaluators.chatglm import ChatGLM_Evaluator
from evaluators.chatglm2 import ChatGLM_Evaluator as ChatGLM2_Evaluator
from evaluators.chatglm3 import ChatGLM_Evaluator as ChatGLM3_Evaluator
import time
choices = ["A", "B", "C", "D"]
device = torch.device("cpu")
def main(args):
global device
if args.cuda_device:
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
device = torch.device("cuda")
if "turbo" in args.model_name or "gpt-4" in args.model_name:
# print("Not supported yet")
# return -1
evaluator = ChatGPT_Evaluator(
choices=choices,
k=args.ntrain,
api_key=args.openai_key,
model_name=args.model_name
)
elif "chatglm3" in args.model_name:
if args.finetune:
fine_tune_model = args.finetune
else:
fine_tune_model = None
evaluator = ChatGLM3_Evaluator(
choices=choices,
k=args.ntrain,
model_name=args.model_name,
device=device,
finetune=fine_tune_model,
finetune_method=args.finetune_method
)
elif "chatglm2" in args.model_name:
if args.cuda_device:
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
device = torch.device("cuda")
if args.finetune:
fine_tune_model = args.finetune
else:
@@ -38,9 +52,6 @@ def main(args):
finetune_method=args.finetune_method
)
elif "chatglm" in args.model_name:
if args.cuda_device:
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
device = torch.device("cuda")
if args.finetune:
fine_tune_model = args.finetune
else:
@@ -50,7 +61,8 @@ def main(args):
k=args.ntrain,
model_name=args.model_name,
device=device,
finetune=fine_tune_model
finetune=fine_tune_model,
finetune_method=args.finetune_method
)
else:
print("Unknown model name")
@@ -63,33 +75,34 @@ def main(args):
fine_tune_model_name = args.finetune
else:
fine_tune_model_name = 'original'
save_result_dir = os.path.join(r"logs", f"{args.model_name}_{fine_tune_model_name}_{run_date}")
save_result_dir = os.path.join(r"logs", f"{args.model_name}_{fine_tune_model_name}/{run_date}")
    os.makedirs(save_result_dir)  # the new save path contains a subdirectory, so plain mkdir would fail when the parent is missing
subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
# subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train']
qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
# qa_subject_list = ['car_use', 'car_market'==
# qa_subject_list = ['car_use', 'car_market']
# for subject_name in subject_list:
# print("Now testing: " + subject_name)
# # subject_name=args.subject
# val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
# val_df = pd.read_csv(val_file_path)
# if args.few_shot:
# dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
# dev_df = pd.read_csv(dev_file_path)
# correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
# save_result_dir=save_result_dir, cot=args.cot)
# else:
# correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
# save_result_dir=save_result_dir)
# print("Acc:", correct_ratio)
for subject_name in qa_subject_list:
for subject_name in subject_list:
print("Now testing: " + subject_name)
qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
qa_df = pd.read_csv(qa_file_path)
evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
# subject_name=args.subject
val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
val_df = pd.read_csv(val_file_path)
if args.few_shot:
dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
dev_df = pd.read_csv(dev_file_path)
correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
save_result_dir=save_result_dir, cot=args.cot)
else:
correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
save_result_dir=save_result_dir)
print("Acc:", correct_ratio)
# for subject_name in qa_subject_list:
# print("Now testing: " + subject_name)
# qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
# qa_df = pd.read_csv(qa_file_path)
# evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
if __name__ == "__main__":
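The argparse setup falls outside this diff, so for reference only: a typical invocation would look roughly like the following, with the flag names being assumptions inferred from the args attributes used above.

# Hypothetical usage (flag names assumed from args.model_name, args.finetune, etc.):
#   python eval.py --model_name chatglm2-6b --finetune pt1 --finetune_method ptuning \
#                  --cuda_device 0 --ntrain 5 --few_shot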

@@ -17,13 +17,13 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
class ChatGLM_Evaluator(Evaluator):
def __init__(self, choices, k, model_name, device, finetune=None):
def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
# try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
# or directly clone the model
self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
if finetune:
CHECKPOINT_PATH = "ptuning/" + finetune
if finetune_method == "ptuning":
CHECKPOINT_PATH = "ptuning/glm1/" + finetune
config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, pre_seq_len=128)
self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config, trust_remote_code=True)
prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
@@ -38,17 +38,15 @@ class ChatGLM_Evaluator(Evaluator):
else:
self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
resume_download=True).half().to(device)
print("Model loaded!(GLM)")
print("Model loaded! (GLM original)")
# self.model = self.model.eval()
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
correct_num = 0
if save_result_dir:
if few_shot:
result = []
score = []
answer_list = []
result = []
score = []
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
else:
@@ -62,8 +60,13 @@ class ChatGLM_Evaluator(Evaluator):
# For ChatGLM, we use answer extraction in answer-only mode too.
ans, direct_extract = self.extract_cot_answer(row, response)
else: # zero-shot by extracting answer from distribution
ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
history=history)
response, _ = self.model.chat(self.tokenizer, question, max_length=300,
do_sample=False, history=history)
response = response.strip()
ans, direct_extract = self.extract_cot_answer(row, response)
# print(response, ans)
# ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
# history=history)
if ans == answers[row_index]:
correct_num += 1
correct = 1

@@ -26,11 +26,11 @@ class ChatGLM_Evaluator(Evaluator):
if finetune_method == "lora":
self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
resume_download=True).half().to(device)
peft_model_id = "lora/" + finetune
peft_model_id = "lora/glm2/" + finetune
self.model = PeftModel.from_pretrained(self.model, peft_model_id)
print("Model loaded! use GLM2" + finetune)
elif finetune_method == "ptuning":
CHECKPOINT_PATH = "ptuning/" + finetune
CHECKPOINT_PATH = "ptuning/glm2/" + finetune
config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, pre_seq_len=128)
self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", config=config, trust_remote_code=True)
prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
@@ -55,7 +55,7 @@ class ChatGLM_Evaluator(Evaluator):
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
print(history)
# print(history)
else:
# _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False)
history = [('接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。', '好的,我会尽力解答。')]

@@ -25,7 +25,7 @@ class AssessmentEngine:
rouge_score_sum += rouge_1_f_score
qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score
self.gpt_scorer.mode("accuracy")
gpt_score_acc, gpt_response_acc = self.gpt_scorer.score_with_chatgpt(test_question,
gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(test_question,
model_response, reference_answer)
qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc
@@ -35,3 +35,115 @@ class AssessmentEngine:
synthesis_score = rouge_score_sum / row_count
qa_result_df.to_csv('logs/' + self.save_result_dir + '/' + subject_name + '_qa_test_score_'
+ str(synthesis_score) + '.csv', index=False)
def eval_result_diff(self, csv_file_name):
result_diff_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name)
result_diff_df['rouge_score_finetune'] = 0
result_diff_df['rouge_score_origin'] = 0
result_diff_df['acc_finetune'] = 0
result_diff_df['acc_origin'] = 0
result_diff_df['fluency_finetune'] = 0
result_diff_df['fluency_origin'] = 0
result_diff_df['diff_score'] = 0
result_diff_df['acc_response_finetune'] = 0
result_diff_df['acc_response_origin'] = 0
result_diff_df['fluency_response_finetune'] = 0
result_diff_df['fluency_response_origin'] = 0
result_diff_df['diff_score_response'] = 0
start_time = time.time()
finetune_rouge_score_sum = 0
origin_rouge_score_sum = 0
finetune_acc_score_sum = 0
origin_acc_score_sum = 0
finetune_fluency_score_sum = 0
origin_fluency_score_sum = 0
model_better_score_sum = 0
row_count = 0
for row_index, row in tqdm(result_diff_df.iterrows(), total=len(result_diff_df)):
            if pd.isna(row['question']) or row['question'] == '':  # blank rows load as NaN in pandas
                continue
row_count += 1
test_question = row['question']
finetune_model_response = row['predict_finetune']
original_model_response = row['predict_origin']
reference_answer = row['answer']
            # Compute ROUGE-1 F scores against the reference answer
finetune_rouge_score = get_rouge_score(finetune_model_response, reference_answer)
finetune_rouge_1_f_score = finetune_rouge_score['rouge-1']['f']
finetune_rouge_score_sum += finetune_rouge_1_f_score
result_diff_df.loc[row_index, 'rouge_score_finetune'] = finetune_rouge_1_f_score
origin_rouge_score = get_rouge_score(original_model_response, reference_answer)
origin_rouge_1_f_score = origin_rouge_score['rouge-1']['f']
origin_rouge_score_sum += origin_rouge_1_f_score
result_diff_df.loc[row_index, 'rouge_score_origin'] = origin_rouge_1_f_score
self.gpt_scorer.mode("accuracy")
gpt_response_acc, gpt_score_acc = (self.gpt_scorer.score_with_chatgpt(test_question,
finetune_model_response,
reference_answer))
result_diff_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
result_diff_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
finetune_acc_score_sum += float(gpt_score_acc)
gpt_response_acc, gpt_score_acc = (self.gpt_scorer.score_with_chatgpt(test_question,
original_model_response,
reference_answer))
result_diff_df.loc[row_index, 'acc_origin'] = gpt_score_acc
result_diff_df.loc[row_index, 'acc_response_origin'] = gpt_response_acc
if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
origin_acc_score_sum += float(gpt_score_acc)
self.gpt_scorer.mode("fluency")
gpt_response_fluency, gpt_score_fluency = (self.gpt_scorer.score_with_chatgpt(test_question,
finetune_model_response,
reference_answer))
result_diff_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
result_diff_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
finetune_fluency_score_sum += float(gpt_score_fluency)
gpt_response_fluency, gpt_score_fluency = (self.gpt_scorer.score_with_chatgpt(test_question,
original_model_response,
reference_answer))
result_diff_df.loc[row_index, 'fluency_origin'] = gpt_score_fluency
result_diff_df.loc[row_index, 'fluency_response_origin'] = gpt_response_fluency
if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
origin_fluency_score_sum += float(gpt_score_fluency)
self.gpt_scorer.mode("diff")
gpt_response_diff, gpt_score_diff = (self.gpt_scorer.score_with_chatgpt(test_question,
finetune_model_response,
reference_answer,
original_model_response))
result_diff_df.loc[row_index, 'diff_score'] = gpt_score_diff
result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff
if (gpt_score_diff is not None) and gpt_score_diff.isdigit():
model_better_score_sum += float(gpt_score_diff)
result_diff_df.to_csv('logs/' + self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
end_time = time.time()
elapsed_time = end_time - start_time
print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")
synthesis_rouge_score = finetune_rouge_score_sum / row_count
original_rouge_score = origin_rouge_score_sum / row_count
synthesis_acc_score = finetune_acc_score_sum / row_count
original_acc_score = origin_acc_score_sum / row_count
synthesis_fluency_score = finetune_fluency_score_sum / row_count
original_fluency_score = origin_fluency_score_sum / row_count
synthesis_diff_score = model_better_score_sum / row_count
print("微调模型ROUGE分数", synthesis_rouge_score)
print("原模型ROUGE分数", original_rouge_score)
print("微调模型准确性分数:", synthesis_acc_score)
print("原模型准确性分数:", original_acc_score)
print("微调模型流畅度分数:", synthesis_fluency_score)
print("原模型流畅度分数:", original_fluency_score)
print("微调模型优于原模型分数:", synthesis_diff_score)
synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
print("综合评分:", synthesis_score)
        # The origin model has no "diff vs. itself" term, so a constant 66 stands in
        # (presumably the "on par" diff score of 2 out of 3, rescaled: 2/3 * 100 ≈ 66.7).
        original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
                                    original_fluency_score * 100 / 3 + 66) / 4
        print("原模型综合评分:", original_synthesis_score)
        # Timestamp string for the result filename
current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
result_diff_df.to_csv('logs/' + self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
+ str(synthesis_score) + '.csv', index=False)
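As a sanity check on the weighting, the cached numbers quoted in the commented-out block at the end of this commit (ROUGE 0.3036, accuracy 2.768 of 4, fluency 2.098 of 3, diff 2.278 of 3) reproduce the composite score:

# Each sub-score is rescaled to a 0-100 range before the four terms are averaged.
rouge, acc, fluency, diff = 0.30358589506467687, 2.768, 2.098, 2.278
score = (rouge * 100 + acc * 100 / 4 + fluency * 100 / 3 + diff * 100 / 3) / 4
print(round(score, 2))  # 61.36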

@@ -8,11 +8,14 @@ class GPTScorer:
self.eval_mode = "accuracy"
def mode(self, mode):
        # Validate that the requested mode is one of the supported ones
if mode not in ["accuracy", "fluency", "diff"]:
raise ValueError("Invalid mode. Must be one of 'accuracy', 'fluency' or 'diff'.")
self.eval_mode = mode
return self
def score_with_chatgpt(self, question, model_result, reference):
prompt = self.generate_scoring_prompt(question, model_result, reference)
def score_with_chatgpt(self, question, model_result, reference, origin_model_result=None):
prompt = self.generate_scoring_prompt(question, model_result, reference, origin_model_result)
try:
            # Send the prompt to ChatGPT and get back the scored response
response = openai.ChatCompletion.create(
@@ -25,33 +28,75 @@ class GPTScorer:
return chatgpt_response, chatgpt_score
except Exception as e:
print("An error occurred while scoring with ChatGPT:", e)
return None, None
return None, '2'
def generate_scoring_prompt(self, question, model_result, reference):
def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None):
        # Build the scoring prompt for the current eval mode
base_prompt = []
if self.eval_mode == "accuracy":
# base_prompt = [{
# "role": "system",
# "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
# "请对比参考答案和大模型生成结果从信息准确性的角度评分以下生成的结果以评估其质量。满分为5分。"
# "评分标准为信息准确无误——5分。信息大致符合实际信息——4分。"
# "信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关或回答语句不完整——1分。"
# "可以根据实际情况稍作调整。"
# "回复格式为评分为x分。理由xxx。"
# }]
base_prompt = [{
"role": "system",
"content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
"请对比参考答案和大模型生成结果从信息准确性的角度评分以下生成的结果以评估其质量。满分为5分。"
"评分标准为信息准确无误——5分。信息大致符合实际信息——4分。"
"信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关——1分。"
"可以根据实际情况稍作调整。"
"回复格式为评分为x分。理由xxx。"
"请对比参考答案和大模型生成结果,从信息准确性的角度评分以下生成的结果,以评估其质量。满分为4分。"
"信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
"评分标准为模型回答正确——4分。模型回答模糊但部分准确——3分。"
"模型无法给出解答但明确表示无法解答——2分。模型给出错误或无法理解的回答/模型回答语句不完整——1分"
"回复格式为:理由xxx。因此评分为x分"
}]
prompt = base_prompt + [
{
"role": "user",
"content": f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
}
]
elif self.eval_mode == "fluency":
base_prompt = [{
"role": "system",
"content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
"请从语言流畅度的角度评分大模型生成的结果以评估其质量。满分为3分。"
"评分标准为模型回答流畅符合日常语言习惯——3分。模型回答流畅但存在突然中断等情况——2分。"
"模型回答无条理可能重复输出某些单词——1分。"
"回复格式为理由xxx。因此评分为x分。"
}]
elif self.eval_mode == "diff":
base_prompt = [{
"role": "system",
"content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案、一个大模型生成的结果和一个微调后大模型生成结果。"
"请对比这些结果判断微调后大模型的结果是否优于原模型。满分为3分。"
"信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
"对比时请关注结果和参考答案的契合度。"
"评分标准为认为回答优于原模型——3分。认为回答与原模型持平——2分。"
"认为回答不如原模型——1分。"
"回复格式为理由xxx。因此评分为x分。"
}]
if self.eval_mode == "diff":
if origin_model_result is None:
raise ValueError("The original model result is required in 'diff' mode.")
prompt = base_prompt + [
{
"role": "user",
"content": f"问题:{question}\n\n原模型生成的结果:{origin_model_result}\n\n"
f"微调后模型生成的结果:{model_result}\n\n参考答案:{reference}"
}
]
else:
prompt = base_prompt + [
{
"role": "user",
"content": f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
}
]
return prompt
def extract_score(self, response_text):
        # Extract the numeric score from the response text
        pattern = [
            r"^评分为([1-5])分",
            r"评分:([1-5])分",
            # the prompts now ask for "理由xxx。因此评分为x分", so match that phrasing as well
            r"因此评分为([1-5])分",
        ]
score_list = []
for p in pattern:
@@ -59,6 +104,8 @@ class GPTScorer:
score_list = re.findall(p, response_text)
else:
break
        if len(score_list) == 0:
            return '3'  # no pattern matched: fall back to a neutral score
return score_list[0]
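A quick standalone check of these extraction patterns against a reply in the requested format (the response string here is a made-up example):

import re

patterns = [r"^评分为([1-5])分", r"评分:([1-5])分", r"因此评分为([1-5])分"]
response = "理由:回答与参考答案一致,信息准确。因此评分为4分。"
for p in patterns:
    found = re.findall(p, response)
    if found:
        print(found[0])  # -> 4
        break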

@@ -0,0 +1,10 @@
import evaluate
perplexity = evaluate.load("../metrics/perplexity")
# The middle string is (presumably intentional) gibberish and should score a much higher perplexity.
input_texts = ["你好!", "打死哦对吉萨大你去我家而且我就", "这辆车非常优秀"]
results = perplexity.compute(model_id='gpt2',
add_start_token=False,
predictions=input_texts)
print(list(results.keys()))
print(results["perplexities"])

@@ -1,7 +1,28 @@
from scoring.assessment_engine import AssessmentEngine
assessment_engine = AssessmentEngine("chatglm2_glm2_pt1_2024-03-08_11-24-47",
assessment_engine = AssessmentEngine("other",
"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")
assessment_engine.eval_subject("car_knowledge", "car_knowledge_qa_test_result.csv")
assessment_engine.eval_subject("car_use", "car_use_qa_test_result.csv")
assessment_engine.eval_subject("car_market", "car_market_qa_test_result.csv")
assessment_engine.eval_result_diff("0408output-dora.csv")
# synthesis_rouge_score = 0.30358589506467687
# print("微调模型ROUGE分数", synthesis_rouge_score)
# original_rouge_score = 0.26004000118452175
# print("原模型ROUGE分数", original_rouge_score)
# synthesis_acc_score = 2.768
# print("微调模型准确性分数:", synthesis_acc_score)
# original_acc_score = 2.724
# print("原模型准确性分数:", original_acc_score)
# synthesis_fluency_score = 2.098
# print("微调模型流畅度分数:", synthesis_fluency_score)
# original_fluency_score = 2.236
# print("原模型流畅度分数:", original_fluency_score)
# synthesis_diff_score = 2.278
# print("微调模型优于原模型分数:", synthesis_diff_score)
#
# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100/4 + synthesis_fluency_score * 100/3
# + synthesis_diff_score * 100/3 ) / 4
# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100/4 +
# original_fluency_score * 100/3 + 66 ) / 4
#
# print("综合评分:", synthesis_score)
# print("原模型综合评分:", original_synthesis_score)
