@@ -8,6 +8,7 @@ from transformers.generation.utils import LogitsProcessorList
 from evaluators.evaluator import Evaluator
 from peft import PeftModel
+from transformers import AutoConfig  # used by the new p-tuning branch below
 
 class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
@@ -15,57 +16,82 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
             scores[..., 5] = 5e4
         return scores
 
 
 class ChatGLM_Evaluator(Evaluator):
-    def __init__(self, choices, k, model_name, device, finetune=None):
+    def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
         super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
         # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
         # or directly clone the model
         self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna")
-        self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna", resume_download=True).half().to(device)
-        if finetune:
+        if finetune_method == "lora":
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
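+            # load the fp16 base model first, then wrap it with the LoRA adapter stored under lora/<finetune>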
             peft_model_id = "lora/" + finetune
             self.model = PeftModel.from_pretrained(self.model, peft_model_id)
             print("Model loaded! use GLM2" + finetune)
+        elif finetune_method == "ptuning":
+            CHECKPOINT_PATH = "ptuning/" + finetune
+            config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, pre_seq_len=128)
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", config=config, trust_remote_code=True)
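+            # keep only the prefix-encoder weights from the checkpoint; their keys
+            # carry a "transformer.prefix_encoder." prefix that must be stripped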
+            prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
+            new_prefix_state_dict = {}
+            for k, v in prefix_state_dict.items():
+                if k.startswith("transformer.prefix_encoder."):
+                    new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
+            self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
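+            # base model runs in fp16, but the trained prefix encoder stays in fp32 (as in the ChatGLM p-tuning example)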
+            self.model = self.model.half().to(device)
+            self.model.transformer.prefix_encoder.float()
+            print("Model loaded! use GLM2 + " + finetune)
         else:
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
             print("Model loaded!(GLM2)")
         # self.model = self.model.eval()
 
     def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
         correct_num = 0
-        if save_result_dir:
-            if few_shot:
-                result = []
-            score = []
-            answer_list = []
+        result = []
+        score = []
+        answer_list = []
         if few_shot:
             history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
+            print(history)
         else:
-            history = []
+            # _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False)
+            history = [('接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。', '好的,我会尽力解答。')]
+            # print(history)
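+            # seed history: "Next you will be given some multiple-choice questions; please
+            # give the correct option." / "OK, I will do my best."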
         answers = list(test_df['answer'])
         for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
             question = self.format_example(row, include_answer=False, cot=cot)
             if few_shot:
-                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
+                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
+                                              do_sample=False, history=history)
                 response = response.strip()
                 # For ChatGLM, we use answer extraction in answer-only mode too.
                 ans, direct_extract = self.extract_cot_answer(row, response)
             else:   # zero-shot by extracting answer from distribution
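+                # zero-shot now also generates a reply and extracts the chosen option from it;
+                # the previous distribution-based generate_dist path is kept below for reference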
-                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048, history=history)
+                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
+                                              do_sample=False, history=history)
+                response = response.strip()
+                ans, direct_extract = self.extract_cot_answer(row, response)
+                print(response, ans)
+                # ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
+                #                          history=history)
             if ans == answers[row_index]:
                 correct_num += 1
                 correct = 1
             else:
                 correct = 0
             if save_result_dir:
-                if few_shot:
-                    result.append(response)
+                # if few_shot:
+                result.append(response)
                 answer_list.append(ans)
                 score.append(correct)
-        correct_ratio = 100*correct_num/len(answers)
+        correct_ratio = 100 * correct_num / len(answers)
 
         if save_result_dir:
-            if few_shot:
-                test_df['model_output'] = result
+            # if few_shot:
+            test_df['model_output'] = result
             test_df['correctness'] = score
             test_df['model_answer'] = answer_list
             result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
@@ -75,12 +101,33 @@ class ChatGLM_Evaluator(Evaluator):
 
         return correct_ratio
 
+    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
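+        # free-form QA: ask each row's 'question' and write the reply back to qa_df['model_output']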
+        # history = []
+        history = [('接下来会给你一些一些汽车领域相关问题,请回答。', '好的,我会尽力解答。')]
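+        # seed history: "Next you will be given some questions about the automotive
+        # domain; please answer them." / "OK, I will do my best."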
+        for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
+            question = row['question']
+            response, _ = self.model.chat(self.tokenizer, question, max_length=300, do_sample=False, history=history)
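+            # streaming variant, kept for reference: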
+            # current_length = 0
+            # response = ""
+            # for resp, _ in self.model.stream_chat(self.tokenizer, question, max_length=300,
+            #                                       do_sample=False, history=history):
+            #     print(resp[current_length:], end="", flush=True)
+            #     current_length = len(resp)
+            #     response = resp
+            # print('')
+            response = response.strip()
+            qa_df.loc[row_index, 'model_output'] = response
+        if save_result_dir:
+            result_file_name = f'{subject_name}_qa_test_result.csv'
+            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
 
     def generate_few_shot_prompt(self, subject, dev_df, cot=False):
         message = []
         k = self.k
         if self.k == -1:
             k = dev_df.shape[0]
-        message.append(self.format_example(dev_df.iloc[0, :], cot=cot, add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
+        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
+                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
         for i in range(1, k):
             message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
         return message
@@ -157,5 +204,6 @@ class ChatGLM_Evaluator(Evaluator):
         score = outputs.scores[0][0].tolist()
         choice_score = [score[167], score[333], score[251], score[416]]
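+        # positions 167/333/251/416 map positionally onto self.choices ("A"-"D"),
+        # presumably the chatglm2-6b tokenizer ids of the four option letters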
-        ranked_index = [index for index, value in sorted(list(enumerate(choice_score)), key=lambda x:x[1], reverse=True)]
+        ranked_index = [index for index, value in
+                        sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
         return self.choices[ranked_index[0]]