import os
import argparse
import time

import pandas as pd
import torch

from evaluators.chatgpt import ChatGPT_Evaluator
from evaluators.chatglm import ChatGLM_Evaluator
from evaluators.chatglm2 import ChatGLM2_Evaluator
from evaluators.chatglm3 import ChatGLM3_Evaluator
from scoring.assessment_engine import AssessmentEngine

choices = ["A", "B", "C", "D"]
device = torch.device("cpu")


def main(args):
    global device
    evaluator_class = None

    if args.cuda_device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
        device = torch.device("cuda")

    # Pick the evaluator that matches the requested model name.
    if "turbo" in args.model_name or "gpt-4" in args.model_name:
        evaluator = ChatGPT_Evaluator(
            choices=choices,
            k=args.ntrain,
            api_key=args.openai_key,
            model_name=args.model_name
        )
    elif "chatglm3" in args.model_name:
        if args.finetune:
            fine_tune_model = args.finetune
            evaluator_class = ChatGLM3_Evaluator
        else:
            fine_tune_model = None
        evaluator = ChatGLM3_Evaluator(
            choices=choices,
            k=args.ntrain,
            model_name=args.model_name,
            device=device,
            finetune=fine_tune_model,
            finetune_method=args.finetune_method
        )
    elif "chatglm2" in args.model_name:
        if args.finetune:
            fine_tune_model = args.finetune
            evaluator_class = ChatGLM2_Evaluator
        else:
            fine_tune_model = None
        evaluator = ChatGLM2_Evaluator(
            choices=choices,
            k=args.ntrain,
            model_name=args.model_name,
            device=device,
            finetune=fine_tune_model,
            finetune_method=args.finetune_method
        )
    elif "chatglm" in args.model_name:
        if args.finetune:
            fine_tune_model = args.finetune
            evaluator_class = ChatGLM_Evaluator
        else:
            fine_tune_model = None
        evaluator = ChatGLM_Evaluator(
            choices=choices,
            k=args.ntrain,
            model_name=args.model_name,
            device=device,
            finetune=fine_tune_model,
            finetune_method=args.finetune_method
        )
    else:
        print("Unknown model name")
        return -1

    # Results are written to logs/<model>_<finetune>/<timestamp>/.
    if not os.path.exists(r"logs"):
        os.mkdir(r"logs")
    run_date = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
    if args.finetune:
        fine_tune_model_name = args.finetune
    else:
        fine_tune_model_name = 'original'
    save_result_dir = os.path.join(r"logs", f"{args.model_name}_{fine_tune_model_name}/{run_date}")
    os.makedirs(save_result_dir)

    subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
    subject_list.extend(['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train'])
    # qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
    qa_subject_list = ['car_market']

    # Multiple-choice evaluation on the validation split of each subject.
    for subject_name in subject_list:
        print("Now testing: " + subject_name)
        # subject_name = args.subject
        val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
        val_df = pd.read_csv(val_file_path)
        if args.few_shot:
            dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
            dev_df = pd.read_csv(dev_file_path)
            correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df,
                                                   few_shot=args.few_shot,
                                                   save_result_dir=save_result_dir,
                                                   cot=args.cot)
        else:
            correct_ratio = evaluator.eval_subject(subject_name, val_df,
                                                   few_shot=args.few_shot,
                                                   save_result_dir=save_result_dir)
        print("Acc:", correct_ratio)

    result_list = []
    # Open-ended QA comparison between the fine-tuned and the original model (currently disabled).
    # for subject_name in qa_subject_list:
    #     print("Now testing: " + subject_name)
    #     qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
    #     qa_df = pd.read_csv(qa_file_path)
    #     result_list.append(evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir))
    # if evaluator_class is not None:
    #     del evaluator
    #     evaluator = evaluator_class(
    #         choices=choices,
    #         k=args.ntrain,
    #         model_name=args.model_name,
    #         device=device
    #     )
    # for index, subject_name in enumerate(qa_subject_list):
    #     print("Now testing (origin): " + subject_name)
    #     qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
    #     qa_df = pd.read_csv(qa_file_path)
    #     origin_result = evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
    #     origin_result = origin_result.rename(columns={"model_output": "predict_origin"})
    #     result_df = result_list[index].rename(columns={"model_output": "predict_finetune"}).join(origin_result["predict_origin"])
    #     result_file_name = f'{subject_name}_qa_compare_result.csv'
    #     result_df.to_csv(os.path.join(save_result_dir, result_file_name))
    # assessment_engine = AssessmentEngine(save_result_dir, args.openai_key)
    # for subject_name in qa_subject_list:
    #     assessment_engine.eval_result_diff(f'{subject_name}_qa_compare_result.csv')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ntrain", "-k", type=int, default=5)
    parser.add_argument("--openai_key", type=str, default="xxx")
    parser.add_argument("--minimax_group_id", type=str, default="xxx")
    parser.add_argument("--minimax_key", type=str, default="xxx")
    parser.add_argument("--few_shot", action="store_true")
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--cot", action="store_true")
    # parser.add_argument("--subject", "-s", type=str, default="operating_system")
    parser.add_argument("--cuda_device", type=str)
    parser.add_argument("--finetune", type=str)
    parser.add_argument("--finetune_method", type=str)
    user_args = parser.parse_args()
    main(user_args)
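
# Example invocations (illustrative sketch only: the script filename "eval.py" and the
# concrete model identifiers below are assumptions, not taken from this repository).
# A name containing "chatglm2" routes to ChatGLM2_Evaluator; one containing "turbo"
# or "gpt-4" routes to ChatGPT_Evaluator and requires --openai_key.
#
#   python eval.py --model_name chatglm2-6b --cuda_device 0 --few_shot --ntrain 5
#   python eval.py --model_name gpt-3.5-turbo --openai_key YOUR_KEY --few_shot --cot
#
# --finetune points the chosen ChatGLM evaluator at fine-tuned weights and
# --finetune_method tells it how to load them; the accepted values depend on the
# evaluator implementations in evaluators/.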