|
|
|
|
import google.generativeai as genai
|
|
|
|
|
from scoring.llm_scorer import extract_score, LanguageModelScorer
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _ordinal(n):
    """Return *n* followed by its English ordinal suffix (1 -> "1st", 12 -> "12th")."""
    # Numbers whose last two digits are 10..20 always take "th",
    # whatever their final digit is (11th, 12th, 13th, 112th, ...).
    if 10 <= n % 100 <= 20:
        return f"{n}th"
    return str(n) + {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")


def request_gemini(gemini_model, prompt, retries=3, delay=1):
    """Ask a Gemini model for a completion, retrying when the call fails.

    Args:
        gemini_model: Object exposing ``generate_content(prompt)``; the value
            it returns must have a ``text`` attribute.
        prompt: Prompt forwarded to ``generate_content``.
        retries: Maximum number of attempts. With ``0`` no request is made.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The ``text`` of the first successful response, or ``None`` when every
        attempt raised.
    """
    for attempt in range(1, retries + 1):
        try:
            # Reading ``.text`` stays inside the try so that a failure while
            # accessing the response body is retried like a failed request.
            return gemini_model.generate_content(prompt).text
        except Exception as e:
            # Deliberately broad: any failure of the remote call is reported
            # and retried rather than propagated to the caller.
            print(f"\nAn error occurred while scoring with Gemini: {e}, it's the {_ordinal(attempt)} time.")
            # Only wait when another attempt follows; sleeping after the last
            # failure would just postpone the ``None`` result.
            if attempt < retries:
                time.sleep(delay)
    print("Failed to get response from Gemini. Use default score.")
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GeminiScorer(LanguageModelScorer):
    """Scorer that asks a Gemini model to grade a generated answer.

    The grading prompt depends on ``self.eval_mode``, which is read but never
    assigned in this class (presumably provided by ``LanguageModelScorer`` --
    confirm against the base class).
    """

    # Score returned when Gemini gave no response or no score could be extracted.
    DEFAULT_SCORE = '2'

    def __init__(self, api_key, model_name='gemini-pro'):
        """Configure the Gemini client and create the model handle.

        Args:
            api_key: API key passed to ``genai.configure``.
            model_name: Name of the Gemini model to instantiate.
        """
        super().__init__()
        genai.configure(api_key=api_key)
        self.api_key = api_key
        self.model = genai.GenerativeModel(model_name)

    def score_with_llm(self, question, model_result, reference, origin_model_result=None):
        """Grade ``model_result`` with Gemini.

        Args:
            question: The question that was asked.
            model_result: Answer produced by the model under evaluation.
            reference: Reference answer.
            origin_model_result: Answer of the baseline model; required when
                ``self.eval_mode`` is ``"diff"``.

        Returns:
            A ``(response, score)`` tuple: Gemini's raw reply and the value
            returned by ``extract_score`` for it. When Gemini gave no reply or
            score extraction raised, ``(None, DEFAULT_SCORE)``.

        Raises:
            ValueError: In ``"diff"`` mode when ``origin_model_result`` is
                ``None`` (raised while building the prompt).
        """
        prompt = self.generate_scoring_prompt(question, model_result, reference, origin_model_result)
        try:
            gemini_response = request_gemini(self.model, prompt, retries=5)
            if gemini_response is None:
                # request_gemini has already reported the failure; there is no
                # text to extract a score from, so use the default directly.
                return None, self.DEFAULT_SCORE
            gemini_score = extract_score(gemini_response)
            return gemini_response, gemini_score
        except Exception as e:
            # Best effort: any failure here degrades to the default score.
            print("\nAn error occurred while extract score:", e)
            return None, self.DEFAULT_SCORE

    def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None):
        """Build the grading prompt for the current ``self.eval_mode``.

        Recognised modes are ``"accuracy"``, ``"fluency"`` and ``"diff"``; for
        any other value the instruction part of the prompt is empty.

        Args:
            question: The question that was asked.
            model_result: Answer produced by the model under evaluation.
            reference: Reference answer.
            origin_model_result: Answer of the baseline model ("model 1" in
                the prompt); only used in ``"diff"`` mode.

        Returns:
            The instruction text, a newline, then the question, answer(s) and
            reference.

        Raises:
            ValueError: In ``"diff"`` mode when ``origin_model_result`` is
                ``None``.
        """
        # Generate the scoring prompt
        base_prompt = ''
        if self.eval_mode == "accuracy":
            base_prompt = ("你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
                           "请对比参考答案和大模型生成结果,从信息准确性的角度评分以下生成的结果,以评估其质量。满分为4分。"
                           "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
                           "评分标准为:模型回答正确——4分。模型回答模糊,但部分准确——3分。"
                           "模型无法给出解答,但明确表示无法解答——2分。模型给出错误或无法理解的回答/模型回答语句不完整——1分。"
                           "回复格式为:理由:xxx。因此,评分为x分。")
        elif self.eval_mode == "fluency":
            base_prompt = ("你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
                           "请从语言流畅度的角度评分大模型生成的结果,以评估其质量。满分为3分。"
                           "评分标准为:模型回答流畅,符合日常语言习惯——3分。模型回答流畅,但存在突然中断等情况——2分。"
                           "模型回答无条理,可能重复输出某些单词——1分。"
                           "回复格式为:理由:xxx。因此,评分为x分。")
        elif self.eval_mode == "diff":
            base_prompt = ("你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案、一个大模型1生成的结果和一个大模型2生成的结果。"
                           "请对比这些结果,判断大模型2的结果和大模型1哪个更好。满分为3分。"
                           "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
                           "对比时请关注结果和参考答案的契合度。"
                           "评分标准为:认为大模型2的结果更好——3分。认为两者结果持平——2分。"
                           "认为大模型1的结果更好——1分。"
                           "回复格式为:理由:xxx。因此,评分为x分。")

        if self.eval_mode == "diff":
            # A comparison needs the baseline answer shown as "model 1".
            if origin_model_result is None:
                raise ValueError("The original model result is required in 'diff' mode.")
            return base_prompt + '\n' + (f"问题:{question}\n\n大模型1生成的结果:{origin_model_result}\n\n"
                                         f"大模型2生成的结果:{model_result}\n\n参考答案:{reference}")

        return base_prompt + '\n' + f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
|