You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

73 lines
4.5 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import google.generativeai as genai
from scoring.llm_scorer import extract_score, LanguageModelScorer
import time
def request_gemini(gemini_model, prompt, retries=3):
    """Send ``prompt`` to a Gemini model, retrying on failure.

    Args:
        gemini_model: Object exposing ``generate_content(prompt)`` that
            returns a response with a ``.text`` attribute.
        prompt: Text prompt to send to the model.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response text on success, or ``None`` if every attempt failed
        (caller is expected to fall back to a default score).
    """

    def ordinal(n):
        # English ordinal suffix: 11th-13th are irregular (n % 100 in
        # 10..20), otherwise the suffix follows the last digit.
        if 10 <= n % 100 <= 20:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
        return str(n) + suffix

    for attempt in range(retries):
        try:
            response = gemini_model.generate_content(prompt)
            return response.text
        except Exception as e:
            # Best-effort retry: log the failure, wait briefly, try again.
            print(f"\nAn error occurred while scoring with Gemini: {e}, it's the {ordinal(attempt + 1)} time.")
            time.sleep(1)
    print("Failed to get response from Gemini. Use default score.")
    return None
class GeminiScorer(LanguageModelScorer):
    """LLM-as-judge scorer that grades answers with Google's Gemini model.

    Builds a Chinese-language grading prompt according to the inherited
    ``eval_mode`` ("accuracy", "fluency", or "diff") and asks the
    ``gemini-pro`` model to produce a score.
    """

    def __init__(self, api_key):
        """Configure the Gemini SDK and instantiate the judge model.

        Args:
            api_key: Google Generative AI API key.
        """
        super().__init__()
        genai.configure(api_key=api_key)
        self.api_key = api_key
        self.model = genai.GenerativeModel('gemini-pro')

    def score_with_llm(self, question, model_result, reference, origin_model_result=None):
        """Ask Gemini to grade ``model_result`` against ``reference``.

        Args:
            question: The question posed to the model under evaluation.
            model_result: Answer produced by the model under evaluation.
            reference: Reference (gold) answer.
            origin_model_result: Baseline model's answer; required only
                when ``eval_mode`` is "diff".

        Returns:
            ``(response_text, score)``; on any failure the default
            ``(None, '2')`` is returned (score stays a string — the
            extract_score contract apparently uses strings; verify).
        """
        prompt = self.generate_scoring_prompt(question, model_result, reference, origin_model_result)
        try:
            gemini_response = request_gemini(self.model, prompt, retries=5)
            gemini_score = extract_score(gemini_response)
            return gemini_response, gemini_score
        except Exception as e:
            print("\nAn error occurred while extract score:", e)
            return None, '2'

    def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None):
        """Build the grading prompt for the current ``eval_mode``.

        The rubric texts are runtime strings sent to the judge model and
        are intentionally kept in Chinese, byte-for-byte.

        Raises:
            ValueError: If ``eval_mode`` is not one of "accuracy",
                "fluency", "diff", or if ``origin_model_result`` is
                missing in "diff" mode.
        """
        # Generate the scoring prompt: pick the rubric for the active mode.
        if self.eval_mode == "accuracy":
            base_prompt = ("你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
                           "请对比参考答案和大模型生成结果从信息准确性的角度评分以下生成的结果以评估其质量。满分为4分。"
                           "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
                           "评分标准为模型回答正确——4分。模型回答模糊但部分准确——3分。"
                           "模型无法给出解答但明确表示无法解答——2分。模型给出错误或无法理解的回答/模型回答语句不完整——1分。"
                           "回复格式为理由xxx。因此评分为x分。")
        elif self.eval_mode == "fluency":
            base_prompt = ("你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
                           "请从语言流畅度的角度评分大模型生成的结果以评估其质量。满分为3分。"
                           "评分标准为模型回答流畅符合日常语言习惯——3分。模型回答流畅但存在突然中断等情况——2分。"
                           "模型回答无条理可能重复输出某些单词——1分。"
                           "回复格式为理由xxx。因此评分为x分。")
        elif self.eval_mode == "diff":
            base_prompt = ("你是一个汽车领域专家接下来将向你提供一个问题、一个参考答案、一个大模型1生成的结果和一个大模型2生成的结果。"
                           "请对比这些结果判断大模型2的结果和大模型1哪个更好。满分为3分。"
                           "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
                           "对比时请关注结果和参考答案的契合度。"
                           "评分标准为认为大模型2的结果更好——3分。认为两者结果持平——2分。"
                           "认为大模型1的结果更好——1分。"
                           "回复格式为理由xxx。因此评分为x分。")
        else:
            # Previously an unknown mode silently fell through with an empty
            # rubric, producing a prompt the judge cannot grade against.
            raise ValueError(f"Unsupported eval_mode: {self.eval_mode!r}")

        if self.eval_mode == "diff":
            if origin_model_result is None:
                raise ValueError("The original model result is required in 'diff' mode.")
            prompt = base_prompt + '\n' + (f"问题:{question}\n\n大模型1生成的结果{origin_model_result}\n\n"
                                           f"大模型2生成的结果{model_result}\n\n参考答案:{reference}")
        else:
            prompt = base_prompt + '\n' + f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
        return prompt