fix: resolve read errors caused by malformed JSON in model output during evaluation (#133)

* feature: add cot data evaluation function

* fix: added verification to evaluation results

* fix: fix the prompt for evaluating

* fix: resolve read failures when the evaluation result is empty
Author: hefanli
Date: 2025-12-04 18:49:50 +08:00
Committed by: GitHub
Parent: 31c4966608
Commit: 744d15ba24
14 changed files with 373 additions and 219 deletions
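
The core of this fix is a retry-and-validate loop in the evaluation executor (see the EvaluationExecutor hunk further down): the raw model answer is first trimmed to its JSON substring, parsed, and re-requested up to three times if parsing still fails. A minimal standalone sketch of that pattern, assuming a generic call_model() in place of the project's call_openai_style_model() and a simplified extract_json_substring() helper:

import json
import logging

logger = logging.getLogger(__name__)

def extract_json_substring(text: str) -> str:
    # Keep only the outermost {...} block; models often wrap JSON in prose or markdown fences.
    start, end = text.find("{"), text.rfind("}")
    return text[start:end + 1] if start != -1 and end > start else text

def evaluate_with_retries(call_model, prompt: str, max_try: int = 3):
    # call_model is a stand-in for the project's call_openai_style_model wrapper.
    while max_try > 0:
        raw = call_model(prompt)
        candidate = extract_json_substring(raw)
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as exc:
            logger.error("LLM answer is not valid JSON: %s. Raw answer: %r", exc, raw)
            max_try -= 1
    return None  # caller decides how to mark the item (e.g. leave it unfinished)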

View File

@@ -1,5 +1,6 @@
import asyncio
import uuid
import math
import json
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks
@@ -171,7 +172,7 @@ async def list_evaluation_tasks(
# 转换为响应模型
items = [_map_to_task_detail_response(task) for task in tasks]
total_pages = (total + size - 1) // size if size > 0 else 0
total_pages = math.ceil(total / size) if total > 0 else 0
return StandardResponse(
code=200,
@@ -217,7 +218,7 @@ async def list_evaluation_items(
count_query = select(func.count()).select_from(query.subquery())
total = (await db.execute(count_query)).scalar_one()
files = (await db.execute(query.offset(offset).limit(size))).scalars().all()
total_pages = (total + size - 1) // size if size > 0 else 0
total_pages = math.ceil(total / size) if total > 0 else 0
file_responses = [
EvaluationFileResponse(
taskId=file.task_id,
@@ -298,7 +299,7 @@ async def list_evaluation_items(
taskId=item.task_id,
itemId=item.item_id,
fileId=item.file_id,
evalContent=json.loads(item.eval_content),
evalContent=json.loads(item.eval_content) if item.eval_content else None,
evalScore=float(item.eval_score) if item.eval_score else None,
evalResult=json.loads(item.eval_result),
status=item.status
@@ -306,7 +307,7 @@ async def list_evaluation_items(
for item in items
]
total_pages = (total + size - 1) // size if size > 0 else 0
total_pages = math.ceil(total / size) if total > 0 else 0
return StandardResponse(
code=200,
@@ -387,6 +388,12 @@ async def delete_eval_tasks(
.where(EvaluationItem.task_id == task_id)
)
# 删除评估文件
await db.execute(
EvaluationFile.__table__.delete()
.where(EvaluationFile.task_id == task_id)
)
# 删除任务
await db.delete(task)
await db.commit()
@@ -419,6 +426,7 @@ def _map_to_task_detail_response(
sourceId=task.source_id,
sourceName=task.source_name,
status=task.status,
evalMethod=task.eval_method,
evalProcess=task.eval_process,
evalPrompt=task.eval_prompt,
evalConfig=json.loads(task.eval_config),
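
A side note on the pagination change in this file: for a positive size, math.ceil(total / size) and the previous (total + size - 1) // size return the same page count, and both yield 0 when total is 0. The guard moves from size > 0 to total > 0, which presumably relies on size being a validated, positive query parameter (a size of 0 with a non-zero total would now raise ZeroDivisionError). A purely illustrative check:

import math

# Illustrative only: the old and new page-count formulas agree for positive size.
for size in (1, 7, 10):
    for total in (0, 1, size - 1, size, size + 1, 5 * size + 3):
        old = (total + size - 1) // size if size > 0 else 0
        new = math.ceil(total / size) if total > 0 else 0
        assert old == new, (total, size, old, new)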

View File

@@ -36,6 +36,7 @@ class EvaluationTaskItem(BaseModel):
source_id: Optional[str] = Field(..., alias="sourceId", description="数据源ID")
source_name: Optional[str] = Field(None, alias="sourceName", description="数据源名称")
status: TaskStatus = Field(..., description="任务状态")
eval_method: Optional[str] = Field(None, alias="evalMethod", description="评估方式")
eval_process: Optional[float] = Field(0, alias="evalProcess", description="评估进度")
created_at: Optional[str] = Field(None, alias="createdAt", description="创建时间")
updated_at: Optional[str] = Field(None, alias="updatedAt", description="更新时间")

View File

@@ -1,3 +1,7 @@
from app.core.logging import get_logger
logger = get_logger(__name__)
EVALUATION_PROMPT_TEMPLATE = [
{
"evalType": "QA",
@@ -51,26 +55,90 @@ EVALUATION_PROMPT_TEMPLATE = [
请按照以下JSON格式输出评估结果,评估结果为Y/N,符合标准输出Y,不符合标准输出N:
{
"result": {{result_example}
"result": {
{result_example}
},
"evaluation": "这是一个高质量的问答数据集。问题表述清晰具体,答案准确完整且逻辑性强,与原始文本高度相关。建议:可以进一步丰富答案的细节描述。"
}
"""
},
{
"evalType": "COT",
"defaultDimensions": [
{
"dimension": "思维链逻辑是否连贯",
"description": "分析思维链中推理链条的连续性:步骤间有明确的逻辑连接词;每一步都是基于前置在步骤的结果;没有逻辑跳跃或断层;推理方向一致,不偏离目标。"
},
{
"dimension": "推理步骤是否合理必要",
"description": "分析思维链中对于步骤分解的合理性和必要性:复杂问题被适当分解; 每个步骤都是解决整体问题的必要部分;步骤粒度适中(既不过细也不过粗);符合人类认知习惯。"
},
{
"dimension": "内容是否准确",
"description": "分析整个COT数据内容是否准确:所有陈述的事实必须准确;展示每一步的计算结果(如何涉及数学计算,必须保证数学计算无错误);逻辑推导有效且合理,最终答案与推理过程一致。"
}
],
"prompt": """
# Role: COT数据质量评估专家
## Profile:
- Description: 你是一名专业的Chain-of-Thought(CoT)推理数据质量评估专家,擅长从多个维度对COT数据进行质量评估,挑选出有助于模型学习如何分解问题、展示推理链条,提高模型对于复杂问题解决能力的COT数据。具备深度学习、自然语言处理和数据科学的专业背景。
## Skills:
1. 能够从多个维度对COT数据进行综合评估,保证客观、专业、细致
2. 擅长识别COT数据中的潜在问题,如推理包含事实性错误(关键信息错误),存在严重逻辑矛盾(无法自洽),包含有害、偏见或不当内容,完全偏离主题,抄袭或高度重复内容等
3. 能够给出具体的改进建议和质量评分,并提供可操作的优化方案
## 评估维度:
{dimensions}
## 问题或指令:
{question}
## 思维链:
{chain_of_thought}
## 结论:
{conclusion}
## 注意事项:
- 评估结论要具体指出优点和不足,提供可操作的改进建议
- 评估结论控制在150字以内,简洁明了但要涵盖关键信息
## 输出要求:
请按照以下JSON格式输出评估结果,评估结果为Y/N,符合标准输出Y,不符合标准输出N;将评估结论写到evaluation中:
{
"result": {
{result_example}
},
"evaluation": "这是一个高质量的COT数据。思维链逻辑连贯,推理步骤合理,信息完整。建议:部分表达可以进一步优化,以及个别步骤的过渡可以更加平滑。"
}
"""
}
]
def get_dimensions_for_qa(dimensions: list[dict]) -> str:
dimensions_str = "\n"
dimensions_str = ""
index = 1
for dimension in dimensions:
dimensions_str += f"### {index}. {dimension.get("dimension")}\n**评估标准:**\n{dimension.get("description")}\n\n"
if index > 1:
dimensions_str += "\n"
dimensions_str += f"### {index}. {dimension.get("dimension")}\n**评估标准:**\n{dimension.get("description")}"
if index < len(dimensions):
dimensions_str += "\n"
index += 1
return dimensions_str
def get_result_example_for_qa(dimensions: list[dict]) -> str:
result_example = ""
index = 1
for dimension in dimensions:
result_example += f'\n "{dimension.get("dimension")}": "Y",'
if index > 1:
result_example += "\n "
result_example += f'"{dimension.get("dimension")}": "Y"'
if index < len(dimensions):
result_example += ","
index += 1
return result_example
def get_prompt(task_type: str, dimensions: list[dict]) -> str:

View File

@@ -1,7 +1,7 @@
"""
Schema for evaluation prompt templates.
"""
from typing import List, Dict, Any
from typing import List
from pydantic import BaseModel, Field

View File

@@ -2,7 +2,7 @@ import json
import uuid
import asyncio
from sqlalchemy import select
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.exception import BusinessErrorCodeEnum, BusinessException
@@ -13,7 +13,7 @@ from app.db.models.data_synthesis import DataSynthesisFileInstance, SynthesisDat
from app.db.session import AsyncSessionLocal
from app.module.evaluation.schema.evaluation import SourceType
from app.module.shared.schema import TaskStatus
from app.module.shared.util.model_chat import call_openai_style_model
from app.module.shared.util.model_chat import call_openai_style_model, _extract_json_substring
from app.module.evaluation.schema.prompt import get_prompt
from app.module.shared.util.structured_file import StructuredFileHandlerFactory
from app.module.system.service.common_service import get_model_by_id
@@ -35,6 +35,10 @@ class EvaluationExecutor:
prompt_text = ((prompt_text.replace("{content}", eval_content.get("input"))
.replace("{question}", eval_content.get("instruction")))
.replace("{answer}", eval_content.get("output")))
if self.task.task_type == "COT":
prompt_text = ((prompt_text.replace("{question}", eval_content.get("question"))
.replace("{conclusion}", eval_content.get("conclusion")))
.replace("{chain_of_thought}", eval_content.get("chain_of_thought")))
return prompt_text
async def execute(self):
@@ -44,29 +48,44 @@ class EvaluationExecutor:
files = (await self.db.execute(
select(EvaluationFile).where(EvaluationFile.task_id == self.task.id)
)).scalars().all()
query = select(EvaluationItem).where(EvaluationItem.task_id == self.task.id)
count_query = select(func.count()).select_from(query.subquery())
total = (await self.db.execute(count_query)).scalar_one()
evaluated_count = 0
for file in files:
items = (await self.db.execute(
select(EvaluationItem).where(EvaluationItem.task_id == self.task.id)
.where(EvaluationItem.file_id == file.file_id)
)).scalars().all()
items = (await self.db.execute(query.where(EvaluationItem.file_id == file.file_id))).scalars().all()
tasks = [
self.evaluate_item(model_config, item, semaphore)
for item in items
]
await asyncio.gather(*tasks, return_exceptions=True)
file.evaluated_count = len(items)
evaluated_count += file.evaluated_count
self.task.eval_process = evaluated_count / total
await self.db.commit()
async def evaluate_item(self, model_config, item: EvaluationItem, semaphore: asyncio.Semaphore):
async with semaphore:
prompt_text = self.get_eval_prompt(item)
resp_text = await asyncio.to_thread(
call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
prompt_text,
)
item.eval_result = resp_text
item.status = TaskStatus.COMPLETED.value
await self.db.commit()
max_try = 3
while max_try > 0:
prompt_text = self.get_eval_prompt(item)
resp_text = await asyncio.to_thread(
call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
prompt_text,
)
resp_text = _extract_json_substring(resp_text)
try:
json.loads(resp_text)
except Exception as e:
logger.error(
f"Failed to parse LLM answer as JSON for task={self.task.id}, file={item.file_id}: {e}. Raw answer: {resp_text!r}"
)
max_try -= 1
continue
item.eval_result = resp_text
item.status = TaskStatus.COMPLETED.value
await self.db.commit()
return
def get_source_type(self) -> SourceType:
@@ -119,7 +138,7 @@ class SynthesisEvaluationExecutor(EvaluationExecutor):
async def save_eval_items(self):
synthesis_files = ((await self.db.execute(select(DataSynthesisFileInstance)
.where(DataSynthesisFileInstance.task_id == self.task.source_id)))
.where(DataSynthesisFileInstance.synthesis_instance_id == self.task.source_id)))
.scalars().all())
for synthesis_file in synthesis_files:
synthesis_datas = ((await self.db.execute(select(SynthesisData)
@@ -132,7 +151,7 @@ class SynthesisEvaluationExecutor(EvaluationExecutor):
task_id=self.task.id,
file_id=synthesis_file.id,
item_id=synthesis_data.id,
eval_content=synthesis_data.data,
eval_content=json.dumps(synthesis_data.data),
status=TaskStatus.PENDING.value,
created_by=self.task.created_by,
updated_by=self.task.updated_by,
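
This last hunk pairs with the guarded read in the router above (json.loads(item.eval_content) if item.eval_content else None): eval_content is now always written as a JSON string and read back defensively when it may be empty. A small round-trip sketch of that convention, using an illustrative stand-in for the EvaluationItem row rather than the actual ORM model:

import json
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class EvalItemRow:
    # Stand-in for the EvaluationItem ORM row; eval_content is stored as TEXT.
    eval_content: Optional[str] = None

def store_eval_content(row: EvalItemRow, data: dict[str, Any]) -> None:
    # Writer side: always serialize, so the column never mixes raw dicts and strings.
    row.eval_content = json.dumps(data, ensure_ascii=False)

def read_eval_content(row: EvalItemRow) -> Optional[dict[str, Any]]:
    # Reader side: mirrors the router's json.loads(...) if ... else None guard.
    return json.loads(row.eval_content) if row.eval_content else None

row = EvalItemRow()
store_eval_content(row, {"question": "1+1=?", "conclusion": "2"})
assert read_eval_content(row) == {"question": "1+1=?", "conclusion": "2"}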