fix: resolve read errors caused by malformed JSON in model output during evaluation (#133)

* feature: add cot data evaluation function

* fix: added verification to evaluation results

* fix: fix the prompt for evaluating

* fix: resolve read failures when the evaluation result is empty
Author: hefanli
Date: 2025-12-04 18:49:50 +08:00
Committed by: GitHub
Parent: 31c4966608
Commit: 744d15ba24
14 changed files with 373 additions and 219 deletions
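
The core of this fix is a retry-and-validate loop in the evaluation executor (see the EvaluationExecutor hunk further down): the raw model answer is first trimmed to its JSON substring, parsed, and re-requested up to three times if parsing still fails. A minimal standalone sketch of that pattern, assuming a generic call_model() in place of the project's call_openai_style_model() and a simplified extract_json_substring() helper:

import json
import logging

logger = logging.getLogger(__name__)

def extract_json_substring(text: str) -> str:
    # Keep only the outermost {...} block; models often wrap JSON in prose or markdown fences.
    start, end = text.find("{"), text.rfind("}")
    return text[start:end + 1] if start != -1 and end > start else text

def evaluate_with_retries(call_model, prompt: str, max_try: int = 3):
    # call_model is a stand-in for the project's call_openai_style_model wrapper.
    while max_try > 0:
        raw = call_model(prompt)
        candidate = extract_json_substring(raw)
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as exc:
            logger.error("LLM answer is not valid JSON: %s. Raw answer: %r", exc, raw)
            max_try -= 1
    return None  # caller decides how to mark the item (e.g. leave it unfinished)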

View File

@@ -1,5 +1,6 @@
import asyncio
import uuid
import math
import json
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks
@@ -171,7 +172,7 @@ async def list_evaluation_tasks(
# 转换为响应模型
items = [_map_to_task_detail_response(task) for task in tasks]
total_pages = (total + size - 1) // size if size > 0 else 0
total_pages = math.ceil(total / size) if total > 0 else 0
return StandardResponse(
code=200,
@@ -217,7 +218,7 @@ async def list_evaluation_items(
count_query = select(func.count()).select_from(query.subquery())
total = (await db.execute(count_query)).scalar_one()
files = (await db.execute(query.offset(offset).limit(size))).scalars().all()
total_pages = (total + size - 1) // size if size > 0 else 0
total_pages = math.ceil(total / size) if total > 0 else 0
file_responses = [
EvaluationFileResponse(
taskId=file.task_id,
@@ -298,7 +299,7 @@ async def list_evaluation_items(
taskId=item.task_id,
itemId=item.item_id,
fileId=item.file_id,
evalContent=json.loads(item.eval_content),
evalContent=json.loads(item.eval_content) if item.eval_content else None,
evalScore=float(item.eval_score) if item.eval_score else None,
evalResult=json.loads(item.eval_result),
status=item.status
@@ -306,7 +307,7 @@ async def list_evaluation_items(
for item in items
]
total_pages = (total + size - 1) // size if size > 0 else 0
total_pages = math.ceil(total / size) if total > 0 else 0
return StandardResponse(
code=200,
@@ -387,6 +388,12 @@ async def delete_eval_tasks(
.where(EvaluationItem.task_id == task_id)
)
# 删除评估文件
await db.execute(
EvaluationFile.__table__.delete()
.where(EvaluationFile.task_id == task_id)
)
# 删除任务
await db.delete(task)
await db.commit()
@@ -419,6 +426,7 @@ def _map_to_task_detail_response(
sourceId=task.source_id,
sourceName=task.source_name,
status=task.status,
evalMethod=task.eval_method,
evalProcess=task.eval_process,
evalPrompt=task.eval_prompt,
evalConfig=json.loads(task.eval_config),
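
A side note on the pagination change in this file: for a positive size, math.ceil(total / size) and the previous (total + size - 1) // size return the same page count, and both yield 0 when total is 0. The guard moves from size > 0 to total > 0, which presumably relies on size being a validated, positive query parameter (a size of 0 with a non-zero total would now raise ZeroDivisionError). A purely illustrative check:

import math

# Illustrative only: the old and new page-count formulas agree for positive size.
for size in (1, 7, 10):
    for total in (0, 1, size - 1, size, size + 1, 5 * size + 3):
        old = (total + size - 1) // size if size > 0 else 0
        new = math.ceil(total / size) if total > 0 else 0
        assert old == new, (total, size, old, new)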

View File

@@ -36,6 +36,7 @@ class EvaluationTaskItem(BaseModel):
source_id: Optional[str] = Field(..., alias="sourceId", description="数据源ID")
source_name: Optional[str] = Field(None, alias="sourceName", description="数据源名称")
status: TaskStatus = Field(..., description="任务状态")
eval_method: Optional[str] = Field(None, alias="evalMethod", description="评估方式")
eval_process: Optional[float] = Field(0, alias="evalProcess", description="评估进度")
created_at: Optional[str] = Field(None, alias="createdAt", description="创建时间")
updated_at: Optional[str] = Field(None, alias="updatedAt", description="更新时间")

View File

@@ -1,3 +1,7 @@
from app.core.logging import get_logger
logger = get_logger(__name__)
EVALUATION_PROMPT_TEMPLATE = [
{
"evalType": "QA",
@@ -51,26 +55,90 @@ EVALUATION_PROMPT_TEMPLATE = [
请按照以下JSON格式输出评估结果,评估结果为Y/N,符合标准输出Y,不符合标准输出N:
{
"result": {{result_example}
"result": {
{result_example}
},
"evaluation": "这是一个高质量的问答数据集。问题表述清晰具体,答案准确完整且逻辑性强,与原始文本高度相关。建议:可以进一步丰富答案的细节描述。"
}
"""
},
{
"evalType": "COT",
"defaultDimensions": [
{
"dimension": "思维链逻辑是否连贯",
"description": "分析思维链中推理链条的连续性:步骤间有明确的逻辑连接词;每一步都是基于前置在步骤的结果;没有逻辑跳跃或断层;推理方向一致,不偏离目标。"
},
{
"dimension": "推理步骤是否合理必要",
"description": "分析思维链中对于步骤分解的合理性和必要性:复杂问题被适当分解; 每个步骤都是解决整体问题的必要部分;步骤粒度适中(既不过细也不过粗);符合人类认知习惯。"
},
{
"dimension": "内容是否准确",
"description": "分析整个COT数据内容是否准确:所有陈述的事实必须准确;展示每一步的计算结果(如何涉及数学计算,必须保证数学计算无错误);逻辑推导有效且合理,最终答案与推理过程一致。"
}
],
"prompt": """
# Role: COT数据质量评估专家
## Profile:
- Description: 你是一名专业的Chain-of-Thought(CoT)推理数据质量评估专家,擅长从多个维度对COT数据进行质量评估,挑选出有助于模型学习如何分解问题、展示推理链条,提高模型对于复杂问题解决能力的COT数据。具备深度学习、自然语言处理和数据科学的专业背景。
## Skills:
1. 能够从多个维度对COT数据进行综合评估,保证客观、专业、细致
2. 擅长识别COT数据中的潜在问题,如推理包含事实性错误(关键信息错误),存在严重逻辑矛盾(无法自洽),包含有害、偏见或不当内容,完全偏离主题,抄袭或高度重复内容等
3. 能够给出具体的改进建议和质量评分,并提供可操作的优化方案
## 评估维度:
{dimensions}
## 问题或指令:
{question}
## 思维链:
{chain_of_thought}
## 结论:
{conclusion}
## 注意事项:
- 评估结论要具体指出优点和不足,提供可操作的改进建议
- 评估结论控制在150字以内,简洁明了但要涵盖关键信息
## 输出要求:
请按照以下JSON格式输出评估结果,评估结果为Y/N,符合标准输出Y,不符合标准输出N;将评估结论写到evaluation中:
{
"result": {
{result_example}
},
"evaluation": "这是一个高质量的COT数据。思维链逻辑连贯,推理步骤合理,信息完整。建议:部分表达可以进一步优化,以及个别步骤的过渡可以更加平滑。"
}
"""
}
]
def get_dimensions_for_qa(dimensions: list[dict]) -> str:
dimensions_str = "\n"
dimensions_str = ""
index = 1
for dimension in dimensions:
dimensions_str += f"### {index}. {dimension.get("dimension")}\n**评估标准:**\n{dimension.get("description")}\n\n"
if index > 1:
dimensions_str += "\n"
dimensions_str += f"### {index}. {dimension.get("dimension")}\n**评估标准:**\n{dimension.get("description")}"
if index < len(dimensions):
dimensions_str += "\n"
index += 1
return dimensions_str
def get_result_example_for_qa(dimensions: list[dict]) -> str:
result_example = ""
index = 1
for dimension in dimensions:
result_example += f'\n "{dimension.get("dimension")}": "Y",'
if index > 1:
result_example += "\n "
result_example += f'"{dimension.get("dimension")}": "Y"'
if index < len(dimensions):
result_example += ","
index += 1
return result_example
def get_prompt(task_type: str, dimensions: list[dict]) -> str:

View File

@@ -1,7 +1,7 @@
"""
Schema for evaluation prompt templates.
"""
from typing import List, Dict, Any
from typing import List
from pydantic import BaseModel, Field

View File

@@ -2,7 +2,7 @@ import json
import uuid
import asyncio
from sqlalchemy import select
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.exception import BusinessErrorCodeEnum, BusinessException
@@ -13,7 +13,7 @@ from app.db.models.data_synthesis import DataSynthesisFileInstance, SynthesisDat
from app.db.session import AsyncSessionLocal
from app.module.evaluation.schema.evaluation import SourceType
from app.module.shared.schema import TaskStatus
from app.module.shared.util.model_chat import call_openai_style_model
from app.module.shared.util.model_chat import call_openai_style_model, _extract_json_substring
from app.module.evaluation.schema.prompt import get_prompt
from app.module.shared.util.structured_file import StructuredFileHandlerFactory
from app.module.system.service.common_service import get_model_by_id
@@ -35,6 +35,10 @@ class EvaluationExecutor:
prompt_text = ((prompt_text.replace("{content}", eval_content.get("input"))
.replace("{question}", eval_content.get("instruction")))
.replace("{answer}", eval_content.get("output")))
if self.task.task_type == "COT":
prompt_text = ((prompt_text.replace("{question}", eval_content.get("question"))
.replace("{conclusion}", eval_content.get("conclusion")))
.replace("{chain_of_thought}", eval_content.get("chain_of_thought")))
return prompt_text
async def execute(self):
@@ -44,29 +48,44 @@ class EvaluationExecutor:
files = (await self.db.execute(
select(EvaluationFile).where(EvaluationFile.task_id == self.task.id)
)).scalars().all()
query = select(EvaluationItem).where(EvaluationItem.task_id == self.task.id)
count_query = select(func.count()).select_from(query.subquery())
total = (await self.db.execute(count_query)).scalar_one()
evaluated_count = 0
for file in files:
items = (await self.db.execute(
select(EvaluationItem).where(EvaluationItem.task_id == self.task.id)
.where(EvaluationItem.file_id == file.file_id)
)).scalars().all()
items = (await self.db.execute(query.where(EvaluationItem.file_id == file.file_id))).scalars().all()
tasks = [
self.evaluate_item(model_config, item, semaphore)
for item in items
]
await asyncio.gather(*tasks, return_exceptions=True)
file.evaluated_count = len(items)
evaluated_count += file.evaluated_count
self.task.eval_process = evaluated_count / total
await self.db.commit()
async def evaluate_item(self, model_config, item: EvaluationItem, semaphore: asyncio.Semaphore):
async with semaphore:
prompt_text = self.get_eval_prompt(item)
resp_text = await asyncio.to_thread(
call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
prompt_text,
)
item.eval_result = resp_text
item.status = TaskStatus.COMPLETED.value
await self.db.commit()
max_try = 3
while max_try > 0:
prompt_text = self.get_eval_prompt(item)
resp_text = await asyncio.to_thread(
call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
prompt_text,
)
resp_text = _extract_json_substring(resp_text)
try:
json.loads(resp_text)
except Exception as e:
logger.error(
f"Failed to parse LLM answer as JSON for task={self.task.id}, file={item.file_id}: {e}. Raw answer: {resp_text!r}"
)
max_try -= 1
continue
item.eval_result = resp_text
item.status = TaskStatus.COMPLETED.value
await self.db.commit()
return
def get_source_type(self) -> SourceType:
@@ -119,7 +138,7 @@ class SynthesisEvaluationExecutor(EvaluationExecutor):
async def save_eval_items(self):
synthesis_files = ((await self.db.execute(select(DataSynthesisFileInstance)
.where(DataSynthesisFileInstance.task_id == self.task.source_id)))
.where(DataSynthesisFileInstance.synthesis_instance_id == self.task.source_id)))
.scalars().all())
for synthesis_file in synthesis_files:
synthesis_datas = ((await self.db.execute(select(SynthesisData)
@@ -132,7 +151,7 @@ class SynthesisEvaluationExecutor(EvaluationExecutor):
task_id=self.task.id,
file_id=synthesis_file.id,
item_id=synthesis_data.id,
eval_content=synthesis_data.data,
eval_content=json.dumps(synthesis_data.data),
status=TaskStatus.PENDING.value,
created_by=self.task.created_by,
updated_by=self.task.updated_by,
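
This last hunk pairs with the guarded read in the router above (json.loads(item.eval_content) if item.eval_content else None): eval_content is now always written as a JSON string and read back defensively when it may be empty. A small round-trip sketch of that convention, using an illustrative stand-in for the EvaluationItem row rather than the actual ORM model:

import json
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class EvalItemRow:
    # Stand-in for the EvaluationItem ORM row; eval_content is stored as TEXT.
    eval_content: Optional[str] = None

def store_eval_content(row: EvalItemRow, data: dict[str, Any]) -> None:
    # Writer side: always serialize, so the column never mixes raw dicts and strings.
    row.eval_content = json.dumps(data, ensure_ascii=False)

def read_eval_content(row: EvalItemRow) -> Optional[dict[str, Any]]:
    # Reader side: mirrors the router's json.loads(...) if ... else None guard.
    return json.loads(row.eval_content) if row.eval_content else None

row = EvalItemRow()
store_eval_content(row, {"question": "1+1=?", "conclusion": "2"})
assert read_eval_content(row) == {"question": "1+1=?", "conclusion": "2"}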