fix: 修复评估时模型输出json格式不对导致读取错误的问题 (#133)

* feature: add cot data evaluation function
* fix: added verification to evaluation results
* fix: fix the prompt for evaluating
* fix: 修复当评估结果为空导致读取失败的问题
2025-12-04 18:49:50 +08:00
parent 31c4966608
commit 744d15ba24
14 changed files with 373 additions and 219 deletions
--- a/runtime/datamate-python/app/module/generation/service/generation_service.py
+++ b/runtime/datamate-python/app/module/generation/service/generation_service.py
@@ -28,6 +28,7 @@ from app.db.models.data_synthesis import (
 from app.db.models.dataset_management import DatasetFiles
 from app.db.models.model_config import get_model_by_id
 from app.db.session import logger
+from app.module.shared.util.model_chat import _extract_json_substring
 from app.module.system.service.common_service import get_chat_client, chat


@@ -365,7 +366,7 @@ class GenerationService:
            return

        # 1. 预处理原始回答：尝试从中截取出最可能的 JSON 片段
-        cleaned = self._extract_json_substring(raw_answer)
+        cleaned = _extract_json_substring(raw_answer)

        # 2. 解析 JSON，统一成列表结构
        try:
@@ -426,45 +427,6 @@ class GenerationService:
        await self.db.commit()
        await self.db.refresh(file_instance)

-    @staticmethod
-    def _extract_json_substring(raw: str) -> str:
-        """从 LLM 的原始回答中提取最可能的 JSON 字符串片段。
-
-        处理思路：
-        - 原始回答可能是：说明文字 + JSON + 说明文字，甚至带有 Markdown 代码块。
-        - 优先在文本中查找第一个 '{' 或 '[' 作为 JSON 起始；
-        - 再从后向前找最后一个 '}' 或 ']' 作为结束；
-        - 如果找不到合适的边界，就退回原始字符串。
-        该方法不会保证截取的一定是合法 JSON，但能显著提高 json.loads 的成功率。
-        """
-        if not raw:
-            return raw
-
-        start = None
-        end = None
-
-        # 查找第一个 JSON 起始符号
-        for i, ch in enumerate(raw):
-            if ch in "[{":
-                start = i
-                break
-
-        # 查找最后一个 JSON 结束符号
-        for i in range(len(raw) - 1, -1, -1):
-            if raw[i] in "]}":
-                end = i + 1  # 切片是左闭右开
-                break
-
-        if start is not None and end is not None and start < end:
-            return raw[start:end].strip()
-
-        # 兜底：去掉常见 Markdown 包裹（```json ... ```）
-        stripped = raw.strip()
-        if stripped.startswith("```"):
-            # 去掉首尾 ``` 标记
-            stripped = stripped.strip("`")
-        return stripped
-
    async def _get_or_create_file_instance(
        self,
        synthesis_task_id: str,