From 27b1cc8e096080aac755172d1eb95470b220f09f Mon Sep 17 00:00:00 2001 From: Dallas98 <40557804+Dallas98@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:19:54 +0800 Subject: [PATCH] feat(model_chat): enhance JSON parsing by removing additional thought tags and improving fallback logic (#178) * fix(chart): update Helm chart helpers and values for improved configuration * feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths * feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthDataDetail): add delete action for chunks with confirmation prompt * feat(SynthDataDetail): update edit and delete buttons to icon-only format * feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion * feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection * feat(DataSynthesis): refactor data synthesis models and update task handling logic * feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic * feat(DataSynthesis): refactor data synthesis models and update task handling logic * fix(generation_service): ensure processed chunks are incremented regardless of question generation success * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(model_chat): enhance JSON parsing by removing additional thought tags and improving 
fallback logic --- .../app/module/shared/util/model_chat.py | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/runtime/datamate-python/app/module/shared/util/model_chat.py b/runtime/datamate-python/app/module/shared/util/model_chat.py index bac586d..f47b5f1 100644 --- a/runtime/datamate-python/app/module/shared/util/model_chat.py +++ b/runtime/datamate-python/app/module/shared/util/model_chat.py @@ -24,18 +24,41 @@ def extract_json_substring(raw: str) -> str: - 再从后向前找最后一个 '}' 或 ']' 作为结束; - 如果找不到合适的边界,就退回原始字符串。 - 部分模型可能会在回复中加入 `<think>...</think>` 内部思考内容,应在解析前先去除。 + - 也有模型会在 JSON 前后增加如 <analysis>...</analysis>、<reasoning>...</reasoning> 等标签,本方法会一并去除。 该方法不会保证截取的一定是合法 JSON,但能显著提高 json.loads 的成功率。 """ if not raw: return raw - # 先移除所有 <think>...</think> 段落(包括跨多行的情况) try: import re - raw = re.sub(r"<think>[\s\S]*?</think>", "", raw, flags=re.IGNORECASE) + # 1. 先把所有完整的思考标签块整体去掉:<think>...</think> 等 + thought_tags = [ + "think", + "thinking", + "analysis", + "reasoning", + "reflection", + "inner_thoughts", + ] + for tag in thought_tags: + pattern = rf"<{tag}>[\s\S]*?</{tag}>" + raw = re.sub(pattern, "", raw, flags=re.IGNORECASE) + + # 2. 再做一次“截取最后一个 </think>(或其它思考标签结束)之后的内容”的兜底 + # 这样就算标签不成对或嵌套异常,也能保留尾部真正的回答 + last_pos = -1 + for tag in thought_tags: + # 匹配类似 </think> 或 </THINK> + m = list(re.finditer(rf"</{tag}>", raw, flags=re.IGNORECASE)) + if m: + last_pos = max(last_pos, m[-1].end()) + if last_pos != -1 and last_pos < len(raw): + raw = raw[last_pos:] + except Exception: - # 正则异常时不影响后续逻辑,继续使用原始文本 + # 正则异常时不影响后续逻辑,继续使用当前文本 pass start = None