From d70a3eda0d6526fd6c852c808ce343d2bf4cabe0 Mon Sep 17 00:00:00 2001
From: Dallas98 <40557804+Dallas98@users.noreply.github.com>
Date: Fri, 19 Dec 2025 09:34:02 +0800
Subject: [PATCH] feat(generation_service): add document filtering to remove
 short documents based on chunk size (#180)

* fix(chart): update Helm chart helpers and values for improved configuration

* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths

* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthDataDetail): add delete action for chunks with confirmation prompt

* feat(SynthDataDetail): update edit and delete buttons to icon-only format

* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion

* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* fix(generation_service): ensure processed chunks are incremented regardless of question generation success

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(model_chat): enhance JSON parsing by removing additional thought tags and improving fallback logic

* feat(generation_service): add document filtering to remove short documents based on chunk size
---
 .../module/generation/service/generation_service.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/runtime/datamate-python/app/module/generation/service/generation_service.py b/runtime/datamate-python/app/module/generation/service/generation_service.py
index a540bfd..22ee3f7 100644
--- a/runtime/datamate-python/app/module/generation/service/generation_service.py
+++ b/runtime/datamate-python/app/module/generation/service/generation_service.py
@@ -25,6 +25,17 @@ from app.module.shared.util.model_chat import extract_json_substring
 from app.module.system.service.common_service import chat, get_model_by_id, get_chat_client
 
 
+def _filter_docs(split_docs, chunk_size):
+    """Return the docs whose page_content length is >= 70% of chunk_size.
+    Shorter docs are dropped; order is kept and split_docs is not mutated.
+    """
+    filtered_docs = []
+    for doc in split_docs:
+        if len(doc.page_content) >= chunk_size * 0.7:
+            filtered_docs.append(doc)
+    return filtered_docs
+
+
 class GenerationService:
     def __init__(self, db: AsyncSession):
         self.db = db
@@ -464,7 +475,7 @@ class GenerationService:
         try:
             docs = load_documents(file_path)
             split_docs = DocumentSplitter.auto_split(docs, chunk_size, chunk_overlap)
-            return split_docs
+            return _filter_docs(split_docs, chunk_size)
         except Exception as e:
             logger.error(f"Error loading or splitting file {file_path}: {e}")
             raise