feat(generation_service): add document filtering to remove short documents based on chunk size (#180)

* fix(chart): update Helm chart helpers and values for improved configuration
* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths
* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthDataDetail): add delete action for chunks with confirmation prompt
* feat(SynthDataDetail): update edit and delete buttons to icon-only format
* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion
* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* fix(generation_service): ensure processed chunks are incremented regardless of question generation success
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(model_chat): enhance JSON parsing by removing additional thought tags and improving fallback logic
* feat(generation_service): add document filtering to remove short documents based on chunk size
2025-12-19 09:34:02 +08:00
parent be875086db
commit d70a3eda0d
1 changed files with 12 additions and 1 deletions
--- a/runtime/datamate-python/app/module/generation/service/generation_service.py
+++ b/runtime/datamate-python/app/module/generation/service/generation_service.py
@@ -25,6 +25,17 @@ from app.module.shared.util.model_chat import extract_json_substring
 from app.module.system.service.common_service import chat, get_model_by_id, get_chat_client


+def _filter_docs(split_docs, chunk_size):
+    """
+    Filter documents, dropping those whose page_content is shorter than 70% of chunk_size.
+    """
+    filtered_docs = []
+    for doc in split_docs:
+        if len(doc.page_content) >= chunk_size * 0.7:
+            filtered_docs.append(doc)
+    return filtered_docs
+
+
 class GenerationService:
    def __init__(self, db: AsyncSession):
        self.db = db
@@ -464,7 +475,7 @@ class GenerationService:
        try:
            docs = load_documents(file_path)
            split_docs = DocumentSplitter.auto_split(docs, chunk_size, chunk_overlap)
-            return split_docs
+            return _filter_docs(split_docs, chunk_size)
        except Exception as e:
            logger.error(f"Error loading or splitting file {file_path}: {e}")
            raise