From d70a3eda0d6526fd6c852c808ce343d2bf4cabe0 Mon Sep 17 00:00:00 2001 From: Dallas98 <40557804+Dallas98@users.noreply.github.com> Date: Fri, 19 Dec 2025 09:34:02 +0800 Subject: [PATCH] feat(generation_service): add document filtering to remove short documents based on chunk size (#180) * fix(chart): update Helm chart helpers and values for improved configuration * feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths * feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthDataDetail): add delete action for chunks with confirmation prompt * feat(SynthDataDetail): update edit and delete buttons to icon-only format * feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion * feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection * feat(DataSynthesis): refactor data synthesis models and update task handling logic * feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic * feat(DataSynthesis): refactor data synthesis models and update task handling logic * fix(generation_service): ensure processed chunks are incremented regardless of question generation success * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(model_chat): enhance JSON parsing by removing additional thought tags and improving fallback 
logic * feat(generation_service): add document filtering to remove short documents based on chunk size --- .../module/generation/service/generation_service.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/runtime/datamate-python/app/module/generation/service/generation_service.py b/runtime/datamate-python/app/module/generation/service/generation_service.py index a540bfd..22ee3f7 100644 --- a/runtime/datamate-python/app/module/generation/service/generation_service.py +++ b/runtime/datamate-python/app/module/generation/service/generation_service.py @@ -25,6 +25,17 @@ from app.module.shared.util.model_chat import extract_json_substring from app.module.system.service.common_service import chat, get_model_by_id, get_chat_client +def _filter_docs(split_docs, chunk_size): + """ + Filter documents, removing those whose page_content is shorter than 70% of chunk_size + """ + filtered_docs = [] + for doc in split_docs: + if len(doc.page_content) >= chunk_size * 0.7: + filtered_docs.append(doc) + return filtered_docs + + class GenerationService: def __init__(self, db: AsyncSession): self.db = db @@ -464,7 +475,7 @@ class GenerationService: try: docs = load_documents(file_path) split_docs = DocumentSplitter.auto_split(docs, chunk_size, chunk_overlap) - return split_docs + return _filter_docs(split_docs, chunk_size) except Exception as e: logger.error(f"Error loading or splitting file {file_path}: {e}") raise