feat(generation_service): add document filtering to remove short documents based on chunk size (#180)
* fix(chart): update Helm chart helpers and values for improved configuration
* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths
* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthDataDetail): add delete action for chunks with confirmation prompt
* feat(SynthDataDetail): update edit and delete buttons to icon-only format
* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion
* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic
* fix(generation_service): ensure processed chunks are incremented regardless of question generation success
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(model_chat): enhance JSON parsing by removing additional thought tags and improving fallback logic
* feat(generation_service): add document filtering to remove short documents based on chunk size
@@ -25,6 +25,17 @@ from app.module.shared.util.model_chat import extract_json_substring
 from app.module.system.service.common_service import chat, get_model_by_id, get_chat_client


+def _filter_docs(split_docs, chunk_size):
+    """
+    Filter documents, removing any shorter than 70% of chunk_size
+    """
+    filtered_docs = []
+    for doc in split_docs:
+        if len(doc.page_content) >= chunk_size * 0.7:
+            filtered_docs.append(doc)
+    return filtered_docs
+
+
 class GenerationService:
     def __init__(self, db: AsyncSession):
         self.db = db
@@ -464,7 +475,7 @@ class GenerationService:
         try:
             docs = load_documents(file_path)
             split_docs = DocumentSplitter.auto_split(docs, chunk_size, chunk_overlap)
-            return split_docs
+            return _filter_docs(split_docs, chunk_size)
         except Exception as e:
             logger.error(f"Error loading or splitting file {file_path}: {e}")
             raise
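For context, the behavior of the new filter can be illustrated with a minimal, self-contained sketch. The Doc class below is a hypothetical stand-in for the real document objects, which the diff only shows as exposing a page_content attribute; it is not part of the DataMate codebase.

# Minimal sketch (not from the commit): demonstrates the 70% cutoff.
from dataclasses import dataclass


@dataclass
class Doc:
    page_content: str  # the only attribute the filter relies on


def _filter_docs(split_docs, chunk_size):
    """Drop documents shorter than 70% of the configured chunk size."""
    return [doc for doc in split_docs if len(doc.page_content) >= chunk_size * 0.7]


# With chunk_size=1000 the cutoff is 700 characters, so only the first
# document survives the filter.
docs = [Doc("x" * 900), Doc("x" * 500)]
assert [len(d.page_content) for d in _filter_docs(docs, 1000)] == [900]

Because the multiplier is fixed at 0.7, the cutoff scales with the task's chunk_size; this appears intended to drop short trailing fragments produced by the splitter before they reach question generation.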