feat(data-management): 扩展源文档排除功能支持Excel文件类型

- 在后端服务中扩展源文档类型检查，新增对XLS和XLSX文件的支持
- 修改DatasetFileApplicationService中的过滤逻辑，统一处理所有源文档类型
- 新增isSourceDocument和isDerivedFile辅助方法进行文件类型判断
- 更新前端DatasetFileTransfer组件中的注释说明
- 在Python运行时依赖中添加openpyxl和xlrd库以支持Excel文件处理
- 修改标注项目接口中源文档类型的集合定义
- 更新文件操作钩子中的派生文件排除逻辑
2026-01-31 11:30:55 +08:00
parent 6c7ea0c25e
commit b5d7c66240
8 changed files with 210 additions and 119 deletions
--- a/runtime/datamate-python/app/module/annotation/interface/project.py
+++ b/runtime/datamate-python/app/module/annotation/interface/project.py
@@ -27,6 +27,7 @@ router = APIRouter(
 )
 logger = get_logger(__name__)
 TEXT_DATASET_TYPE = "TEXT"
+SOURCE_DOCUMENT_FILE_TYPES = {"pdf", "doc", "docx", "xls", "xlsx"}

@router.get("/{mapping_id}/login")
 async def login_label_studio(
@@ -123,18 +124,14 @@ async def create_mapping(
        file_records = file_result.scalars().all()
        snapshot_file_ids: list[str] = []
        if dataset_type == TEXT_DATASET_TYPE:
-            derived_source_ids = set()
+            snapshot_file_ids = []
            for file_record in file_records:
-                metadata = getattr(file_record, "dataset_filemetadata", None)
-                if isinstance(metadata, dict):
-                    source_id = metadata.get("derived_from_file_id")
-                    if source_id:
-                        derived_source_ids.add(str(source_id))
-            snapshot_file_ids = [
-                str(file_record.id)
-                for file_record in file_records
-                if file_record.id and str(file_record.id) not in derived_source_ids
-            ]
+                if not file_record.id:
+                    continue
+                file_type = str(getattr(file_record, "file_type", "") or "").lower()
+                if file_type in SOURCE_DOCUMENT_FILE_TYPES:
+                    continue
+                snapshot_file_ids.append(str(file_record.id))
        else:
            snapshot_file_ids = [
                str(file_record.id)