You've already forked DataMate
feat(data-management): 扩展源文档排除功能支持Excel文件类型
- 在后端服务中扩展源文档类型检查，新增对XLS和XLSX文件的支持
- 修改DatasetFileApplicationService中的过滤逻辑，统一处理所有源文档类型
- 新增isSourceDocument和isDerivedFile辅助方法进行文件类型判断
- 更新前端DatasetFileTransfer组件中的注释说明
- 在Python运行时依赖中添加openpyxl和xlrd库以支持Excel文件处理
- 修改标注项目接口中源文档类型的集合定义
- 更新文件操作钩子中的派生文件排除逻辑
This commit is contained in:
@@ -27,6 +27,7 @@ router = APIRouter(
|
||||
)
|
||||
logger = get_logger(__name__)
|
||||
TEXT_DATASET_TYPE = "TEXT"
|
||||
SOURCE_DOCUMENT_FILE_TYPES = {"pdf", "doc", "docx", "xls", "xlsx"}
|
||||
|
||||
@router.get("/{mapping_id}/login")
|
||||
async def login_label_studio(
|
||||
@@ -123,18 +124,14 @@ async def create_mapping(
|
||||
file_records = file_result.scalars().all()
|
||||
snapshot_file_ids: list[str] = []
|
||||
if dataset_type == TEXT_DATASET_TYPE:
|
||||
derived_source_ids = set()
|
||||
snapshot_file_ids = []
|
||||
for file_record in file_records:
|
||||
metadata = getattr(file_record, "dataset_filemetadata", None)
|
||||
if isinstance(metadata, dict):
|
||||
source_id = metadata.get("derived_from_file_id")
|
||||
if source_id:
|
||||
derived_source_ids.add(str(source_id))
|
||||
snapshot_file_ids = [
|
||||
str(file_record.id)
|
||||
for file_record in file_records
|
||||
if file_record.id and str(file_record.id) not in derived_source_ids
|
||||
]
|
||||
if not file_record.id:
|
||||
continue
|
||||
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
||||
if file_type in SOURCE_DOCUMENT_FILE_TYPES:
|
||||
continue
|
||||
snapshot_file_ids.append(str(file_record.id))
|
||||
else:
|
||||
snapshot_file_ids = [
|
||||
str(file_record.id)
|
||||
|
||||
Reference in New Issue
Block a user