feat(annotation): 添加文本项目源文档过滤功能

- 在 EditorProjectInfo 中增加 datasetType 字段
- 移除前端硬编码的源文档扩展名列表
- 添加 isTextProject 判断逻辑
- 实现 prefetch 和 loadTasks 中的源文档排除参数
- 在后端接口中添加 excludeSourceDocuments 参数
- 实现源文档类型的数据库查询过滤逻辑
- 优化任务列表加载性能,避免不必要的源文档加载
This commit is contained in:
2026-01-29 16:29:40 +08:00
parent f5f0add529
commit 0b69845a29
4 changed files with 73 additions and 19 deletions

View File

@@ -55,10 +55,20 @@ async def list_editor_tasks(
project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
page: int = Query(0, ge=0, description="页码(从0开始)"),
size: int = Query(50, ge=1, le=200, description="每页大小"),
exclude_source_documents: Optional[bool] = Query(
None,
alias="excludeSourceDocuments",
description="是否排除已被转换为TXT的源文档文件(PDF/DOC/DOCX,仅文本数据集生效)",
),
db: AsyncSession = Depends(get_db),
):
service = AnnotationEditorService(db)
result = await service.list_tasks(project_id, page=page, size=size)
result = await service.list_tasks(
project_id,
page=page,
size=size,
exclude_source_documents=exclude_source_documents,
)
return StandardResponse(code=200, message="success", data=result)

View File

@@ -19,6 +19,11 @@ class EditorProjectInfo(BaseModel):
project_id: str = Field(..., alias="projectId", description="DataMate 标注项目ID(t_dm_labeling_projects.id)")
dataset_id: str = Field(..., alias="datasetId", description="数据集ID(t_dm_datasets.id)")
dataset_type: Optional[str] = Field(
None,
alias="datasetType",
description="数据集类型(TEXT/IMAGE/AUDIO/VIDEO 等)",
)
template_id: Optional[str] = Field(None, alias="templateId", description="模板ID(t_dm_annotation_templates.id)")
label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML 配置")
supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器")

View File

@@ -18,7 +18,7 @@ import hashlib
import json
import xml.etree.ElementTree as ET
from fastapi import HTTPException
from sqlalchemy import case, func, select
from sqlalchemy import case, func, select, or_
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings
@@ -69,6 +69,8 @@ SUPPORTED_EDITOR_DATASET_TYPES = (
DATASET_TYPE_VIDEO,
)
SEGMENTATION_ENABLED_KEY = "segmentation_enabled"
# File-name suffixes treated as "source documents"; matched as a suffix
# against the lower-cased DatasetFiles.file_name.
SOURCE_DOCUMENT_EXTENSIONS = (".pdf", ".doc", ".docx")
# file_type values treated as "source documents"; matched by equality
# against the lower-cased DatasetFiles.file_type.
SOURCE_DOCUMENT_TYPES = ("pdf", "doc", "docx")
class AnnotationEditorService:
@@ -346,6 +348,14 @@ class AnnotationEditorService:
return ET.tostring(root, encoding="unicode")
@classmethod
def _build_source_document_filter(cls):
    """Build a SQL predicate that matches source-document rows of ``DatasetFiles``.

    A row matches when its lower-cased ``file_type`` is one of
    ``SOURCE_DOCUMENT_TYPES`` or its lower-cased ``file_name`` ends with one
    of ``SOURCE_DOCUMENT_EXTENSIONS``.

    Both columns are wrapped in ``COALESCE(..., '')`` so the predicate always
    evaluates to TRUE or FALSE, never NULL. The predicate is used negated
    (``~filter``) to exclude source documents; under SQL three-valued logic
    a NULL ``file_type`` or ``file_name`` would otherwise turn the negated
    predicate into NULL and silently drop a row that is not a source
    document.

    Returns:
        A SQLAlchemy boolean expression suitable for ``.where(...)``.
    """
    file_type_lower = func.lower(func.coalesce(DatasetFiles.file_type, ""))
    file_name_lower = func.lower(func.coalesce(DatasetFiles.file_name, ""))
    type_condition = file_type_lower.in_(SOURCE_DOCUMENT_TYPES)
    # "." has no special meaning in LIKE, so "%.pdf" is a plain suffix match.
    name_conditions = [
        file_name_lower.like(f"%{ext}") for ext in SOURCE_DOCUMENT_EXTENSIONS
    ]
    return or_(type_condition, *name_conditions)
def _build_task_data(
self,
display_text: str,
@@ -404,18 +414,35 @@ class AnnotationEditorService:
return EditorProjectInfo(
projectId=project.id,
datasetId=project.dataset_id,
datasetType=dataset_type or None,
templateId=project.template_id,
labelConfig=label_config,
supported=supported,
unsupportedReason=unsupported_reason,
)
async def list_tasks(self, project_id: str, page: int = 0, size: int = 50) -> EditorTaskListResponse:
async def list_tasks(
self,
project_id: str,
page: int = 0,
size: int = 50,
exclude_source_documents: Optional[bool] = None,
) -> EditorTaskListResponse:
project = await self._get_project_or_404(project_id)
dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
should_exclude_source_documents = False
if dataset_type == DATASET_TYPE_TEXT:
should_exclude_source_documents = (
exclude_source_documents if exclude_source_documents is not None else True
)
base_conditions = [DatasetFiles.dataset_id == project.dataset_id]
if should_exclude_source_documents:
base_conditions.append(~self._build_source_document_filter())
count_result = await self.db.execute(
select(func.count()).select_from(DatasetFiles).where(
DatasetFiles.dataset_id == project.dataset_id
*base_conditions
)
)
total = int(count_result.scalar() or 0)
@@ -431,7 +458,7 @@ class AnnotationEditorService:
(AnnotationResult.file_id == DatasetFiles.id)
& (AnnotationResult.project_id == project_id),
)
.where(DatasetFiles.dataset_id == project.dataset_id)
.where(*base_conditions)
.order_by(annotated_sort_key.asc(), DatasetFiles.created_at.desc())
.offset(page * size)
.limit(size)