diff --git a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx index 8c37b2c..a03a3c2 100644 --- a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx +++ b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx @@ -13,6 +13,7 @@ import { type EditorProjectInfo = { projectId: string; datasetId: string; + datasetType?: string | null; templateId?: string | null; labelConfig?: string | null; supported: boolean; @@ -87,7 +88,6 @@ type SwitchDecision = "save" | "discard" | "cancel"; const LSF_IFRAME_SRC = "/lsf/lsf.html"; const TASK_PAGE_START = 0; const TASK_PAGE_SIZE = 200; -const SOURCE_DOCUMENT_EXTENSIONS = [".pdf", ".doc", ".docx"]; type NormalizedTaskList = { items: EditorTaskListItem[]; @@ -167,20 +167,12 @@ const mergeTaskItems = (base: EditorTaskListItem[], next: EditorTaskListItem[]) return merged; }; -const isSourceDocumentFile = (item: EditorTaskListItem) => { - const fileName = item.fileName?.toLowerCase() ?? ""; - return SOURCE_DOCUMENT_EXTENSIONS.some((ext) => fileName.endsWith(ext)); -}; - -const filterSourceDocumentTasks = (items: EditorTaskListItem[]) => - items.filter((item) => !isSourceDocumentFile(item)); - const normalizeTaskListResponse = ( response: ApiResponse | null | undefined, fallbackPage: number, ): NormalizedTaskList => { const content = response?.data?.content; - const items = filterSourceDocumentTasks(Array.isArray(content) ? content : []); + const items = Array.isArray(content) ? content : []; const size = response?.data?.size ?? TASK_PAGE_SIZE; const total = response?.data?.totalElements ?? items.length; const totalPages = @@ -237,6 +229,10 @@ export default function LabelStudioTextEditor() { const [segmented, setSegmented] = useState(false); const [segments, setSegments] = useState([]); const [currentSegmentIndex, setCurrentSegmentIndex] = useState(0); + const isTextProject = useMemo( + () => (project?.datasetType || "").toUpperCase() === "TEXT", + [project?.datasetType], + ); const focusIframe = useCallback(() => { const iframe = iframeRef.current; @@ -292,9 +288,13 @@ export default function LabelStudioTextEditor() { for (let page = startPage; page < totalPages; page += 1) { if (prefetchSeqRef.current !== seq) return; try { - const resp = (await listEditorTasksUsingGet(projectId, { + const params = { page, size: TASK_PAGE_SIZE, + ...(isTextProject ? { excludeSourceDocuments: true } : {}), + }; + const resp = (await listEditorTasksUsingGet(projectId, { + ...params, })) as ApiResponse; const normalized = normalizeTaskListResponse(resp, page); setTasks((prev) => mergeTaskItems(prev, normalized.items)); @@ -311,7 +311,7 @@ export default function LabelStudioTextEditor() { } }; void run(); - }, [projectId]); + }, [isTextProject, projectId]); const loadTasks = useCallback(async (options?: { mode?: "reset" | "append"; @@ -334,9 +334,13 @@ export default function LabelStudioTextEditor() { } try { const nextPage = mode === "append" ? taskPage + 1 : TASK_PAGE_START; - const resp = (await listEditorTasksUsingGet(projectId, { + const params = { page: nextPage, size: TASK_PAGE_SIZE, + ...(isTextProject ? { excludeSourceDocuments: true } : {}), + }; + const resp = (await listEditorTasksUsingGet(projectId, { + ...params, })) as ApiResponse; const normalized = normalizeTaskListResponse(resp, nextPage); if (mode === "append") { @@ -366,7 +370,15 @@ export default function LabelStudioTextEditor() { setLoadingTasks(false); } } - }, [message, projectId, startPrefetchTasks, taskPage, taskTotalPages, updateTaskSelection]); + }, [ + isTextProject, + message, + projectId, + startPrefetchTasks, + taskPage, + taskTotalPages, + updateTaskSelection, + ]); const initEditorForFile = useCallback(async (fileId: string, segmentIdx?: number) => { if (!project?.supported) return; diff --git a/runtime/datamate-python/app/module/annotation/interface/editor.py b/runtime/datamate-python/app/module/annotation/interface/editor.py index 316ce6b..cb521e9 100644 --- a/runtime/datamate-python/app/module/annotation/interface/editor.py +++ b/runtime/datamate-python/app/module/annotation/interface/editor.py @@ -55,10 +55,20 @@ async def list_editor_tasks( project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"), page: int = Query(0, ge=0, description="页码(从0开始)"), size: int = Query(50, ge=1, le=200, description="每页大小"), + exclude_source_documents: Optional[bool] = Query( + None, + alias="excludeSourceDocuments", + description="是否排除已被转换为TXT的源文档文件(PDF/DOC/DOCX,仅文本数据集生效)", + ), db: AsyncSession = Depends(get_db), ): service = AnnotationEditorService(db) - result = await service.list_tasks(project_id, page=page, size=size) + result = await service.list_tasks( + project_id, + page=page, + size=size, + exclude_source_documents=exclude_source_documents, + ) return StandardResponse(code=200, message="success", data=result) diff --git a/runtime/datamate-python/app/module/annotation/schema/editor.py b/runtime/datamate-python/app/module/annotation/schema/editor.py index cd7bf1d..99c704e 100644 --- a/runtime/datamate-python/app/module/annotation/schema/editor.py +++ b/runtime/datamate-python/app/module/annotation/schema/editor.py @@ -19,6 +19,11 @@ class EditorProjectInfo(BaseModel): project_id: str = Field(..., alias="projectId", description="DataMate 标注项目ID(t_dm_labeling_projects.id)") dataset_id: str = Field(..., alias="datasetId", description="数据集ID(t_dm_datasets.id)") + dataset_type: Optional[str] = Field( + None, + alias="datasetType", + description="数据集类型(TEXT/IMAGE/AUDIO/VIDEO 等)", + ) template_id: Optional[str] = Field(None, alias="templateId", description="模板ID(t_dm_annotation_templates.id)") label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML 配置") supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器") diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py index ea93627..5e26380 100644 --- a/runtime/datamate-python/app/module/annotation/service/editor.py +++ b/runtime/datamate-python/app/module/annotation/service/editor.py @@ -18,7 +18,7 @@ import hashlib import json import xml.etree.ElementTree as ET from fastapi import HTTPException -from sqlalchemy import case, func, select +from sqlalchemy import case, func, select, or_ from sqlalchemy.ext.asyncio import AsyncSession from app.core.config import settings @@ -69,6 +69,8 @@ SUPPORTED_EDITOR_DATASET_TYPES = ( DATASET_TYPE_VIDEO, ) SEGMENTATION_ENABLED_KEY = "segmentation_enabled" +SOURCE_DOCUMENT_EXTENSIONS = (".pdf", ".doc", ".docx") +SOURCE_DOCUMENT_TYPES = ("pdf", "doc", "docx") class AnnotationEditorService: @@ -346,6 +348,14 @@ class AnnotationEditorService: return ET.tostring(root, encoding="unicode") + @classmethod + def _build_source_document_filter(cls): + file_type_lower = func.lower(DatasetFiles.file_type) + file_name_lower = func.lower(DatasetFiles.file_name) + type_condition = file_type_lower.in_(SOURCE_DOCUMENT_TYPES) + name_conditions = [file_name_lower.like(f"%{ext}") for ext in SOURCE_DOCUMENT_EXTENSIONS] + return or_(type_condition, *name_conditions) + def _build_task_data( self, display_text: str, @@ -404,18 +414,35 @@ class AnnotationEditorService: return EditorProjectInfo( projectId=project.id, datasetId=project.dataset_id, + datasetType=dataset_type or None, templateId=project.template_id, labelConfig=label_config, supported=supported, unsupportedReason=unsupported_reason, ) - async def list_tasks(self, project_id: str, page: int = 0, size: int = 50) -> EditorTaskListResponse: + async def list_tasks( + self, + project_id: str, + page: int = 0, + size: int = 50, + exclude_source_documents: Optional[bool] = None, + ) -> EditorTaskListResponse: project = await self._get_project_or_404(project_id) + dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id)) + should_exclude_source_documents = False + if dataset_type == DATASET_TYPE_TEXT: + should_exclude_source_documents = ( + exclude_source_documents if exclude_source_documents is not None else True + ) + + base_conditions = [DatasetFiles.dataset_id == project.dataset_id] + if should_exclude_source_documents: + base_conditions.append(~self._build_source_document_filter()) count_result = await self.db.execute( select(func.count()).select_from(DatasetFiles).where( - DatasetFiles.dataset_id == project.dataset_id + *base_conditions ) ) total = int(count_result.scalar() or 0) @@ -431,7 +458,7 @@ class AnnotationEditorService: (AnnotationResult.file_id == DatasetFiles.id) & (AnnotationResult.project_id == project_id), ) - .where(DatasetFiles.dataset_id == project.dataset_id) + .where(*base_conditions) .order_by(annotated_sort_key.asc(), DatasetFiles.created_at.desc()) .offset(page * size) .limit(size)