feat(annotation): 添加文本项目源文档过滤功能

- 在 EditorProjectInfo 中增加 datasetType 字段
- 移除前端硬编码的源文档扩展名列表
- 添加 isTextProject 判断逻辑
- 实现 prefetch 和 loadTasks 中的源文档排除参数
- 在后端接口中添加 excludeSourceDocuments 参数
- 实现源文档类型的数据库查询过滤逻辑
- 优化任务列表加载性能,避免不必要的源文档加载
This commit is contained in:
2026-01-29 16:29:40 +08:00
parent f5f0add529
commit 0b69845a29
4 changed files with 73 additions and 19 deletions

View File

@@ -13,6 +13,7 @@ import {
type EditorProjectInfo = { type EditorProjectInfo = {
projectId: string; projectId: string;
datasetId: string; datasetId: string;
datasetType?: string | null;
templateId?: string | null; templateId?: string | null;
labelConfig?: string | null; labelConfig?: string | null;
supported: boolean; supported: boolean;
@@ -87,7 +88,6 @@ type SwitchDecision = "save" | "discard" | "cancel";
const LSF_IFRAME_SRC = "/lsf/lsf.html"; const LSF_IFRAME_SRC = "/lsf/lsf.html";
const TASK_PAGE_START = 0; const TASK_PAGE_START = 0;
const TASK_PAGE_SIZE = 200; const TASK_PAGE_SIZE = 200;
const SOURCE_DOCUMENT_EXTENSIONS = [".pdf", ".doc", ".docx"];
type NormalizedTaskList = { type NormalizedTaskList = {
items: EditorTaskListItem[]; items: EditorTaskListItem[];
@@ -167,20 +167,12 @@ const mergeTaskItems = (base: EditorTaskListItem[], next: EditorTaskListItem[])
return merged; return merged;
}; };
/**
 * Reports whether a task item refers to a source document, decided by a
 * case-insensitive suffix match of its file name against
 * SOURCE_DOCUMENT_EXTENSIONS. A missing file name is compared as "".
 */
const isSourceDocumentFile = (item: EditorTaskListItem) => {
  const loweredName = (item.fileName ?? "").toLowerCase();
  for (const extension of SOURCE_DOCUMENT_EXTENSIONS) {
    if (loweredName.endsWith(extension)) {
      return true;
    }
  }
  return false;
};

/** Returns a new array holding only the items that are not source documents. */
const filterSourceDocumentTasks = (items: EditorTaskListItem[]) => {
  const keepsItem = (candidate: EditorTaskListItem) =>
    isSourceDocumentFile(candidate) === false;
  return items.filter(keepsItem);
};
const normalizeTaskListResponse = ( const normalizeTaskListResponse = (
response: ApiResponse<EditorTaskListResponse> | null | undefined, response: ApiResponse<EditorTaskListResponse> | null | undefined,
fallbackPage: number, fallbackPage: number,
): NormalizedTaskList => { ): NormalizedTaskList => {
const content = response?.data?.content; const content = response?.data?.content;
const items = filterSourceDocumentTasks(Array.isArray(content) ? content : []); const items = Array.isArray(content) ? content : [];
const size = response?.data?.size ?? TASK_PAGE_SIZE; const size = response?.data?.size ?? TASK_PAGE_SIZE;
const total = response?.data?.totalElements ?? items.length; const total = response?.data?.totalElements ?? items.length;
const totalPages = const totalPages =
@@ -237,6 +229,10 @@ export default function LabelStudioTextEditor() {
const [segmented, setSegmented] = useState(false); const [segmented, setSegmented] = useState(false);
const [segments, setSegments] = useState<SegmentInfo[]>([]); const [segments, setSegments] = useState<SegmentInfo[]>([]);
const [currentSegmentIndex, setCurrentSegmentIndex] = useState(0); const [currentSegmentIndex, setCurrentSegmentIndex] = useState(0);
const isTextProject = useMemo(
() => (project?.datasetType || "").toUpperCase() === "TEXT",
[project?.datasetType],
);
const focusIframe = useCallback(() => { const focusIframe = useCallback(() => {
const iframe = iframeRef.current; const iframe = iframeRef.current;
@@ -292,9 +288,13 @@ export default function LabelStudioTextEditor() {
for (let page = startPage; page < totalPages; page += 1) { for (let page = startPage; page < totalPages; page += 1) {
if (prefetchSeqRef.current !== seq) return; if (prefetchSeqRef.current !== seq) return;
try { try {
const resp = (await listEditorTasksUsingGet(projectId, { const params = {
page, page,
size: TASK_PAGE_SIZE, size: TASK_PAGE_SIZE,
...(isTextProject ? { excludeSourceDocuments: true } : {}),
};
const resp = (await listEditorTasksUsingGet(projectId, {
...params,
})) as ApiResponse<EditorTaskListResponse>; })) as ApiResponse<EditorTaskListResponse>;
const normalized = normalizeTaskListResponse(resp, page); const normalized = normalizeTaskListResponse(resp, page);
setTasks((prev) => mergeTaskItems(prev, normalized.items)); setTasks((prev) => mergeTaskItems(prev, normalized.items));
@@ -311,7 +311,7 @@ export default function LabelStudioTextEditor() {
} }
}; };
void run(); void run();
}, [projectId]); }, [isTextProject, projectId]);
const loadTasks = useCallback(async (options?: { const loadTasks = useCallback(async (options?: {
mode?: "reset" | "append"; mode?: "reset" | "append";
@@ -334,9 +334,13 @@ export default function LabelStudioTextEditor() {
} }
try { try {
const nextPage = mode === "append" ? taskPage + 1 : TASK_PAGE_START; const nextPage = mode === "append" ? taskPage + 1 : TASK_PAGE_START;
const resp = (await listEditorTasksUsingGet(projectId, { const params = {
page: nextPage, page: nextPage,
size: TASK_PAGE_SIZE, size: TASK_PAGE_SIZE,
...(isTextProject ? { excludeSourceDocuments: true } : {}),
};
const resp = (await listEditorTasksUsingGet(projectId, {
...params,
})) as ApiResponse<EditorTaskListResponse>; })) as ApiResponse<EditorTaskListResponse>;
const normalized = normalizeTaskListResponse(resp, nextPage); const normalized = normalizeTaskListResponse(resp, nextPage);
if (mode === "append") { if (mode === "append") {
@@ -366,7 +370,15 @@ export default function LabelStudioTextEditor() {
setLoadingTasks(false); setLoadingTasks(false);
} }
} }
}, [message, projectId, startPrefetchTasks, taskPage, taskTotalPages, updateTaskSelection]); }, [
isTextProject,
message,
projectId,
startPrefetchTasks,
taskPage,
taskTotalPages,
updateTaskSelection,
]);
const initEditorForFile = useCallback(async (fileId: string, segmentIdx?: number) => { const initEditorForFile = useCallback(async (fileId: string, segmentIdx?: number) => {
if (!project?.supported) return; if (!project?.supported) return;

View File

@@ -55,10 +55,20 @@ async def list_editor_tasks(
project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"), project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
page: int = Query(0, ge=0, description="页码(从0开始)"), page: int = Query(0, ge=0, description="页码(从0开始)"),
size: int = Query(50, ge=1, le=200, description="每页大小"), size: int = Query(50, ge=1, le=200, description="每页大小"),
exclude_source_documents: Optional[bool] = Query(
None,
alias="excludeSourceDocuments",
description="是否排除已被转换为TXT的源文档文件(PDF/DOC/DOCX,仅文本数据集生效)",
),
db: AsyncSession = Depends(get_db), db: AsyncSession = Depends(get_db),
): ):
service = AnnotationEditorService(db) service = AnnotationEditorService(db)
result = await service.list_tasks(project_id, page=page, size=size) result = await service.list_tasks(
project_id,
page=page,
size=size,
exclude_source_documents=exclude_source_documents,
)
return StandardResponse(code=200, message="success", data=result) return StandardResponse(code=200, message="success", data=result)

View File

@@ -19,6 +19,11 @@ class EditorProjectInfo(BaseModel):
project_id: str = Field(..., alias="projectId", description="DataMate 标注项目ID(t_dm_labeling_projects.id)") project_id: str = Field(..., alias="projectId", description="DataMate 标注项目ID(t_dm_labeling_projects.id)")
dataset_id: str = Field(..., alias="datasetId", description="数据集ID(t_dm_datasets.id)") dataset_id: str = Field(..., alias="datasetId", description="数据集ID(t_dm_datasets.id)")
dataset_type: Optional[str] = Field(
None,
alias="datasetType",
description="数据集类型(TEXT/IMAGE/AUDIO/VIDEO 等)",
)
template_id: Optional[str] = Field(None, alias="templateId", description="模板ID(t_dm_annotation_templates.id)") template_id: Optional[str] = Field(None, alias="templateId", description="模板ID(t_dm_annotation_templates.id)")
label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML 配置") label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML 配置")
supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器") supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器")

View File

@@ -18,7 +18,7 @@ import hashlib
import json import json
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from fastapi import HTTPException from fastapi import HTTPException
from sqlalchemy import case, func, select from sqlalchemy import case, func, select, or_
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings from app.core.config import settings
@@ -69,6 +69,8 @@ SUPPORTED_EDITOR_DATASET_TYPES = (
DATASET_TYPE_VIDEO, DATASET_TYPE_VIDEO,
) )
SEGMENTATION_ENABLED_KEY = "segmentation_enabled" SEGMENTATION_ENABLED_KEY = "segmentation_enabled"
SOURCE_DOCUMENT_EXTENSIONS = (".pdf", ".doc", ".docx")
SOURCE_DOCUMENT_TYPES = ("pdf", "doc", "docx")
class AnnotationEditorService: class AnnotationEditorService:
@@ -346,6 +348,14 @@ class AnnotationEditorService:
return ET.tostring(root, encoding="unicode") return ET.tostring(root, encoding="unicode")
@classmethod
def _build_source_document_filter(cls):
    """Build a SQL predicate that matches source-document rows in DatasetFiles.

    A row matches when its lower-cased ``file_type`` is one of
    ``SOURCE_DOCUMENT_TYPES`` or its lower-cased ``file_name`` ends with one of
    ``SOURCE_DOCUMENT_EXTENSIONS``.

    Returns:
        A SQLAlchemy boolean expression (an ``or_`` of the type check and the
        per-extension ``LIKE`` checks).
    """
    # Callers negate this predicate to exclude source documents. Under SQL
    # three-valued logic, a NULL column would make the whole OR evaluate to
    # NULL, and NOT NULL is still NULL, so the row would be dropped from the
    # result. Coalescing to '' keeps the predicate strictly TRUE/FALSE.
    # NOTE(review): assumes file_type / file_name may be NULL - confirm against
    # the DatasetFiles model; the coalesce is harmless if they are NOT NULL.
    file_type_lower = func.lower(func.coalesce(DatasetFiles.file_type, ""))
    file_name_lower = func.lower(func.coalesce(DatasetFiles.file_name, ""))
    type_condition = file_type_lower.in_(SOURCE_DOCUMENT_TYPES)
    # '.' is not a LIKE wildcard, so "%.pdf" is a plain suffix match.
    name_conditions = [file_name_lower.like(f"%{ext}") for ext in SOURCE_DOCUMENT_EXTENSIONS]
    return or_(type_condition, *name_conditions)
def _build_task_data( def _build_task_data(
self, self,
display_text: str, display_text: str,
@@ -404,18 +414,35 @@ class AnnotationEditorService:
return EditorProjectInfo( return EditorProjectInfo(
projectId=project.id, projectId=project.id,
datasetId=project.dataset_id, datasetId=project.dataset_id,
datasetType=dataset_type or None,
templateId=project.template_id, templateId=project.template_id,
labelConfig=label_config, labelConfig=label_config,
supported=supported, supported=supported,
unsupportedReason=unsupported_reason, unsupportedReason=unsupported_reason,
) )
async def list_tasks(self, project_id: str, page: int = 0, size: int = 50) -> EditorTaskListResponse: async def list_tasks(
self,
project_id: str,
page: int = 0,
size: int = 50,
exclude_source_documents: Optional[bool] = None,
) -> EditorTaskListResponse:
project = await self._get_project_or_404(project_id) project = await self._get_project_or_404(project_id)
dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
should_exclude_source_documents = False
if dataset_type == DATASET_TYPE_TEXT:
should_exclude_source_documents = (
exclude_source_documents if exclude_source_documents is not None else True
)
base_conditions = [DatasetFiles.dataset_id == project.dataset_id]
if should_exclude_source_documents:
base_conditions.append(~self._build_source_document_filter())
count_result = await self.db.execute( count_result = await self.db.execute(
select(func.count()).select_from(DatasetFiles).where( select(func.count()).select_from(DatasetFiles).where(
DatasetFiles.dataset_id == project.dataset_id *base_conditions
) )
) )
total = int(count_result.scalar() or 0) total = int(count_result.scalar() or 0)
@@ -431,7 +458,7 @@ class AnnotationEditorService:
(AnnotationResult.file_id == DatasetFiles.id) (AnnotationResult.file_id == DatasetFiles.id)
& (AnnotationResult.project_id == project_id), & (AnnotationResult.project_id == project_id),
) )
.where(DatasetFiles.dataset_id == project.dataset_id) .where(*base_conditions)
.order_by(annotated_sort_key.asc(), DatasetFiles.created_at.desc()) .order_by(annotated_sort_key.asc(), DatasetFiles.created_at.desc())
.offset(page * size) .offset(page * size)
.limit(size) .limit(size)