You've already forked DataMate
feat(annotation): 添加文本项目源文档过滤功能
- 在 EditorProjectInfo 中增加 datasetType 字段
- 移除前端硬编码的源文档扩展名列表
- 添加 isTextProject 判断逻辑
- 实现 prefetch 和 loadTasks 中的源文档排除参数
- 在后端接口中添加 excludeSourceDocuments 参数
- 实现源文档类型的数据库查询过滤逻辑
- 优化任务列表加载性能，避免不必要的源文档加载
This commit is contained in:
@@ -13,6 +13,7 @@ import {
|
|||||||
type EditorProjectInfo = {
|
type EditorProjectInfo = {
|
||||||
projectId: string;
|
projectId: string;
|
||||||
datasetId: string;
|
datasetId: string;
|
||||||
|
datasetType?: string | null;
|
||||||
templateId?: string | null;
|
templateId?: string | null;
|
||||||
labelConfig?: string | null;
|
labelConfig?: string | null;
|
||||||
supported: boolean;
|
supported: boolean;
|
||||||
@@ -87,7 +88,6 @@ type SwitchDecision = "save" | "discard" | "cancel";
|
|||||||
const LSF_IFRAME_SRC = "/lsf/lsf.html";
|
const LSF_IFRAME_SRC = "/lsf/lsf.html";
|
||||||
const TASK_PAGE_START = 0;
|
const TASK_PAGE_START = 0;
|
||||||
const TASK_PAGE_SIZE = 200;
|
const TASK_PAGE_SIZE = 200;
|
||||||
const SOURCE_DOCUMENT_EXTENSIONS = [".pdf", ".doc", ".docx"];
|
|
||||||
|
|
||||||
type NormalizedTaskList = {
|
type NormalizedTaskList = {
|
||||||
items: EditorTaskListItem[];
|
items: EditorTaskListItem[];
|
||||||
@@ -167,20 +167,12 @@ const mergeTaskItems = (base: EditorTaskListItem[], next: EditorTaskListItem[])
|
|||||||
return merged;
|
return merged;
|
||||||
};
|
};
|
||||||
|
|
||||||
const isSourceDocumentFile = (item: EditorTaskListItem) => {
|
|
||||||
const fileName = item.fileName?.toLowerCase() ?? "";
|
|
||||||
return SOURCE_DOCUMENT_EXTENSIONS.some((ext) => fileName.endsWith(ext));
|
|
||||||
};
|
|
||||||
|
|
||||||
const filterSourceDocumentTasks = (items: EditorTaskListItem[]) =>
|
|
||||||
items.filter((item) => !isSourceDocumentFile(item));
|
|
||||||
|
|
||||||
const normalizeTaskListResponse = (
|
const normalizeTaskListResponse = (
|
||||||
response: ApiResponse<EditorTaskListResponse> | null | undefined,
|
response: ApiResponse<EditorTaskListResponse> | null | undefined,
|
||||||
fallbackPage: number,
|
fallbackPage: number,
|
||||||
): NormalizedTaskList => {
|
): NormalizedTaskList => {
|
||||||
const content = response?.data?.content;
|
const content = response?.data?.content;
|
||||||
const items = filterSourceDocumentTasks(Array.isArray(content) ? content : []);
|
const items = Array.isArray(content) ? content : [];
|
||||||
const size = response?.data?.size ?? TASK_PAGE_SIZE;
|
const size = response?.data?.size ?? TASK_PAGE_SIZE;
|
||||||
const total = response?.data?.totalElements ?? items.length;
|
const total = response?.data?.totalElements ?? items.length;
|
||||||
const totalPages =
|
const totalPages =
|
||||||
@@ -237,6 +229,10 @@ export default function LabelStudioTextEditor() {
|
|||||||
const [segmented, setSegmented] = useState(false);
|
const [segmented, setSegmented] = useState(false);
|
||||||
const [segments, setSegments] = useState<SegmentInfo[]>([]);
|
const [segments, setSegments] = useState<SegmentInfo[]>([]);
|
||||||
const [currentSegmentIndex, setCurrentSegmentIndex] = useState(0);
|
const [currentSegmentIndex, setCurrentSegmentIndex] = useState(0);
|
||||||
|
const isTextProject = useMemo(
|
||||||
|
() => (project?.datasetType || "").toUpperCase() === "TEXT",
|
||||||
|
[project?.datasetType],
|
||||||
|
);
|
||||||
|
|
||||||
const focusIframe = useCallback(() => {
|
const focusIframe = useCallback(() => {
|
||||||
const iframe = iframeRef.current;
|
const iframe = iframeRef.current;
|
||||||
@@ -292,9 +288,13 @@ export default function LabelStudioTextEditor() {
|
|||||||
for (let page = startPage; page < totalPages; page += 1) {
|
for (let page = startPage; page < totalPages; page += 1) {
|
||||||
if (prefetchSeqRef.current !== seq) return;
|
if (prefetchSeqRef.current !== seq) return;
|
||||||
try {
|
try {
|
||||||
const resp = (await listEditorTasksUsingGet(projectId, {
|
const params = {
|
||||||
page,
|
page,
|
||||||
size: TASK_PAGE_SIZE,
|
size: TASK_PAGE_SIZE,
|
||||||
|
...(isTextProject ? { excludeSourceDocuments: true } : {}),
|
||||||
|
};
|
||||||
|
const resp = (await listEditorTasksUsingGet(projectId, {
|
||||||
|
...params,
|
||||||
})) as ApiResponse<EditorTaskListResponse>;
|
})) as ApiResponse<EditorTaskListResponse>;
|
||||||
const normalized = normalizeTaskListResponse(resp, page);
|
const normalized = normalizeTaskListResponse(resp, page);
|
||||||
setTasks((prev) => mergeTaskItems(prev, normalized.items));
|
setTasks((prev) => mergeTaskItems(prev, normalized.items));
|
||||||
@@ -311,7 +311,7 @@ export default function LabelStudioTextEditor() {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
void run();
|
void run();
|
||||||
}, [projectId]);
|
}, [isTextProject, projectId]);
|
||||||
|
|
||||||
const loadTasks = useCallback(async (options?: {
|
const loadTasks = useCallback(async (options?: {
|
||||||
mode?: "reset" | "append";
|
mode?: "reset" | "append";
|
||||||
@@ -334,9 +334,13 @@ export default function LabelStudioTextEditor() {
|
|||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
const nextPage = mode === "append" ? taskPage + 1 : TASK_PAGE_START;
|
const nextPage = mode === "append" ? taskPage + 1 : TASK_PAGE_START;
|
||||||
const resp = (await listEditorTasksUsingGet(projectId, {
|
const params = {
|
||||||
page: nextPage,
|
page: nextPage,
|
||||||
size: TASK_PAGE_SIZE,
|
size: TASK_PAGE_SIZE,
|
||||||
|
...(isTextProject ? { excludeSourceDocuments: true } : {}),
|
||||||
|
};
|
||||||
|
const resp = (await listEditorTasksUsingGet(projectId, {
|
||||||
|
...params,
|
||||||
})) as ApiResponse<EditorTaskListResponse>;
|
})) as ApiResponse<EditorTaskListResponse>;
|
||||||
const normalized = normalizeTaskListResponse(resp, nextPage);
|
const normalized = normalizeTaskListResponse(resp, nextPage);
|
||||||
if (mode === "append") {
|
if (mode === "append") {
|
||||||
@@ -366,7 +370,15 @@ export default function LabelStudioTextEditor() {
|
|||||||
setLoadingTasks(false);
|
setLoadingTasks(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}, [message, projectId, startPrefetchTasks, taskPage, taskTotalPages, updateTaskSelection]);
|
}, [
|
||||||
|
isTextProject,
|
||||||
|
message,
|
||||||
|
projectId,
|
||||||
|
startPrefetchTasks,
|
||||||
|
taskPage,
|
||||||
|
taskTotalPages,
|
||||||
|
updateTaskSelection,
|
||||||
|
]);
|
||||||
|
|
||||||
const initEditorForFile = useCallback(async (fileId: string, segmentIdx?: number) => {
|
const initEditorForFile = useCallback(async (fileId: string, segmentIdx?: number) => {
|
||||||
if (!project?.supported) return;
|
if (!project?.supported) return;
|
||||||
|
|||||||
@@ -55,10 +55,20 @@ async def list_editor_tasks(
|
|||||||
project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
|
project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
|
||||||
page: int = Query(0, ge=0, description="页码(从0开始)"),
|
page: int = Query(0, ge=0, description="页码(从0开始)"),
|
||||||
size: int = Query(50, ge=1, le=200, description="每页大小"),
|
size: int = Query(50, ge=1, le=200, description="每页大小"),
|
||||||
|
exclude_source_documents: Optional[bool] = Query(
|
||||||
|
None,
|
||||||
|
alias="excludeSourceDocuments",
|
||||||
|
description="是否排除已被转换为TXT的源文档文件(PDF/DOC/DOCX,仅文本数据集生效)",
|
||||||
|
),
|
||||||
db: AsyncSession = Depends(get_db),
|
db: AsyncSession = Depends(get_db),
|
||||||
):
|
):
|
||||||
service = AnnotationEditorService(db)
|
service = AnnotationEditorService(db)
|
||||||
result = await service.list_tasks(project_id, page=page, size=size)
|
result = await service.list_tasks(
|
||||||
|
project_id,
|
||||||
|
page=page,
|
||||||
|
size=size,
|
||||||
|
exclude_source_documents=exclude_source_documents,
|
||||||
|
)
|
||||||
return StandardResponse(code=200, message="success", data=result)
|
return StandardResponse(code=200, message="success", data=result)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,11 @@ class EditorProjectInfo(BaseModel):
|
|||||||
|
|
||||||
project_id: str = Field(..., alias="projectId", description="DataMate 标注项目ID(t_dm_labeling_projects.id)")
|
project_id: str = Field(..., alias="projectId", description="DataMate 标注项目ID(t_dm_labeling_projects.id)")
|
||||||
dataset_id: str = Field(..., alias="datasetId", description="数据集ID(t_dm_datasets.id)")
|
dataset_id: str = Field(..., alias="datasetId", description="数据集ID(t_dm_datasets.id)")
|
||||||
|
dataset_type: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
alias="datasetType",
|
||||||
|
description="数据集类型(TEXT/IMAGE/AUDIO/VIDEO 等)",
|
||||||
|
)
|
||||||
template_id: Optional[str] = Field(None, alias="templateId", description="模板ID(t_dm_annotation_templates.id)")
|
template_id: Optional[str] = Field(None, alias="templateId", description="模板ID(t_dm_annotation_templates.id)")
|
||||||
label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML 配置")
|
label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML 配置")
|
||||||
supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器")
|
supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器")
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ import hashlib
|
|||||||
import json
|
import json
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
from sqlalchemy import case, func, select
|
from sqlalchemy import case, func, select, or_
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.core.config import settings
|
from app.core.config import settings
|
||||||
@@ -69,6 +69,8 @@ SUPPORTED_EDITOR_DATASET_TYPES = (
|
|||||||
DATASET_TYPE_VIDEO,
|
DATASET_TYPE_VIDEO,
|
||||||
)
|
)
|
||||||
SEGMENTATION_ENABLED_KEY = "segmentation_enabled"
|
SEGMENTATION_ENABLED_KEY = "segmentation_enabled"
|
||||||
|
SOURCE_DOCUMENT_EXTENSIONS = (".pdf", ".doc", ".docx")
|
||||||
|
SOURCE_DOCUMENT_TYPES = ("pdf", "doc", "docx")
|
||||||
|
|
||||||
|
|
||||||
class AnnotationEditorService:
|
class AnnotationEditorService:
|
||||||
@@ -346,6 +348,14 @@ class AnnotationEditorService:
|
|||||||
|
|
||||||
return ET.tostring(root, encoding="unicode")
|
return ET.tostring(root, encoding="unicode")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _build_source_document_filter(cls):
|
||||||
|
file_type_lower = func.lower(DatasetFiles.file_type)
|
||||||
|
file_name_lower = func.lower(DatasetFiles.file_name)
|
||||||
|
type_condition = file_type_lower.in_(SOURCE_DOCUMENT_TYPES)
|
||||||
|
name_conditions = [file_name_lower.like(f"%{ext}") for ext in SOURCE_DOCUMENT_EXTENSIONS]
|
||||||
|
return or_(type_condition, *name_conditions)
|
||||||
|
|
||||||
def _build_task_data(
|
def _build_task_data(
|
||||||
self,
|
self,
|
||||||
display_text: str,
|
display_text: str,
|
||||||
@@ -404,18 +414,35 @@ class AnnotationEditorService:
|
|||||||
return EditorProjectInfo(
|
return EditorProjectInfo(
|
||||||
projectId=project.id,
|
projectId=project.id,
|
||||||
datasetId=project.dataset_id,
|
datasetId=project.dataset_id,
|
||||||
|
datasetType=dataset_type or None,
|
||||||
templateId=project.template_id,
|
templateId=project.template_id,
|
||||||
labelConfig=label_config,
|
labelConfig=label_config,
|
||||||
supported=supported,
|
supported=supported,
|
||||||
unsupportedReason=unsupported_reason,
|
unsupportedReason=unsupported_reason,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def list_tasks(self, project_id: str, page: int = 0, size: int = 50) -> EditorTaskListResponse:
|
async def list_tasks(
|
||||||
|
self,
|
||||||
|
project_id: str,
|
||||||
|
page: int = 0,
|
||||||
|
size: int = 50,
|
||||||
|
exclude_source_documents: Optional[bool] = None,
|
||||||
|
) -> EditorTaskListResponse:
|
||||||
project = await self._get_project_or_404(project_id)
|
project = await self._get_project_or_404(project_id)
|
||||||
|
dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
|
||||||
|
should_exclude_source_documents = False
|
||||||
|
if dataset_type == DATASET_TYPE_TEXT:
|
||||||
|
should_exclude_source_documents = (
|
||||||
|
exclude_source_documents if exclude_source_documents is not None else True
|
||||||
|
)
|
||||||
|
|
||||||
|
base_conditions = [DatasetFiles.dataset_id == project.dataset_id]
|
||||||
|
if should_exclude_source_documents:
|
||||||
|
base_conditions.append(~self._build_source_document_filter())
|
||||||
|
|
||||||
count_result = await self.db.execute(
|
count_result = await self.db.execute(
|
||||||
select(func.count()).select_from(DatasetFiles).where(
|
select(func.count()).select_from(DatasetFiles).where(
|
||||||
DatasetFiles.dataset_id == project.dataset_id
|
*base_conditions
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
total = int(count_result.scalar() or 0)
|
total = int(count_result.scalar() or 0)
|
||||||
@@ -431,7 +458,7 @@ class AnnotationEditorService:
|
|||||||
(AnnotationResult.file_id == DatasetFiles.id)
|
(AnnotationResult.file_id == DatasetFiles.id)
|
||||||
& (AnnotationResult.project_id == project_id),
|
& (AnnotationResult.project_id == project_id),
|
||||||
)
|
)
|
||||||
.where(DatasetFiles.dataset_id == project.dataset_id)
|
.where(*base_conditions)
|
||||||
.order_by(annotated_sort_key.asc(), DatasetFiles.created_at.desc())
|
.order_by(annotated_sort_key.asc(), DatasetFiles.created_at.desc())
|
||||||
.offset(page * size)
|
.offset(page * size)
|
||||||
.limit(size)
|
.limit(size)
|
||||||
|
|||||||
Reference in New Issue
Block a user