diff --git a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx index 92d4a01..b7edc80 100644 --- a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx +++ b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx @@ -6,6 +6,7 @@ import { useNavigate, useParams } from "react-router"; import { getEditorProjectInfoUsingGet, getEditorTaskUsingGet, + getEditorTaskSegmentsUsingGet, listEditorTasksUsingGet, upsertEditorAnnotationUsingPut, } from "../annotation.api"; @@ -38,9 +39,6 @@ type LsfMessage = { type SegmentInfo = { idx: number; - text: string; - start: number; - end: number; hasAnnotation: boolean; lineIndex: number; chunkIndex: number; @@ -66,10 +64,16 @@ type EditorTaskPayload = { type EditorTaskResponse = { task?: EditorTaskPayload; segmented?: boolean; - segments?: SegmentInfo[]; + totalSegments?: number; currentSegmentIndex?: number; }; +type EditorTaskSegmentsResponse = { + segmented?: boolean; + segments?: SegmentInfo[]; + totalSegments?: number; +}; + type EditorTaskListResponse = { content?: EditorTaskListItem[]; totalElements?: number; @@ -288,6 +292,7 @@ export default function LabelStudioTextEditor() { const segmentStatsCacheRef = useRef>({}); const segmentStatsSeqRef = useRef(0); const segmentStatsLoadingRef = useRef>(new Set()); + const segmentSummaryFileRef = useRef(""); const [loadingProject, setLoadingProject] = useState(true); const [loadingTasks, setLoadingTasks] = useState(false); @@ -358,9 +363,7 @@ export default function LabelStudioTextEditor() { if (segmentStatsCacheRef.current[fileId] || segmentStatsLoadingRef.current.has(fileId)) return; segmentStatsLoadingRef.current.add(fileId); try { - const resp = (await getEditorTaskUsingGet(projectId, fileId, { - segmentIndex: 0, - })) as ApiResponse; + const resp = (await getEditorTaskSegmentsUsingGet(projectId, fileId)) as ApiResponse; if (segmentStatsSeqRef.current !== seq) return; const data = resp?.data; if (!data?.segmented) return; @@ -591,20 +594,38 @@ export default function LabelStudioTextEditor() { if (seq !== initSeqRef.current) return; // 更新分段状态 - const segmentIndex = data?.segmented + const isSegmented = !!data?.segmented; + const segmentIndex = isSegmented ? resolveSegmentIndex(data.currentSegmentIndex) ?? 0 : undefined; - if (data?.segmented) { - const stats = buildSegmentStats(data.segments); + if (isSegmented) { + let nextSegments: SegmentInfo[] = []; + if (segmentSummaryFileRef.current === fileId && segments.length > 0) { + nextSegments = segments; + } else { + try { + const segmentResp = (await getEditorTaskSegmentsUsingGet(projectId, fileId)) as ApiResponse; + if (seq !== initSeqRef.current) return; + const segmentData = segmentResp?.data; + if (segmentData?.segmented) { + nextSegments = Array.isArray(segmentData.segments) ? segmentData.segments : []; + } + } catch (e) { + console.error(e); + } + } + const stats = buildSegmentStats(nextSegments); setSegmented(true); - setSegments(data.segments || []); + setSegments(nextSegments); setCurrentSegmentIndex(segmentIndex ?? 0); updateSegmentStatsCache(fileId, stats); + segmentSummaryFileRef.current = fileId; } else { setSegmented(false); setSegments([]); setCurrentSegmentIndex(0); updateSegmentStatsCache(fileId, null); + segmentSummaryFileRef.current = fileId; } const taskData = { @@ -664,7 +685,7 @@ export default function LabelStudioTextEditor() { } finally { if (seq === initSeqRef.current) setLoadingTaskDetail(false); } - }, [iframeReady, message, postToIframe, project, projectId, updateSegmentStatsCache]); + }, [iframeReady, message, postToIframe, project, projectId, segments, updateSegmentStatsCache]); const advanceAfterSave = useCallback(async (fileId: string, segmentIndex?: number) => { if (!fileId) return; @@ -979,6 +1000,7 @@ export default function LabelStudioTextEditor() { setSegmented(false); setSegments([]); setCurrentSegmentIndex(0); + segmentSummaryFileRef.current = ""; savedSnapshotsRef.current = {}; segmentStatsSeqRef.current += 1; segmentStatsCacheRef.current = {}; diff --git a/frontend/src/pages/DataAnnotation/annotation.api.ts b/frontend/src/pages/DataAnnotation/annotation.api.ts index 9adb01a..9a437ca 100644 --- a/frontend/src/pages/DataAnnotation/annotation.api.ts +++ b/frontend/src/pages/DataAnnotation/annotation.api.ts @@ -1,20 +1,23 @@ import { get, post, put, del, download } from "@/utils/request"; -// 导出格式类型 -export type ExportFormat = "json" | "jsonl" | "csv" | "coco" | "yolo"; - -// 标注任务管理相关接口 -export function queryAnnotationTasksUsingGet(params?: any) { - return get("/api/annotation/project", params); -} - -export function createAnnotationTaskUsingPost(data: any) { - return post("/api/annotation/project", data); -} - -export function syncAnnotationTaskUsingPost(data: any) { - return post(`/api/annotation/task/sync`, data); -} +// 导出格式类型 +export type ExportFormat = "json" | "jsonl" | "csv" | "coco" | "yolo"; + +type RequestParams = Record; +type RequestPayload = Record; + +// 标注任务管理相关接口 +export function queryAnnotationTasksUsingGet(params?: RequestParams) { + return get("/api/annotation/project", params); +} + +export function createAnnotationTaskUsingPost(data: RequestPayload) { + return post("/api/annotation/project", data); +} + +export function syncAnnotationTaskUsingPost(data: RequestPayload) { + return post(`/api/annotation/task/sync`, data); +} export function deleteAnnotationTaskByIdUsingDelete(mappingId: string) { // Backend expects mapping UUID as path parameter @@ -25,9 +28,9 @@ export function getAnnotationTaskByIdUsingGet(taskId: string) { return get(`/api/annotation/project/${taskId}`); } -export function updateAnnotationTaskByIdUsingPut(taskId: string, data: any) { - return put(`/api/annotation/project/${taskId}`, data); -} +export function updateAnnotationTaskByIdUsingPut(taskId: string, data: RequestPayload) { + return put(`/api/annotation/project/${taskId}`, data); +} // 标签配置管理 export function getTagConfigUsingGet() { @@ -35,20 +38,20 @@ export function getTagConfigUsingGet() { } // 标注模板管理 -export function queryAnnotationTemplatesUsingGet(params?: any) { - return get("/api/annotation/template", params); -} +export function queryAnnotationTemplatesUsingGet(params?: RequestParams) { + return get("/api/annotation/template", params); +} + +export function createAnnotationTemplateUsingPost(data: RequestPayload) { + return post("/api/annotation/template", data); +} -export function createAnnotationTemplateUsingPost(data: any) { - return post("/api/annotation/template", data); -} - -export function updateAnnotationTemplateByIdUsingPut( - templateId: string | number, - data: any -) { - return put(`/api/annotation/template/${templateId}`, data); -} +export function updateAnnotationTemplateByIdUsingPut( + templateId: string | number, + data: RequestPayload +) { + return put(`/api/annotation/template/${templateId}`, data); +} export function deleteAnnotationTemplateByIdUsingDelete( templateId: string | number @@ -65,27 +68,31 @@ export function getEditorProjectInfoUsingGet(projectId: string) { return get(`/api/annotation/editor/projects/${projectId}`); } -export function listEditorTasksUsingGet(projectId: string, params?: any) { - return get(`/api/annotation/editor/projects/${projectId}/tasks`, params); -} +export function listEditorTasksUsingGet(projectId: string, params?: RequestParams) { + return get(`/api/annotation/editor/projects/${projectId}/tasks`, params); +} -export function getEditorTaskUsingGet( - projectId: string, - fileId: string, - params?: { segmentIndex?: number } -) { - return get(`/api/annotation/editor/projects/${projectId}/tasks/${fileId}`, params); -} +export function getEditorTaskUsingGet( + projectId: string, + fileId: string, + params?: { segmentIndex?: number } +) { + return get(`/api/annotation/editor/projects/${projectId}/tasks/${fileId}`, params); +} + +export function getEditorTaskSegmentsUsingGet(projectId: string, fileId: string) { + return get(`/api/annotation/editor/projects/${projectId}/tasks/${fileId}/segments`); +} export function upsertEditorAnnotationUsingPut( - projectId: string, - fileId: string, - data: { - annotation: any; - expectedUpdatedAt?: string; - segmentIndex?: number; - } -) { + projectId: string, + fileId: string, + data: { + annotation: Record; + expectedUpdatedAt?: string; + segmentIndex?: number; + } +) { return put(`/api/annotation/editor/projects/${projectId}/tasks/${fileId}/annotation`, data); } diff --git a/runtime/datamate-python/app/module/annotation/interface/editor.py b/runtime/datamate-python/app/module/annotation/interface/editor.py index cb521e9..d748cb2 100644 --- a/runtime/datamate-python/app/module/annotation/interface/editor.py +++ b/runtime/datamate-python/app/module/annotation/interface/editor.py @@ -20,6 +20,7 @@ from app.module.annotation.schema.editor import ( EditorProjectInfo, EditorTaskListResponse, EditorTaskResponse, + EditorTaskSegmentsResponse, UpsertAnnotationRequest, UpsertAnnotationResponse, ) @@ -87,6 +88,20 @@ async def get_editor_task( return StandardResponse(code=200, message="success", data=task) +@router.get( + "/projects/{project_id}/tasks/{file_id}/segments", + response_model=StandardResponse[EditorTaskSegmentsResponse], +) +async def list_editor_task_segments( + project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"), + file_id: str = Path(..., description="文件ID(t_dm_dataset_files.id)"), + db: AsyncSession = Depends(get_db), +): + service = AnnotationEditorService(db) + result = await service.get_task_segments(project_id, file_id) + return StandardResponse(code=200, message="success", data=result) + + @router.put( "/projects/{project_id}/tasks/{file_id}/annotation", response_model=StandardResponse[UpsertAnnotationResponse], diff --git a/runtime/datamate-python/app/module/annotation/schema/editor.py b/runtime/datamate-python/app/module/annotation/schema/editor.py index 06ced10..d9b776b 100644 --- a/runtime/datamate-python/app/module/annotation/schema/editor.py +++ b/runtime/datamate-python/app/module/annotation/schema/editor.py @@ -79,12 +79,9 @@ class EditorTaskListResponse(BaseModel): class SegmentInfo(BaseModel): - """段落信息(用于文本分段标注)""" + """段落摘要(用于文本分段标注)""" idx: int = Field(..., description="段落索引") - text: str = Field(..., description="段落文本") - start: int = Field(..., description="在原文中的起始位置") - end: int = Field(..., description="在原文中的结束位置") has_annotation: bool = Field(False, alias="hasAnnotation", description="该段落是否已有标注") line_index: int = Field(0, alias="lineIndex", description="JSONL 行索引(从0开始)") chunk_index: int = Field(0, alias="chunkIndex", description="行内分片索引(从0开始)") @@ -100,13 +97,22 @@ class EditorTaskResponse(BaseModel): # 分段相关字段 segmented: bool = Field(False, description="是否启用分段模式") - segments: Optional[List[SegmentInfo]] = Field(None, description="段落列表") total_segments: int = Field(0, alias="totalSegments", description="总段落数") current_segment_index: int = Field(0, alias="currentSegmentIndex", description="当前段落索引") model_config = ConfigDict(populate_by_name=True) +class EditorTaskSegmentsResponse(BaseModel): + """编辑器段落摘要响应""" + + segmented: bool = Field(False, description="是否启用分段模式") + segments: List[SegmentInfo] = Field(default_factory=list, description="段落摘要列表") + total_segments: int = Field(0, alias="totalSegments", description="总段落数") + + model_config = ConfigDict(populate_by_name=True) + + class UpsertAnnotationRequest(BaseModel): """保存/覆盖最终标注(Label Studio annotation 原始对象)""" diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py index f0787a2..26fb4a6 100644 --- a/runtime/datamate-python/app/module/annotation/service/editor.py +++ b/runtime/datamate-python/app/module/annotation/service/editor.py @@ -37,6 +37,7 @@ from app.module.annotation.schema.editor import ( EditorTaskListItem, EditorTaskListResponse, EditorTaskResponse, + EditorTaskSegmentsResponse, SegmentInfo, UpsertAnnotationRequest, UpsertAnnotationResponse, @@ -538,6 +539,49 @@ class AnnotationEditorService: return value return raw_text + def _build_segment_contexts( + self, + records: List[Tuple[Optional[Dict[str, Any]], str]], + record_texts: List[str], + segment_annotation_keys: set[str], + ) -> Tuple[List[SegmentInfo], List[Tuple[Optional[Dict[str, Any]], str, str, int, int]]]: + splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD) + segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = [] + segment_cursor = 0 + + for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)): + normalized_text = record_text or "" + if len(normalized_text) > self.SEGMENT_THRESHOLD: + raw_segments = splitter.split(normalized_text) + for chunk_index, seg in enumerate(raw_segments): + segments.append( + SegmentInfo( + idx=segment_cursor, + hasAnnotation=str(segment_cursor) in segment_annotation_keys, + lineIndex=record_index, + chunkIndex=chunk_index, + ) + ) + segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index)) + segment_cursor += 1 + else: + segments.append( + SegmentInfo( + idx=segment_cursor, + hasAnnotation=str(segment_cursor) in segment_annotation_keys, + lineIndex=record_index, + chunkIndex=0, + ) + ) + segment_contexts.append((payload, raw_text, normalized_text, record_index, 0)) + segment_cursor += 1 + + if not segments: + segments = [SegmentInfo(idx=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)] + segment_contexts = [(None, "", "", 0, 0)] + + return segments, segment_contexts + async def get_project_info(self, project_id: str) -> EditorProjectInfo: project = await self._get_project_or_404(project_id) @@ -668,6 +712,87 @@ class AnnotationEditorService: return await self._build_text_task(project, file_record, file_id, segment_index) + async def get_task_segments( + self, + project_id: str, + file_id: str, + ) -> EditorTaskSegmentsResponse: + project = await self._get_project_or_404(project_id) + + dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id)) + if dataset_type != DATASET_TYPE_TEXT: + raise HTTPException( + status_code=400, + detail="当前仅支持 TEXT 项目的段落摘要", + ) + + file_result = await self.db.execute( + select(DatasetFiles).where( + DatasetFiles.id == file_id, + DatasetFiles.dataset_id == project.dataset_id, + ) + ) + file_record = file_result.scalar_one_or_none() + if not file_record: + raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}") + + if not self._resolve_segmentation_enabled(project): + return EditorTaskSegmentsResponse(segmented=False, segments=[], totalSegments=0) + + text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id) + assert isinstance(text_content, str) + label_config = await self._resolve_project_label_config(project) + primary_text_key = self._resolve_primary_text_key(label_config) + file_name = str(getattr(file_record, "file_name", "")).lower() + + records: List[Tuple[Optional[Dict[str, Any]], str]] = [] + if file_name.endswith(JSONL_EXTENSION): + records = self._parse_jsonl_records(text_content) + else: + parsed_payload = self._try_parse_json_payload(text_content) + if parsed_payload: + records = [(parsed_payload, text_content)] + + if not records: + records = [(None, text_content)] + + record_texts = [ + self._resolve_primary_text_value(payload, raw_text, primary_text_key) + for payload, raw_text in records + ] + if not record_texts: + record_texts = [text_content] + + needs_segmentation = len(records) > 1 or any( + len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts + ) + if not needs_segmentation: + return EditorTaskSegmentsResponse(segmented=False, segments=[], totalSegments=0) + + ann_result = await self.db.execute( + select(AnnotationResult).where( + AnnotationResult.project_id == project.id, + AnnotationResult.file_id == file_id, + ) + ) + ann = ann_result.scalar_one_or_none() + segment_annotations: Dict[str, Dict[str, Any]] = {} + if ann and isinstance(ann.annotation, dict): + segment_annotations = self._extract_segment_annotations(ann.annotation) + segment_annotation_keys = set(segment_annotations.keys()) + + segments, _ = self._build_segment_contexts( + records, + record_texts, + segment_annotation_keys, + ) + + return EditorTaskSegmentsResponse( + segmented=True, + segments=segments, + totalSegments=len(segments), + ) + async def _build_text_task( self, project: LabelingProject, @@ -723,7 +848,8 @@ class AnnotationEditorService: needs_segmentation = segmentation_enabled and ( len(records) > 1 or any(len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts) ) - segments: Optional[List[SegmentInfo]] = None + segments: List[SegmentInfo] = [] + segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = [] current_segment_index = 0 display_text = record_texts[0] if record_texts else text_content selected_payload = records[0][0] if records else None @@ -732,46 +858,13 @@ class AnnotationEditorService: display_text = "\n".join(record_texts) if record_texts else text_content if needs_segmentation: - splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD) - segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = [] - segments = [] - segment_cursor = 0 - - for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)): - normalized_text = record_text or "" - if len(normalized_text) > self.SEGMENT_THRESHOLD: - raw_segments = splitter.split(normalized_text) - for chunk_index, seg in enumerate(raw_segments): - segments.append(SegmentInfo( - idx=segment_cursor, - text=seg["text"], - start=seg["start"], - end=seg["end"], - hasAnnotation=str(segment_cursor) in segment_annotation_keys, - lineIndex=record_index, - chunkIndex=chunk_index, - )) - segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index)) - segment_cursor += 1 - else: - segments.append(SegmentInfo( - idx=segment_cursor, - text=normalized_text, - start=0, - end=len(normalized_text), - hasAnnotation=str(segment_cursor) in segment_annotation_keys, - lineIndex=record_index, - chunkIndex=0, - )) - segment_contexts.append((payload, raw_text, normalized_text, record_index, 0)) - segment_cursor += 1 - - if not segments: - segments = [SegmentInfo(idx=0, text="", start=0, end=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)] - segment_contexts = [(None, "", "", 0, 0)] - + _, segment_contexts = self._build_segment_contexts( + records, + record_texts, + segment_annotation_keys, + ) current_segment_index = segment_index if segment_index is not None else 0 - if current_segment_index < 0 or current_segment_index >= len(segments): + if current_segment_index < 0 or current_segment_index >= len(segment_contexts): current_segment_index = 0 selected_payload, _, display_text, _, _ = segment_contexts[current_segment_index] @@ -849,8 +942,7 @@ class AnnotationEditorService: task=task, annotationUpdatedAt=annotation_updated_at, segmented=needs_segmentation, - segments=segments, - totalSegments=len(segments) if segments else 1, + totalSegments=len(segment_contexts) if needs_segmentation else 1, currentSegmentIndex=current_segment_index, )