From c5c8e6c69e57c507858faf5ca9bf503d29c27436 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sat, 31 Jan 2026 14:36:16 +0800 Subject: [PATCH] =?UTF-8?q?feat(annotation):=20=E6=B7=BB=E5=8A=A0=E5=88=86?= =?UTF-8?q?=E6=AE=B5=E6=A0=87=E6=B3=A8=E5=8A=9F=E8=83=BD=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 定义分段标注相关常量(segmented、segments、result等键名) - 实现分段标注提取方法_extract_segment_annotations处理字典和列表格式 - 添加分段标注判断方法_is_segmented_annotation检测标注状态 - 修改_has_annotation_result方法使用新的分段标注处理逻辑 - 在任务创建过程中集成分段标注数据处理 - 更新导出服务中的分段标注结果扁平化处理 - 实现标注归一化方法支持分段标注格式转换 - 调整JSON和CSV导出格式适配分段标注结构 --- .../app/module/annotation/service/editor.py | 103 ++++++++++++++---- .../app/module/annotation/service/export.py | 71 +++++++++++- 2 files changed, 145 insertions(+), 29 deletions(-) diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py index 8457d3b..10953a3 100644 --- a/runtime/datamate-python/app/module/annotation/service/editor.py +++ b/runtime/datamate-python/app/module/annotation/service/editor.py @@ -59,6 +59,11 @@ FILE_ID_CAMEL_KEY = "fileId" FILE_NAME_CAMEL_KEY = "fileName" SEGMENT_INDEX_KEY = "segment_index" SEGMENT_INDEX_CAMEL_KEY = "segmentIndex" +SEGMENTED_KEY = "segmented" +SEGMENTS_KEY = "segments" +SEGMENT_RESULT_KEY = "result" +SEGMENT_CREATED_AT_KEY = "created_at" +SEGMENT_UPDATED_AT_KEY = "updated_at" JSONL_EXTENSION = ".jsonl" TEXTUAL_OBJECT_CATEGORIES = {"text", "document"} IMAGE_OBJECT_CATEGORIES = {"image"} @@ -352,22 +357,63 @@ class AnnotationEditorService: return ET.tostring(root, encoding="unicode") + @staticmethod + def _extract_segment_annotations(payload: Optional[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]: + if not payload or not isinstance(payload, dict): + return {} + segments = payload.get(SEGMENTS_KEY) + if isinstance(segments, dict): + normalized: Dict[str, Dict[str, Any]] = {} + for key, value in segments.items(): + if isinstance(value, dict): + normalized[str(key)] = value + return normalized + if isinstance(segments, list): + normalized: Dict[str, Dict[str, Any]] = {} + for idx, value in enumerate(segments): + if not isinstance(value, dict): + continue + key = ( + value.get(SEGMENT_INDEX_CAMEL_KEY) + or value.get(SEGMENT_INDEX_KEY) + or value.get("segment") + or value.get("idx") + ) + if key is None: + key = idx + normalized[str(key)] = value + return normalized + return {} + + @staticmethod + def _is_segmented_annotation(payload: Optional[Dict[str, Any]]) -> bool: + if not payload or not isinstance(payload, dict): + return False + if payload.get(SEGMENTED_KEY): + return True + segments = payload.get(SEGMENTS_KEY) + if isinstance(segments, dict): + return len(segments) > 0 + if isinstance(segments, list): + return len(segments) > 0 + return False + @staticmethod def _has_annotation_result(payload: Optional[Dict[str, Any]]) -> bool: if not payload or not isinstance(payload, dict): return False - if payload.get("segmented"): - segments = payload.get("segments", {}) - if not isinstance(segments, dict): + if AnnotationEditorService._is_segmented_annotation(payload): + segments = AnnotationEditorService._extract_segment_annotations(payload) + if not segments: return False for segment in segments.values(): if not isinstance(segment, dict): continue - result = segment.get("result") + result = segment.get(SEGMENT_RESULT_KEY) if isinstance(result, list) and len(result) > 0: return True return False - result = payload.get("result") + result = payload.get(SEGMENT_RESULT_KEY) return isinstance(result, list) and len(result) > 0 @classmethod @@ -591,6 +637,13 @@ class AnnotationEditorService: ls_task_id = self._make_ls_task_id(project.id, file_id) + segment_annotations: Dict[str, Dict[str, Any]] = {} + has_segmented_annotation = False + if ann and isinstance(ann.annotation, dict): + segment_annotations = self._extract_segment_annotations(ann.annotation) + has_segmented_annotation = self._is_segmented_annotation(ann.annotation) + segment_annotation_keys = set(segment_annotations.keys()) + # 判断是否需要分段(JSONL 多行或主文本超过阈值) segmentation_enabled = self._resolve_segmentation_enabled(project) if not segmentation_enabled: @@ -606,10 +659,6 @@ class AnnotationEditorService: selected_payload = None display_text = "\n".join(record_texts) if record_texts else text_content - segment_annotations: Dict[str, Any] = {} - if ann and ann.annotation and ann.annotation.get("segmented"): - segment_annotations = ann.annotation.get("segments", {}) - if needs_segmentation: splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD) segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = [] @@ -626,7 +675,7 @@ class AnnotationEditorService: text=seg["text"], start=seg["start"], end=seg["end"], - hasAnnotation=str(segment_cursor) in segment_annotations, + hasAnnotation=str(segment_cursor) in segment_annotation_keys, lineIndex=record_index, chunkIndex=chunk_index, )) @@ -638,7 +687,7 @@ class AnnotationEditorService: text=normalized_text, start=0, end=len(normalized_text), - hasAnnotation=str(segment_cursor) in segment_annotations, + hasAnnotation=str(segment_cursor) in segment_annotation_keys, lineIndex=record_index, chunkIndex=0, )) @@ -679,19 +728,18 @@ class AnnotationEditorService: if ann: annotation_updated_at = ann.updated_at - if needs_segmentation and ann.annotation and ann.annotation.get("segmented"): + if needs_segmentation and has_segmented_annotation: # 分段模式:获取当前段落的标注 - segment_annotations = ann.annotation.get("segments", {}) seg_ann = segment_annotations.get(str(current_segment_index), {}) stored = { "id": self._make_ls_annotation_id(project.id, file_id) + current_segment_index, "task": ls_task_id, - "result": seg_ann.get("result", []), - "created_at": seg_ann.get("created_at", datetime.utcnow().isoformat() + "Z"), - "updated_at": seg_ann.get("updated_at", datetime.utcnow().isoformat() + "Z"), + "result": seg_ann.get(SEGMENT_RESULT_KEY, []), + "created_at": seg_ann.get(SEGMENT_CREATED_AT_KEY, datetime.utcnow().isoformat() + "Z"), + "updated_at": seg_ann.get(SEGMENT_UPDATED_AT_KEY, datetime.utcnow().isoformat() + "Z"), } task["annotations"] = [stored] - elif not needs_segmentation and not (ann.annotation or {}).get("segmented"): + elif not needs_segmentation and not has_segmented_annotation: # 非分段模式:直接返回存储的 annotation 原始对象 stored = dict(ann.annotation or {}) stored["task"] = ls_task_id @@ -968,21 +1016,28 @@ class AnnotationEditorService: Returns: 合并后的 annotation 结构 """ - if not existing or not existing.get("segmented"): + if not existing or not existing.get(SEGMENTED_KEY): # 初始化分段结构 base: Dict[str, Any] = { - "segmented": True, + SEGMENTED_KEY: True, "version": 1, - "segments": {}, + SEGMENTS_KEY: {}, } else: base = dict(existing) + if not base.get(SEGMENTED_KEY): + base[SEGMENTED_KEY] = True + segments = base.get(SEGMENTS_KEY) + if not isinstance(segments, dict): + segments = {} + base[SEGMENTS_KEY] = segments + # 更新指定段落的标注 - base["segments"][str(segment_index)] = { - "result": new_annotation.get("result", []), - "created_at": new_annotation.get("created_at", datetime.utcnow().isoformat() + "Z"), - "updated_at": datetime.utcnow().isoformat() + "Z", + segments[str(segment_index)] = { + SEGMENT_RESULT_KEY: new_annotation.get(SEGMENT_RESULT_KEY, []), + SEGMENT_CREATED_AT_KEY: new_annotation.get(SEGMENT_CREATED_AT_KEY, datetime.utcnow().isoformat() + "Z"), + SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z", } return base diff --git a/runtime/datamate-python/app/module/annotation/service/export.py b/runtime/datamate-python/app/module/annotation/service/export.py index 50e351e..c2a4aec 100644 --- a/runtime/datamate-python/app/module/annotation/service/export.py +++ b/runtime/datamate-python/app/module/annotation/service/export.py @@ -63,6 +63,12 @@ from ..schema.export import ( logger = get_logger(__name__) +SEGMENTED_KEY = "segmented" +SEGMENTS_KEY = "segments" +SEGMENT_RESULT_KEY = "result" +SEGMENT_INDEX_KEY = "segmentIndex" +SEGMENT_INDEX_FALLBACK_KEY = "segment_index" + class AnnotationExportService: """标注数据导出服务""" @@ -239,6 +245,61 @@ class AnnotationExportService: return items + @staticmethod + def _flatten_annotation_results(annotation: Dict[str, Any]) -> List[Dict[str, Any]]: + if not annotation or not isinstance(annotation, dict): + return [] + segments = annotation.get(SEGMENTS_KEY) + if annotation.get(SEGMENTED_KEY) or isinstance(segments, (dict, list)): + results: List[Dict[str, Any]] = [] + if isinstance(segments, dict): + for key, segment in segments.items(): + if not isinstance(segment, dict): + continue + segment_results = segment.get(SEGMENT_RESULT_KEY) + if not isinstance(segment_results, list): + continue + for item in segment_results: + if isinstance(item, dict): + normalized = dict(item) + if SEGMENT_INDEX_KEY not in normalized and SEGMENT_INDEX_FALLBACK_KEY not in normalized: + normalized[SEGMENT_INDEX_KEY] = int(key) if str(key).isdigit() else key + results.append(normalized) + else: + results.append({"value": item, SEGMENT_INDEX_KEY: key}) + elif isinstance(segments, list): + for idx, segment in enumerate(segments): + if not isinstance(segment, dict): + continue + segment_results = segment.get(SEGMENT_RESULT_KEY) + if not isinstance(segment_results, list): + continue + segment_index = segment.get(SEGMENT_INDEX_KEY, segment.get(SEGMENT_INDEX_FALLBACK_KEY, idx)) + for item in segment_results: + if isinstance(item, dict): + normalized = dict(item) + if SEGMENT_INDEX_KEY not in normalized and SEGMENT_INDEX_FALLBACK_KEY not in normalized: + normalized[SEGMENT_INDEX_KEY] = segment_index + results.append(normalized) + else: + results.append({"value": item, SEGMENT_INDEX_KEY: segment_index}) + return results + result = annotation.get(SEGMENT_RESULT_KEY) + return result if isinstance(result, list) else [] + + @classmethod + def _normalize_annotation_for_export(cls, annotation: Dict[str, Any]) -> Dict[str, Any]: + if not annotation or not isinstance(annotation, dict): + return {} + segments = annotation.get(SEGMENTS_KEY) + if annotation.get(SEGMENTED_KEY) or isinstance(segments, (dict, list)): + normalized = dict(annotation) + normalized_result = cls._flatten_annotation_results(annotation) + if SEGMENT_RESULT_KEY not in normalized or not isinstance(normalized.get(SEGMENT_RESULT_KEY), list): + normalized[SEGMENT_RESULT_KEY] = normalized_result + return normalized + return annotation + def _export_json( self, items: List[AnnotationExportItem], project_name: str ) -> Tuple[bytes, str, str]: @@ -252,7 +313,7 @@ class AnnotationExportService: "file_id": item.file_id, "file_name": item.file_name, "data": item.data, - "annotations": item.annotations, + "annotations": [self._normalize_annotation_for_export(ann) for ann in item.annotations], "created_at": item.created_at.isoformat() if item.created_at else None, "updated_at": item.updated_at.isoformat() if item.updated_at else None, } @@ -274,7 +335,7 @@ class AnnotationExportService: "file_id": item.file_id, "file_name": item.file_name, "data": item.data, - "annotations": item.annotations, + "annotations": [self._normalize_annotation_for_export(ann) for ann in item.annotations], "created_at": item.created_at.isoformat() if item.created_at else None, "updated_at": item.updated_at.isoformat() if item.updated_at else None, } @@ -307,7 +368,7 @@ class AnnotationExportService: # 提取标签信息(支持多种标注类型) labels = [] for ann in item.annotations: - results = ann.get("result", []) + results = self._flatten_annotation_results(ann) for r in results: value = r.get("value", {}) label_type = r.get("type", "") @@ -382,7 +443,7 @@ class AnnotationExportService: # 处理标注 for ann in item.annotations: - results = ann.get("result", []) + results = self._flatten_annotation_results(ann) for r in results: # 处理矩形框标注 (rectanglelabels) if r.get("type") == "rectanglelabels": @@ -434,7 +495,7 @@ class AnnotationExportService: lines = [] for ann in item.annotations: - results = ann.get("result", []) + results = self._flatten_annotation_results(ann) for r in results: # 处理矩形框标注 if r.get("type") == "rectanglelabels":