feat(annotation): 添加文本数据集段落切片功能

- 在前端组件中新增 segmentationEnabled 字段控制切片开关
- 为文本数据集添加段落切片配置选项，默认启用切片功能
- 在后端接口中新增 segmentation_enabled 参数传递给标注项目
- 实现切片逻辑控制，支持文本数据的自动段落分割
- 添加数据集类型判断，仅文本数据集支持切片配置
- 更新标注任务创建和编辑表单中的切片相关字段处理
2026-01-26 12:05:21 +08:00
parent fa160164d2
commit 371df12a96
7 changed files with 218 additions and 32 deletions
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -56,6 +56,7 @@ TEXTUAL_OBJECT_CATEGORIES = {"text", "document"}
 MEDIA_OBJECT_CATEGORIES = {"image"}
 OBJECT_NAME_HEADER_PREFIX = "dm_object_header_"
 SUPPORTED_EDITOR_DATASET_TYPES = ("TEXT", "IMAGE")
+SEGMENTATION_ENABLED_KEY = "segmentation_enabled"


 class AnnotationEditorService:
@@ -149,6 +150,18 @@ class AnnotationEditorService:
            label_config = self._decorate_label_config_for_editor(label_config)
        return label_config

+    @staticmethod  # resolves the per-project segmentation switch; defaults to True
+    def _resolve_segmentation_enabled(project: LabelingProject) -> bool:
+        config = project.configuration
+        if not isinstance(config, dict):  # no usable configuration dict: treat as enabled
+            return True
+        value = config.get(SEGMENTATION_ENABLED_KEY)
+        if isinstance(value, bool):  # an explicit boolean is returned unchanged
+            return value
+        if value is None:  # key absent or stored as None: treat as enabled
+            return True
+        return bool(value)  # NOTE(review): plain truthiness, so a string such as "false" yields True
+
    @classmethod
    def _resolve_primary_text_key(cls, label_config: Optional[str]) -> Optional[str]:
        if not label_config:
@@ -513,13 +526,19 @@ class AnnotationEditorService:
        ls_task_id = self._make_ls_task_id(project.id, file_id)

        # 判断是否需要分段（JSONL 多行或主文本超过阈值）
-        needs_segmentation = len(records) > 1 or any(
-            len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
+        segmentation_enabled = self._resolve_segmentation_enabled(project)
+        if not segmentation_enabled:
+            segment_index = None
+        needs_segmentation = segmentation_enabled and (
+            len(records) > 1 or any(len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts)
        )
        segments: Optional[List[SegmentInfo]] = None
        current_segment_index = 0
        display_text = record_texts[0] if record_texts else text_content
        selected_payload = records[0][0] if records else None
+        if not segmentation_enabled and len(records) > 1:
+            selected_payload = None
+            display_text = "\n".join(record_texts) if record_texts else text_content

        segment_annotations: Dict[str, Any] = {}
        if ann and ann.annotation and ann.annotation.get("segmented"):