From 0e30e658e963d000fd1025c63411bba3fdff0c9d Mon Sep 17 00:00:00 2001
From: Jerry Yan <792602257@qq.com>
Date: Fri, 23 Jan 2026 22:04:34 +0800
Subject: [PATCH] =?UTF-8?q?feat(annotation):=20=E6=B7=BB=E5=8A=A0=20JSONL?=
 =?UTF-8?q?=20=E6=96=87=E4=BB=B6=E6=94=AF=E6=8C=81=E5=92=8C=E6=96=87?=
 =?UTF-8?q?=E6=9C=AC=E9=94=AE=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 添加 JSONL 文件扩展名常量定义
- 实现主文本键解析方法 _resolve_primary_text_key
- 添加 JSONL 记录解析方法 _parse_jsonl_records
- 修改任务数据构建方法以支持主文本键参数
- 实现主文本值解析方法 _resolve_primary_text_value
- 更新项目信息获取逻辑以支持 JSONL 多行记录处理
- 修改分段逻辑以支持 JSONL 多行或超长文本分段
- 调整标注处理逻辑以正确处理分段标注场景
---
 .../Annotate/LabelStudioTextEditor.tsx        |   6 +-
 .../app/module/annotation/schema/editor.py    |   2 +
 .../app/module/annotation/service/editor.py   | 141 ++++++++++++++----
 3 files changed, 118 insertions(+), 31 deletions(-)
diff --git a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
index abe543d..798d4f6 100644
--- a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
+++ b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
@@ -38,6 +38,8 @@ type SegmentInfo = {
   start: number;
   end: number;
   hasAnnotation: boolean;
+  lineIndex: number;
+  chunkIndex: number;
 };
 
 type ApiResponse<T> = {
@@ -726,9 +728,9 @@ export default function LabelStudioTextEditor() {
                     type={seg.idx === currentSegmentIndex ? "primary" : "default"}
                     onClick={() => handleSegmentChange(seg.idx)}
                     disabled={segmentSwitching || saving || loadingTaskDetail || !lsReady}
-                    style={{ minWidth: 32, padding: "0 8px" }}
+                    style={{ minWidth: 64, padding: "0 6px" }}
                   >
-                    {seg.idx + 1}
+                    {`行${seg.lineIndex + 1}/片${seg.chunkIndex + 1}`}
                     {seg.hasAnnotation && (
                       <CheckOutlined style={{ marginLeft: 2, fontSize: 10 }} />
                     )}
diff --git a/runtime/datamate-python/app/module/annotation/schema/editor.py b/runtime/datamate-python/app/module/annotation/schema/editor.py
index 9c6b130..cd7bf1d 100644
--- a/runtime/datamate-python/app/module/annotation/schema/editor.py
+++ b/runtime/datamate-python/app/module/annotation/schema/editor.py
@@ -59,6 +59,8 @@ class SegmentInfo(BaseModel):
     start: int = Field(..., description="在原文中的起始位置")
     end: int = Field(..., description="在原文中的结束位置")
     has_annotation: bool = Field(False, alias="hasAnnotation", description="该段落是否已有标注")
+    line_index: int = Field(0, alias="lineIndex", description="JSONL 行索引（从0开始）")
+    chunk_index: int = Field(0, alias="chunkIndex", description="行内分片索引（从0开始）")
 
     model_config = ConfigDict(populate_by_name=True)
 
diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py
index f8b6632..ba79aac 100644
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -48,6 +48,7 @@ FILE_ID_CAMEL_KEY = "fileId"
 FILE_NAME_CAMEL_KEY = "fileName"
 SEGMENT_INDEX_KEY = "segment_index"
 SEGMENT_INDEX_CAMEL_KEY = "segmentIndex"
+JSONL_EXTENSION = ".jsonl"
 TEXTUAL_OBJECT_CATEGORIES = {"text", "document"}
 OBJECT_NAME_HEADER_PREFIX = "dm_object_header_"
 
@@ -117,6 +118,17 @@ class AnnotationEditorService:
             label_config = self._decorate_label_config_for_editor(label_config)
         return label_config
 
+    @classmethod
+    def _resolve_primary_text_key(cls, label_config: Optional[str]) -> Optional[str]:
+        """Return the primary text key: ``TEXT_DATA_KEY`` if extracted, else the first key."""
+        # A missing/empty label config is never handed to the extractor.
+        candidates = cls._extract_textual_value_keys(label_config) if label_config else None
+        if not candidates:
+            return None
+        # The conventional text key wins over positional order.
+        has_default_key = TEXT_DATA_KEY in candidates
+        return TEXT_DATA_KEY if has_default_key else candidates[0]
+
     @staticmethod
     def _try_parse_json_payload(text_content: str) -> Optional[Dict[str, Any]]:
         if not text_content:
@@ -132,6 +144,15 @@ class AnnotationEditorService:
             return None
         return parsed if isinstance(parsed, dict) else None
 
+    @classmethod
+    def _parse_jsonl_records(cls, text_content: str) -> List[Tuple[Optional[Dict[str, Any]], str]]:
+        """Split JSONL text into ``(parsed payload, raw line)`` pairs, skipping blank lines."""
+        return [
+            (cls._try_parse_json_payload(raw_line), raw_line)
+            for raw_line in text_content.splitlines()
+            if raw_line.strip()
+        ]
+
     @staticmethod
     def _is_textual_object_tag(object_tag: str) -> bool:
         config = LabelStudioTagConfig.get_object_config(object_tag) or {}
@@ -252,10 +273,11 @@ class AnnotationEditorService:
         file_record: DatasetFiles,
         dataset_id: str,
         file_id: str,
+        primary_text_key: Optional[str],
     ) -> Dict[str, Any]:
         data: Dict[str, Any] = dict(parsed_payload or {})
-        if self._needs_placeholder(data.get(TEXT_DATA_KEY)):
-            data[TEXT_DATA_KEY] = display_text
+        text_key = primary_text_key or TEXT_DATA_KEY
+        data[text_key] = display_text
 
         file_name = str(getattr(file_record, "file_name", ""))
         data[FILE_ID_KEY] = file_id
@@ -268,6 +290,23 @@ class AnnotationEditorService:
         self._apply_text_placeholders(data, label_config)
         return data
 
+    @classmethod
+    def _resolve_primary_text_value(
+        cls,
+        parsed_payload: Optional[Dict[str, Any]],
+        raw_text: str,
+        primary_text_key: Optional[str],
+    ) -> str:
+        """Return the display text for one record, falling back to ``raw_text``.
+
+        The payload is read under ``primary_text_key`` (``TEXT_DATA_KEY`` when no
+        key is given); only a string with non-whitespace content is accepted.
+        """
+        if not parsed_payload:
+            return raw_text
+        candidate = parsed_payload.get(primary_text_key or TEXT_DATA_KEY)
+        return candidate if isinstance(candidate, str) and candidate.strip() else raw_text
+
     async def get_project_info(self, project_id: str) -> EditorProjectInfo:
         project = await self._get_project_or_404(project_id)
 
@@ -372,7 +411,25 @@ class AnnotationEditorService:
         text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
         assert isinstance(text_content, str)
         label_config = await self._resolve_project_label_config(project)
-        parsed_payload = self._try_parse_json_payload(text_content)
+        primary_text_key = self._resolve_primary_text_key(label_config)
+        file_name = str(getattr(file_record, "file_name", "")).lower()
+        records: List[Tuple[Optional[Dict[str, Any]], str]] = []
+        if file_name.endswith(JSONL_EXTENSION):
+            records = self._parse_jsonl_records(text_content)
+        else:
+            parsed_payload = self._try_parse_json_payload(text_content)
+            if parsed_payload:
+                records = [(parsed_payload, text_content)]
+
+        if not records:
+            records = [(None, text_content)]
+
+        record_texts = [
+            self._resolve_primary_text_value(payload, raw_text, primary_text_key)
+            for payload, raw_text in records
+        ]
+        if not record_texts:
+            record_texts = [text_content]
 
         # 获取现有标注
         ann_result = await self.db.execute(
@@ -385,47 +442,73 @@ class AnnotationEditorService:
 
         ls_task_id = self._make_ls_task_id(project_id, file_id)
 
-        # 判断是否需要分段
-        needs_segmentation = len(text_content) > self.SEGMENT_THRESHOLD
+        # 判断是否需要分段（JSONL 多行或主文本超过阈值）
+        needs_segmentation = len(records) > 1 or any(
+            len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
+        )
         segments: Optional[List[SegmentInfo]] = None
         current_segment_index = 0
-        display_text = text_content
+        display_text = record_texts[0] if record_texts else text_content
+        selected_payload = records[0][0] if records else None
+
+        segment_annotations: Dict[str, Any] = {}
+        if ann and ann.annotation and ann.annotation.get("segmented"):
+            segment_annotations = ann.annotation.get("segments", {})
 
         if needs_segmentation:
             splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
-            raw_segments = splitter.split(text_content)
-            current_segment_index = segment_index if segment_index is not None else 0
+            segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
+            segments = []
+            segment_cursor = 0
 
-            # 校验段落索引
-            if current_segment_index < 0 or current_segment_index >= len(raw_segments):
+            for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
+                normalized_text = record_text or ""
+                if len(normalized_text) > self.SEGMENT_THRESHOLD:
+                    raw_segments = splitter.split(normalized_text)
+                    for chunk_index, seg in enumerate(raw_segments):
+                        segments.append(SegmentInfo(
+                            idx=segment_cursor,
+                            text=seg["text"],
+                            start=seg["start"],
+                            end=seg["end"],
+                            hasAnnotation=str(segment_cursor) in segment_annotations,
+                            lineIndex=record_index,
+                            chunkIndex=chunk_index,
+                        ))
+                        segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index))
+                        segment_cursor += 1
+                else:
+                    segments.append(SegmentInfo(
+                        idx=segment_cursor,
+                        text=normalized_text,
+                        start=0,
+                        end=len(normalized_text),
+                        hasAnnotation=str(segment_cursor) in segment_annotations,
+                        lineIndex=record_index,
+                        chunkIndex=0,
+                    ))
+                    segment_contexts.append((payload, raw_text, normalized_text, record_index, 0))
+                    segment_cursor += 1
+
+            if not segments:
+                segments = [SegmentInfo(idx=0, text="", start=0, end=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)]
+                segment_contexts = [(None, "", "", 0, 0)]
+
+            current_segment_index = segment_index if segment_index is not None else 0
+            if current_segment_index < 0 or current_segment_index >= len(segments):
                 current_segment_index = 0
 
-            # 标记每个段落是否已有标注
-            segment_annotations: Dict[str, Any] = {}
-            if ann and ann.annotation and ann.annotation.get("segmented"):
-                segment_annotations = ann.annotation.get("segments", {})
-
-            segments = []
-            for seg in raw_segments:
-                segments.append(SegmentInfo(
-                    idx=seg["idx"],
-                    text=seg["text"],
-                    start=seg["start"],
-                    end=seg["end"],
-                    hasAnnotation=str(seg["idx"]) in segment_annotations,
-                ))
-
-            # 当前段落文本用于 task.data.text
-            display_text = raw_segments[current_segment_index]["text"]
+            selected_payload, _, display_text, _, _ = segment_contexts[current_segment_index]
 
         # 构造 task 对象
         task_data = self._build_task_data(
             display_text=display_text,
-            parsed_payload=parsed_payload,
+            parsed_payload=selected_payload,
             label_config=label_config,
             file_record=file_record,
             dataset_id=project.dataset_id,
             file_id=file_id,
+            primary_text_key=primary_text_key,
         )
         if needs_segmentation:
             task_data[SEGMENT_INDEX_KEY] = current_segment_index
@@ -453,7 +536,7 @@ class AnnotationEditorService:
                     "updated_at": seg_ann.get("updated_at", datetime.utcnow().isoformat() + "Z"),
                 }
                 task["annotations"] = [stored]
-            elif not needs_segmentation:
+            elif not needs_segmentation and not (ann.annotation or {}).get("segmented"):
                 # 非分段模式：直接返回存储的 annotation 原始对象
                 stored = dict(ann.annotation or {})
                 stored["task"] = ls_task_id