From 0e30e658e963d000fd1025c63411bba3fdff0c9d Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Fri, 23 Jan 2026 22:04:34 +0800 Subject: [PATCH] =?UTF-8?q?feat(annotation):=20=E6=B7=BB=E5=8A=A0=20JSONL?= =?UTF-8?q?=20=E6=96=87=E4=BB=B6=E6=94=AF=E6=8C=81=E5=92=8C=E6=96=87?= =?UTF-8?q?=E6=9C=AC=E9=94=AE=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加 JSONL 文件扩展名常量定义 - 实现主文本键解析方法 _resolve_primary_text_key - 添加 JSONL 记录解析方法 _parse_jsonl_records - 修改任务数据构建方法以支持主文本键参数 - 实现主文本值解析方法 _resolve_primary_text_value - 更新项目信息获取逻辑以支持 JSONL 多行记录处理 - 修改分段逻辑以支持 JSONL 多行或超长文本分段 - 调整标注处理逻辑以正确处理分段标注场景 --- .../Annotate/LabelStudioTextEditor.tsx | 6 +- .../app/module/annotation/schema/editor.py | 2 + .../app/module/annotation/service/editor.py | 141 ++++++++++++++---- 3 files changed, 118 insertions(+), 31 deletions(-) diff --git a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx index abe543d..798d4f6 100644 --- a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx +++ b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx @@ -38,6 +38,8 @@ type SegmentInfo = { start: number; end: number; hasAnnotation: boolean; + lineIndex: number; + chunkIndex: number; }; type ApiResponse = { @@ -726,9 +728,9 @@ export default function LabelStudioTextEditor() { type={seg.idx === currentSegmentIndex ? "primary" : "default"} onClick={() => handleSegmentChange(seg.idx)} disabled={segmentSwitching || saving || loadingTaskDetail || !lsReady} - style={{ minWidth: 32, padding: "0 8px" }} + style={{ minWidth: 64, padding: "0 6px" }} > - {seg.idx + 1} + {`行${seg.lineIndex + 1}/片${seg.chunkIndex + 1}`} {seg.hasAnnotation && ( )} diff --git a/runtime/datamate-python/app/module/annotation/schema/editor.py b/runtime/datamate-python/app/module/annotation/schema/editor.py index 9c6b130..cd7bf1d 100644 --- a/runtime/datamate-python/app/module/annotation/schema/editor.py +++ b/runtime/datamate-python/app/module/annotation/schema/editor.py @@ -59,6 +59,8 @@ class SegmentInfo(BaseModel): start: int = Field(..., description="在原文中的起始位置") end: int = Field(..., description="在原文中的结束位置") has_annotation: bool = Field(False, alias="hasAnnotation", description="该段落是否已有标注") + line_index: int = Field(0, alias="lineIndex", description="JSONL 行索引(从0开始)") + chunk_index: int = Field(0, alias="chunkIndex", description="行内分片索引(从0开始)") model_config = ConfigDict(populate_by_name=True) diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py index f8b6632..ba79aac 100644 --- a/runtime/datamate-python/app/module/annotation/service/editor.py +++ b/runtime/datamate-python/app/module/annotation/service/editor.py @@ -48,6 +48,7 @@ FILE_ID_CAMEL_KEY = "fileId" FILE_NAME_CAMEL_KEY = "fileName" SEGMENT_INDEX_KEY = "segment_index" SEGMENT_INDEX_CAMEL_KEY = "segmentIndex" +JSONL_EXTENSION = ".jsonl" TEXTUAL_OBJECT_CATEGORIES = {"text", "document"} OBJECT_NAME_HEADER_PREFIX = "dm_object_header_" @@ -117,6 +118,17 @@ class AnnotationEditorService: label_config = self._decorate_label_config_for_editor(label_config) return label_config + @classmethod + def _resolve_primary_text_key(cls, label_config: Optional[str]) -> Optional[str]: + if not label_config: + return None + keys = cls._extract_textual_value_keys(label_config) + if not keys: + return None + if TEXT_DATA_KEY in keys: + return TEXT_DATA_KEY + return keys[0] + @staticmethod def _try_parse_json_payload(text_content: str) -> Optional[Dict[str, Any]]: if not text_content: @@ -132,6 +144,15 @@ class AnnotationEditorService: return None return parsed if isinstance(parsed, dict) else None + @classmethod + def _parse_jsonl_records(cls, text_content: str) -> List[Tuple[Optional[Dict[str, Any]], str]]: + lines = [line for line in text_content.splitlines() if line.strip()] + records: List[Tuple[Optional[Dict[str, Any]], str]] = [] + for line in lines: + payload = cls._try_parse_json_payload(line) + records.append((payload, line)) + return records + @staticmethod def _is_textual_object_tag(object_tag: str) -> bool: config = LabelStudioTagConfig.get_object_config(object_tag) or {} @@ -252,10 +273,11 @@ class AnnotationEditorService: file_record: DatasetFiles, dataset_id: str, file_id: str, + primary_text_key: Optional[str], ) -> Dict[str, Any]: data: Dict[str, Any] = dict(parsed_payload or {}) - if self._needs_placeholder(data.get(TEXT_DATA_KEY)): - data[TEXT_DATA_KEY] = display_text + text_key = primary_text_key or TEXT_DATA_KEY + data[text_key] = display_text file_name = str(getattr(file_record, "file_name", "")) data[FILE_ID_KEY] = file_id @@ -268,6 +290,23 @@ class AnnotationEditorService: self._apply_text_placeholders(data, label_config) return data + @classmethod + def _resolve_primary_text_value( + cls, + parsed_payload: Optional[Dict[str, Any]], + raw_text: str, + primary_text_key: Optional[str], + ) -> str: + if parsed_payload and primary_text_key: + value = parsed_payload.get(primary_text_key) + if isinstance(value, str) and value.strip(): + return value + if parsed_payload and not primary_text_key: + value = parsed_payload.get(TEXT_DATA_KEY) + if isinstance(value, str) and value.strip(): + return value + return raw_text + async def get_project_info(self, project_id: str) -> EditorProjectInfo: project = await self._get_project_or_404(project_id) @@ -372,7 +411,25 @@ class AnnotationEditorService: text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id) assert isinstance(text_content, str) label_config = await self._resolve_project_label_config(project) - parsed_payload = self._try_parse_json_payload(text_content) + primary_text_key = self._resolve_primary_text_key(label_config) + file_name = str(getattr(file_record, "file_name", "")).lower() + records: List[Tuple[Optional[Dict[str, Any]], str]] = [] + if file_name.endswith(JSONL_EXTENSION): + records = self._parse_jsonl_records(text_content) + else: + parsed_payload = self._try_parse_json_payload(text_content) + if parsed_payload: + records = [(parsed_payload, text_content)] + + if not records: + records = [(None, text_content)] + + record_texts = [ + self._resolve_primary_text_value(payload, raw_text, primary_text_key) + for payload, raw_text in records + ] + if not record_texts: + record_texts = [text_content] # 获取现有标注 ann_result = await self.db.execute( @@ -385,47 +442,73 @@ class AnnotationEditorService: ls_task_id = self._make_ls_task_id(project_id, file_id) - # 判断是否需要分段 - needs_segmentation = len(text_content) > self.SEGMENT_THRESHOLD + # 判断是否需要分段(JSONL 多行或主文本超过阈值) + needs_segmentation = len(records) > 1 or any( + len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts + ) segments: Optional[List[SegmentInfo]] = None current_segment_index = 0 - display_text = text_content + display_text = record_texts[0] if record_texts else text_content + selected_payload = records[0][0] if records else None + + segment_annotations: Dict[str, Any] = {} + if ann and ann.annotation and ann.annotation.get("segmented"): + segment_annotations = ann.annotation.get("segments", {}) if needs_segmentation: splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD) - raw_segments = splitter.split(text_content) - current_segment_index = segment_index if segment_index is not None else 0 + segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = [] + segments = [] + segment_cursor = 0 - # 校验段落索引 - if current_segment_index < 0 or current_segment_index >= len(raw_segments): + for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)): + normalized_text = record_text or "" + if len(normalized_text) > self.SEGMENT_THRESHOLD: + raw_segments = splitter.split(normalized_text) + for chunk_index, seg in enumerate(raw_segments): + segments.append(SegmentInfo( + idx=segment_cursor, + text=seg["text"], + start=seg["start"], + end=seg["end"], + hasAnnotation=str(segment_cursor) in segment_annotations, + lineIndex=record_index, + chunkIndex=chunk_index, + )) + segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index)) + segment_cursor += 1 + else: + segments.append(SegmentInfo( + idx=segment_cursor, + text=normalized_text, + start=0, + end=len(normalized_text), + hasAnnotation=str(segment_cursor) in segment_annotations, + lineIndex=record_index, + chunkIndex=0, + )) + segment_contexts.append((payload, raw_text, normalized_text, record_index, 0)) + segment_cursor += 1 + + if not segments: + segments = [SegmentInfo(idx=0, text="", start=0, end=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)] + segment_contexts = [(None, "", "", 0, 0)] + + current_segment_index = segment_index if segment_index is not None else 0 + if current_segment_index < 0 or current_segment_index >= len(segments): current_segment_index = 0 - # 标记每个段落是否已有标注 - segment_annotations: Dict[str, Any] = {} - if ann and ann.annotation and ann.annotation.get("segmented"): - segment_annotations = ann.annotation.get("segments", {}) - - segments = [] - for seg in raw_segments: - segments.append(SegmentInfo( - idx=seg["idx"], - text=seg["text"], - start=seg["start"], - end=seg["end"], - hasAnnotation=str(seg["idx"]) in segment_annotations, - )) - - # 当前段落文本用于 task.data.text - display_text = raw_segments[current_segment_index]["text"] + selected_payload, _, display_text, _, _ = segment_contexts[current_segment_index] # 构造 task 对象 task_data = self._build_task_data( display_text=display_text, - parsed_payload=parsed_payload, + parsed_payload=selected_payload, label_config=label_config, file_record=file_record, dataset_id=project.dataset_id, file_id=file_id, + primary_text_key=primary_text_key, ) if needs_segmentation: task_data[SEGMENT_INDEX_KEY] = current_segment_index @@ -453,7 +536,7 @@ class AnnotationEditorService: "updated_at": seg_ann.get("updated_at", datetime.utcnow().isoformat() + "Z"), } task["annotations"] = [stored] - elif not needs_segmentation: + elif not needs_segmentation and not (ann.annotation or {}).get("segmented"): # 非分段模式:直接返回存储的 annotation 原始对象 stored = dict(ann.annotation or {}) stored["task"] = ls_task_id