feat(annotation): 增强标注编辑器的文本数据处理功能

- 添加 JSON 和 XML 解析支持用于处理标注配置
- 实现文本占位符填充机制优化用户体验
- 集成标签工作室配置管理功能
- 添加文本对象类型检测和分类处理
- 实现标注配置装饰器增强编辑器
2026-01-22 16:22:32 +08:00
parent d996040b7f
commit c638182c72
1 changed files with 180 additions and 11 deletions
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -14,12 +14,15 @@ from datetime import datetime
 from typing import Any, Dict, List, Optional, Tuple

 import hashlib
+import json
+import xml.etree.ElementTree as ET
 from fastapi import HTTPException
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.core.logging import get_logger
 from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
+from app.module.annotation.config import LabelStudioTagConfig
 from app.module.annotation.schema.editor import (
    EditorProjectInfo,
    EditorTaskListItem,
@@ -36,6 +39,13 @@ from app.module.annotation.service.text_fetcher import fetch_text_content_via_do

 logger = get_logger(__name__)

+TEXT_DATA_KEY = "text"
+DATASET_ID_KEY = "dataset_id"
+FILE_ID_KEY = "file_id"
+FILE_NAME_KEY = "file_name"
+TEXTUAL_OBJECT_CATEGORIES = {"text", "document"}
+OBJECT_NAME_HEADER_PREFIX = "dm_object_header_"
+

 class AnnotationEditorService:
    """Label Studio Editor 集成服务（TEXT POC 版）"""
@@ -92,6 +102,163 @@ class AnnotationEditorService:
        template = await self.template_service.get_template(self.db, template_id)
        return getattr(template, "label_config", None) if template else None

+    async def _resolve_project_label_config(self, project: LabelingProject) -> Optional[str]:
+        label_config = None
+        if project.configuration and isinstance(project.configuration, dict):
+            label_config = project.configuration.get("label_config")
+        if not label_config:
+            label_config = await self._get_label_config(project.template_id)
+        if label_config:
+            label_config = self._decorate_label_config_for_editor(label_config)
+        return label_config
+
+    @staticmethod
+    def _try_parse_json_payload(text_content: str) -> Optional[Dict[str, Any]]:
+        if not text_content:
+            return None
+        stripped = text_content.strip()
+        if not stripped:
+            return None
+        if stripped[0] not in ("{", "["):
+            return None
+        try:
+            parsed = json.loads(stripped)
+        except Exception:
+            return None
+        return parsed if isinstance(parsed, dict) else None
+
+    @staticmethod
+    def _is_textual_object_tag(object_tag: str) -> bool:
+        config = LabelStudioTagConfig.get_object_config(object_tag) or {}
+        category = config.get("category")
+        return category in TEXTUAL_OBJECT_CATEGORIES
+
+    @classmethod
+    def _extract_textual_value_keys(cls, label_config: str) -> List[str]:
+        try:
+            root = ET.fromstring(label_config)
+        except Exception as exc:
+            logger.warning("解析 label_config 失败，已跳过占位填充：%s", exc)
+            return []
+
+        object_types = LabelStudioTagConfig.get_object_types()
+        seen: Dict[str, None] = {}
+        for element in root.iter():
+            if element.tag not in object_types:
+                continue
+            if not cls._is_textual_object_tag(element.tag):
+                continue
+            value = element.attrib.get("value", "")
+            if not value.startswith("$"):
+                continue
+            key = value[1:].strip()
+            if not key:
+                continue
+            seen[key] = None
+        return list(seen.keys())
+
+    @staticmethod
+    def _needs_placeholder(value: Any) -> bool:
+        if value is None:
+            return True
+        if isinstance(value, str) and not value.strip():
+            return True
+        return False
+
+    def _apply_text_placeholders(self, data: Dict[str, Any], label_config: Optional[str]) -> None:
+        if not label_config:
+            return
+        for key in self._extract_textual_value_keys(label_config):
+            if self._needs_placeholder(data.get(key)):
+                data[key] = key
+
+    @staticmethod
+    def _header_already_present(header: ET.Element, name: str) -> bool:
+        value = header.attrib.get("value", "")
+        if value == name:
+            return True
+        header_text = (header.text or "").strip()
+        return header_text == name
+
+    def _decorate_label_config_for_editor(self, label_config: str) -> str:
+        try:
+            root = ET.fromstring(label_config)
+        except Exception as exc:
+            logger.warning("解析 label_config 失败，已跳过 name 展示增强：%s", exc)
+            return label_config
+
+        object_types = LabelStudioTagConfig.get_object_types()
+        used_names = set()
+        for element in root.iter():
+            name = element.attrib.get("name")
+            if name:
+                used_names.add(name)
+
+        def allocate_header_name(base: str) -> str:
+            candidate = f"{OBJECT_NAME_HEADER_PREFIX}{base}"
+            if candidate not in used_names:
+                used_names.add(candidate)
+                return candidate
+            idx = 1
+            while f"{candidate}_{idx}" in used_names:
+                idx += 1
+            resolved = f"{candidate}_{idx}"
+            used_names.add(resolved)
+            return resolved
+
+        for parent in root.iter():
+            children = list(parent)
+            i = 0
+            while i < len(children):
+                child = children[i]
+                if child.tag not in object_types:
+                    i += 1
+                    continue
+                if not self._is_textual_object_tag(child.tag):
+                    i += 1
+                    continue
+                obj_name = child.attrib.get("name")
+                if not obj_name:
+                    i += 1
+                    continue
+
+                if i > 0:
+                    prev = children[i - 1]
+                    if prev.tag == "Header" and self._header_already_present(prev, obj_name):
+                        i += 1
+                        continue
+
+                header = ET.Element("Header")
+                header.set("name", allocate_header_name(obj_name))
+                header.set("value", obj_name)
+
+                parent.insert(i, header)
+                children.insert(i, header)
+                i += 2
+            # continue outer loop
+
+        return ET.tostring(root, encoding="unicode")
+
+    def _build_task_data(
+        self,
+        display_text: str,
+        parsed_payload: Optional[Dict[str, Any]],
+        label_config: Optional[str],
+        file_record: DatasetFiles,
+        dataset_id: str,
+        file_id: str,
+    ) -> Dict[str, Any]:
+        data: Dict[str, Any] = dict(parsed_payload or {})
+        if self._needs_placeholder(data.get(TEXT_DATA_KEY)):
+            data[TEXT_DATA_KEY] = display_text
+
+        data.setdefault(FILE_ID_KEY, file_id)
+        data.setdefault(DATASET_ID_KEY, dataset_id)
+        data.setdefault(FILE_NAME_KEY, getattr(file_record, "file_name", ""))
+
+        self._apply_text_placeholders(data, label_config)
+        return data
+
    async def get_project_info(self, project_id: str) -> EditorProjectInfo:
        project = await self._get_project_or_404(project_id)

@@ -102,11 +269,7 @@ class AnnotationEditorService:
            unsupported_reason = f"当前仅支持 TEXT，项目数据类型为: {dataset_type or 'UNKNOWN'}"

        # 优先使用项目配置中的label_config（用户编辑版本），其次使用模板默认配置
-        label_config = None
-        if project.configuration and isinstance(project.configuration, dict):
-            label_config = project.configuration.get("label_config")
-        if not label_config:
-            label_config = await self._get_label_config(project.template_id)
+        label_config = await self._resolve_project_label_config(project)

        return EditorProjectInfo(
            projectId=project.id,
@@ -198,6 +361,9 @@ class AnnotationEditorService:
            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")

        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        assert isinstance(text_content, str)
+        label_config = await self._resolve_project_label_config(project)
+        parsed_payload = self._try_parse_json_payload(text_content)

        # 获取现有标注
        ann_result = await self.db.execute(
@@ -244,14 +410,17 @@ class AnnotationEditorService:
            display_text = raw_segments[current_segment_index]["text"]

        # 构造 task 对象
+        task_data = self._build_task_data(
+            display_text=display_text,
+            parsed_payload=parsed_payload,
+            label_config=label_config,
+            file_record=file_record,
+            dataset_id=project.dataset_id,
+            file_id=file_id,
+        )
        task: Dict[str, Any] = {
            "id": ls_task_id,
-            "data": {
-                "text": display_text,
-                "file_id": file_id,
-                "dataset_id": project.dataset_id,
-                "file_name": getattr(file_record, "file_name", ""),
-            },
+            "data": task_data,
            "annotations": [],
        }