From c638182c720419b81fc39abc7bb2815358dfd297 Mon Sep 17 00:00:00 2001
From: Jerry Yan <792602257@qq.com>
Date: Thu, 22 Jan 2026 16:22:32 +0800
Subject: [PATCH] =?UTF-8?q?feat(annotation):=20=E5=A2=9E=E5=BC=BA=E6=A0=87?=
 =?UTF-8?q?=E6=B3=A8=E7=BC=96=E8=BE=91=E5=99=A8=E7=9A=84=E6=96=87=E6=9C=AC?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 添加 JSON 和 XML 解析支持用于处理标注配置
- 实现文本占位符填充机制优化用户体验
- 集成标签工作室配置管理功能
- 添加文本对象类型检测和分类处理
- 实现标注配置装饰器增强编辑器
---

Review notes (not part of the commit message; `git am` ignores the text
between the three-dash line and the diff):

* NOTE(review): the line structure of this patch was reconstructed from a
  whitespace-collapsed copy. Hunk line counts were cross-checked against
  the hunk headers and the diffstat, but the placement of blank context
  lines and the indentation of context lines are inferred. Run
  `git apply --check` before applying.
* get_task validates the downloaded content with
  `assert isinstance(text_content, str)`. Assert statements are removed
  under `python -O`, so this is not a reliable runtime check.
* _try_parse_json_payload lets text starting with "[" through to
  json.loads but returns None for anything that is not a dict, so a
  top-level JSON array is parsed and then discarded.
* _try_parse_json_payload, _extract_textual_value_keys and
  _decorate_label_config_for_editor catch bare `Exception` around
  json.loads / ET.fromstring.
* _decorate_label_config_for_editor calls parent.insert() while the
  `root.iter()` iterator is still being consumed. TODO confirm this is
  safe for the ElementTree implementation in use.
* _build_task_data starts from the parsed JSON payload, uses setdefault
  for file_id / dataset_id / file_name, and replaces "text" only when it
  is missing or blank. A payload that carries those keys therefore takes
  precedence over the per-file values and over the passed display_text.
* _apply_text_placeholders fills every missing or blank `$key` referenced
  by a textual object tag with the key name itself.

 .../app/module/annotation/service/editor.py   | 191 +++++++++++++++++-
 1 file changed, 180 insertions(+), 11 deletions(-)

diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py
index e700dad..30c8098 100644
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -14,12 +14,15 @@ from datetime import datetime
 from typing import Any, Dict, List, Optional, Tuple
 
 import hashlib
+import json
+import xml.etree.ElementTree as ET
 from fastapi import HTTPException
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.core.logging import get_logger
 from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
+from app.module.annotation.config import LabelStudioTagConfig
 from app.module.annotation.schema.editor import (
     EditorProjectInfo,
     EditorTaskListItem,
@@ -36,6 +39,13 @@ from app.module.annotation.service.text_fetcher import fetch_text_content_via_do
 
 logger = get_logger(__name__)
 
+TEXT_DATA_KEY = "text"
+DATASET_ID_KEY = "dataset_id"
+FILE_ID_KEY = "file_id"
+FILE_NAME_KEY = "file_name"
+TEXTUAL_OBJECT_CATEGORIES = {"text", "document"}
+OBJECT_NAME_HEADER_PREFIX = "dm_object_header_"
+
 
 class AnnotationEditorService:
     """Label Studio Editor 集成服务(TEXT POC 版)"""
@@ -92,6 +102,163 @@ class AnnotationEditorService:
         template = await self.template_service.get_template(self.db, template_id)
         return getattr(template, "label_config", None) if template else None
 
+    async def _resolve_project_label_config(self, project: LabelingProject) -> Optional[str]:
+        label_config = None
+        if project.configuration and isinstance(project.configuration, dict):
+            label_config = project.configuration.get("label_config")
+        if not label_config:
+            label_config = await self._get_label_config(project.template_id)
+        if label_config:
+            label_config = self._decorate_label_config_for_editor(label_config)
+        return label_config
+
+    @staticmethod
+    def _try_parse_json_payload(text_content: str) -> Optional[Dict[str, Any]]:
+        if not text_content:
+            return None
+        stripped = text_content.strip()
+        if not stripped:
+            return None
+        if stripped[0] not in ("{", "["):
+            return None
+        try:
+            parsed = json.loads(stripped)
+        except Exception:
+            return None
+        return parsed if isinstance(parsed, dict) else None
+
+    @staticmethod
+    def _is_textual_object_tag(object_tag: str) -> bool:
+        config = LabelStudioTagConfig.get_object_config(object_tag) or {}
+        category = config.get("category")
+        return category in TEXTUAL_OBJECT_CATEGORIES
+
+    @classmethod
+    def _extract_textual_value_keys(cls, label_config: str) -> List[str]:
+        try:
+            root = ET.fromstring(label_config)
+        except Exception as exc:
+            logger.warning("解析 label_config 失败,已跳过占位填充:%s", exc)
+            return []
+
+        object_types = LabelStudioTagConfig.get_object_types()
+        seen: Dict[str, None] = {}
+        for element in root.iter():
+            if element.tag not in object_types:
+                continue
+            if not cls._is_textual_object_tag(element.tag):
+                continue
+            value = element.attrib.get("value", "")
+            if not value.startswith("$"):
+                continue
+            key = value[1:].strip()
+            if not key:
+                continue
+            seen[key] = None
+        return list(seen.keys())
+
+    @staticmethod
+    def _needs_placeholder(value: Any) -> bool:
+        if value is None:
+            return True
+        if isinstance(value, str) and not value.strip():
+            return True
+        return False
+
+    def _apply_text_placeholders(self, data: Dict[str, Any], label_config: Optional[str]) -> None:
+        if not label_config:
+            return
+        for key in self._extract_textual_value_keys(label_config):
+            if self._needs_placeholder(data.get(key)):
+                data[key] = key
+
+    @staticmethod
+    def _header_already_present(header: ET.Element, name: str) -> bool:
+        value = header.attrib.get("value", "")
+        if value == name:
+            return True
+        header_text = (header.text or "").strip()
+        return header_text == name
+
+    def _decorate_label_config_for_editor(self, label_config: str) -> str:
+        try:
+            root = ET.fromstring(label_config)
+        except Exception as exc:
+            logger.warning("解析 label_config 失败,已跳过 name 展示增强:%s", exc)
+            return label_config
+
+        object_types = LabelStudioTagConfig.get_object_types()
+        used_names = set()
+        for element in root.iter():
+            name = element.attrib.get("name")
+            if name:
+                used_names.add(name)
+
+        def allocate_header_name(base: str) -> str:
+            candidate = f"{OBJECT_NAME_HEADER_PREFIX}{base}"
+            if candidate not in used_names:
+                used_names.add(candidate)
+                return candidate
+            idx = 1
+            while f"{candidate}_{idx}" in used_names:
+                idx += 1
+            resolved = f"{candidate}_{idx}"
+            used_names.add(resolved)
+            return resolved
+
+        for parent in root.iter():
+            children = list(parent)
+            i = 0
+            while i < len(children):
+                child = children[i]
+                if child.tag not in object_types:
+                    i += 1
+                    continue
+                if not self._is_textual_object_tag(child.tag):
+                    i += 1
+                    continue
+                obj_name = child.attrib.get("name")
+                if not obj_name:
+                    i += 1
+                    continue
+
+                if i > 0:
+                    prev = children[i - 1]
+                    if prev.tag == "Header" and self._header_already_present(prev, obj_name):
+                        i += 1
+                        continue
+
+                header = ET.Element("Header")
+                header.set("name", allocate_header_name(obj_name))
+                header.set("value", obj_name)
+
+                parent.insert(i, header)
+                children.insert(i, header)
+                i += 2
+            # continue outer loop
+
+        return ET.tostring(root, encoding="unicode")
+
+    def _build_task_data(
+        self,
+        display_text: str,
+        parsed_payload: Optional[Dict[str, Any]],
+        label_config: Optional[str],
+        file_record: DatasetFiles,
+        dataset_id: str,
+        file_id: str,
+    ) -> Dict[str, Any]:
+        data: Dict[str, Any] = dict(parsed_payload or {})
+        if self._needs_placeholder(data.get(TEXT_DATA_KEY)):
+            data[TEXT_DATA_KEY] = display_text
+
+        data.setdefault(FILE_ID_KEY, file_id)
+        data.setdefault(DATASET_ID_KEY, dataset_id)
+        data.setdefault(FILE_NAME_KEY, getattr(file_record, "file_name", ""))
+
+        self._apply_text_placeholders(data, label_config)
+        return data
+
     async def get_project_info(self, project_id: str) -> EditorProjectInfo:
         project = await self._get_project_or_404(project_id)
 
@@ -102,11 +269,7 @@ class AnnotationEditorService:
             unsupported_reason = f"当前仅支持 TEXT,项目数据类型为: {dataset_type or 'UNKNOWN'}"
 
         # 优先使用项目配置中的label_config(用户编辑版本),其次使用模板默认配置
-        label_config = None
-        if project.configuration and isinstance(project.configuration, dict):
-            label_config = project.configuration.get("label_config")
-        if not label_config:
-            label_config = await self._get_label_config(project.template_id)
+        label_config = await self._resolve_project_label_config(project)
 
         return EditorProjectInfo(
             projectId=project.id,
@@ -198,6 +361,9 @@ class AnnotationEditorService:
             raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
 
         text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        assert isinstance(text_content, str)
+        label_config = await self._resolve_project_label_config(project)
+        parsed_payload = self._try_parse_json_payload(text_content)
 
         # 获取现有标注
         ann_result = await self.db.execute(
@@ -244,14 +410,17 @@ class AnnotationEditorService:
             display_text = raw_segments[current_segment_index]["text"]
 
         # 构造 task 对象
+        task_data = self._build_task_data(
+            display_text=display_text,
+            parsed_payload=parsed_payload,
+            label_config=label_config,
+            file_record=file_record,
+            dataset_id=project.dataset_id,
+            file_id=file_id,
+        )
         task: Dict[str, Any] = {
             "id": ls_task_id,
-            "data": {
-                "text": display_text,
-                "file_id": file_id,
-                "dataset_id": project.dataset_id,
-                "file_name": getattr(file_record, "file_name", ""),
-            },
+            "data": task_data,
             "annotations": [],
         }
 