You've already forked DataMate
feat(annotation): 增强标注编辑器的文本数据处理功能
- 添加 JSON 和 XML 解析支持用于处理标注配置
- 实现文本占位符填充机制优化用户体验
- 集成标签工作室配置管理功能
- 添加文本对象类型检测和分类处理
- 实现标注配置装饰器增强编辑器
This commit is contained in:
@@ -14,12 +14,15 @@ from datetime import datetime
|
|||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import json
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
from sqlalchemy import func, select
|
from sqlalchemy import func, select
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.core.logging import get_logger
|
from app.core.logging import get_logger
|
||||||
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
|
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
|
||||||
|
from app.module.annotation.config import LabelStudioTagConfig
|
||||||
from app.module.annotation.schema.editor import (
|
from app.module.annotation.schema.editor import (
|
||||||
EditorProjectInfo,
|
EditorProjectInfo,
|
||||||
EditorTaskListItem,
|
EditorTaskListItem,
|
||||||
@@ -36,6 +39,13 @@ from app.module.annotation.service.text_fetcher import fetch_text_content_via_do
|
|||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
TEXT_DATA_KEY = "text"
|
||||||
|
DATASET_ID_KEY = "dataset_id"
|
||||||
|
FILE_ID_KEY = "file_id"
|
||||||
|
FILE_NAME_KEY = "file_name"
|
||||||
|
TEXTUAL_OBJECT_CATEGORIES = {"text", "document"}
|
||||||
|
OBJECT_NAME_HEADER_PREFIX = "dm_object_header_"
|
||||||
|
|
||||||
|
|
||||||
class AnnotationEditorService:
|
class AnnotationEditorService:
|
||||||
"""Label Studio Editor 集成服务(TEXT POC 版)"""
|
"""Label Studio Editor 集成服务(TEXT POC 版)"""
|
||||||
@@ -92,6 +102,163 @@ class AnnotationEditorService:
|
|||||||
template = await self.template_service.get_template(self.db, template_id)
|
template = await self.template_service.get_template(self.db, template_id)
|
||||||
return getattr(template, "label_config", None) if template else None
|
return getattr(template, "label_config", None) if template else None
|
||||||
|
|
||||||
|
async def _resolve_project_label_config(self, project: LabelingProject) -> Optional[str]:
    """Return the editor-ready label config for ``project``.

    The ``label_config`` entry of the project's own configuration dict is
    preferred. When that entry is missing or empty, the config is fetched
    through ``_get_label_config`` using the project's ``template_id``.
    A non-empty result is passed through
    ``_decorate_label_config_for_editor`` before it is returned; an empty
    result is returned unchanged.
    """
    configuration = project.configuration
    own_config = (
        configuration.get("label_config")
        if configuration and isinstance(configuration, dict)
        else None
    )
    # Only fall back to the template when the project carries no config.
    resolved = own_config or await self._get_label_config(project.template_id)
    if not resolved:
        return resolved
    return self._decorate_label_config_for_editor(resolved)
|
||||||
|
|
||||||
|
@staticmethod
def _try_parse_json_payload(text_content: str) -> Optional[Dict[str, Any]]:
    """Decode ``text_content`` as a JSON object, or return ``None``.

    Args:
        text_content: Raw text that may or may not hold a JSON document.

    Returns:
        The decoded ``dict`` when the text, ignoring surrounding
        whitespace, is a valid top-level JSON object. ``None`` for empty
        or blank text, for text that is not valid JSON, and for any JSON
        value that is not an object (arrays, strings, numbers, ...).
    """
    if not text_content:
        return None
    stripped = text_content.strip()
    # Only a top-level object can decode to a dict, so anything that does
    # not open with "{" (arrays included) is rejected without paying for a
    # full decode. The slice also covers the blank-string case.
    if stripped[:1] != "{":
        return None
    try:
        parsed = json.loads(stripped)
    except (ValueError, RecursionError):
        # json.JSONDecodeError subclasses ValueError; RecursionError covers
        # pathologically deep nesting. Parsing is best-effort either way.
        return None
    return parsed if isinstance(parsed, dict) else None
|
||||||
|
|
||||||
|
@staticmethod
def _is_textual_object_tag(object_tag: str) -> bool:
    """Tell whether ``object_tag`` belongs to a textual object category.

    The tag's config is looked up through
    ``LabelStudioTagConfig.get_object_config``; the tag counts as textual
    when that config's ``category`` is one of
    ``TEXTUAL_OBJECT_CATEGORIES``. A tag without a config is never textual.
    """
    tag_config = LabelStudioTagConfig.get_object_config(object_tag)
    category = tag_config.get("category") if tag_config else None
    return category in TEXTUAL_OBJECT_CATEGORIES
|
||||||
|
|
||||||
|
@classmethod
def _extract_textual_value_keys(cls, label_config: str) -> List[str]:
    """List the data keys referenced by textual object tags in a config.

    Every element of the parsed ``label_config`` whose tag is a known
    object type and is textual contributes the key named by its
    ``value="$key"`` attribute. Keys are returned in first-seen order
    without duplicates. If the config cannot be parsed as XML a warning
    is logged and an empty list is returned.
    """
    try:
        root = ET.fromstring(label_config)
    except Exception as exc:
        logger.warning("解析 label_config 失败,已跳过占位填充:%s", exc)
        return []

    object_types = LabelStudioTagConfig.get_object_types()

    def referenced_keys():
        # Yield the "$"-referenced key of each textual object element.
        for element in root.iter():
            tag = element.tag
            if tag not in object_types or not cls._is_textual_object_tag(tag):
                continue
            reference = element.attrib.get("value", "")
            if not reference.startswith("$"):
                continue
            key = reference[1:].strip()
            if key:
                yield key

    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(referenced_keys()))
|
||||||
|
|
||||||
|
@staticmethod
def _needs_placeholder(value: Any) -> bool:
    """Return ``True`` when ``value`` is ``None`` or a blank string.

    Any other value, including non-string falsy values such as ``0`` or
    an empty list, does not need a placeholder.
    """
    if isinstance(value, str):
        return not value.strip()
    return value is None
|
||||||
|
|
||||||
|
def _apply_text_placeholders(self, data: Dict[str, Any], label_config: Optional[str]) -> None:
    """Fill blank textual fields of ``data`` in place with their key name.

    For every key returned by ``_extract_textual_value_keys`` for
    ``label_config``, a missing, ``None`` or blank entry in ``data`` is
    replaced by the key itself. Nothing happens when ``label_config`` is
    empty.
    """
    if label_config:
        value_keys = self._extract_textual_value_keys(label_config)
        placeholders = {
            name: name
            for name in value_keys
            if self._needs_placeholder(data.get(name))
        }
        data.update(placeholders)
|
||||||
|
|
||||||
|
@staticmethod
def _header_already_present(header: ET.Element, name: str) -> bool:
    """Tell whether ``header`` already displays ``name``.

    A header matches when either its ``value`` attribute or its
    whitespace-stripped text content equals ``name``.
    """
    shown = (
        header.attrib.get("value", ""),
        (header.text or "").strip(),
    )
    return name in shown
|
||||||
|
|
||||||
|
def _decorate_label_config_for_editor(self, label_config: str) -> str:
    """Insert a ``Header`` naming each textual object tag in the config.

    Every named, textual object element gets a ``Header`` sibling placed
    directly before it, whose ``value`` is the object's ``name`` and whose
    own ``name`` is a unique identifier built from
    ``OBJECT_NAME_HEADER_PREFIX``. An object that is already preceded by a
    ``Header`` showing its name is left alone. The re-serialised XML is
    returned; if the config cannot be parsed a warning is logged and the
    input is returned unchanged.
    """
    try:
        root = ET.fromstring(label_config)
    except Exception as exc:
        logger.warning("解析 label_config 失败,已跳过 name 展示增强:%s", exc)
        return label_config

    object_types = LabelStudioTagConfig.get_object_types()
    # Names already present in the config; generated names must avoid them.
    taken_names = {
        element.attrib["name"]
        for element in root.iter()
        if element.attrib.get("name")
    }

    def allocate_header_name(base: str) -> str:
        # Use the prefixed name, or the first free "_<n>" variant of it.
        stem = f"{OBJECT_NAME_HEADER_PREFIX}{base}"
        chosen = stem
        suffix = 0
        while chosen in taken_names:
            suffix += 1
            chosen = f"{stem}_{suffix}"
        taken_names.add(chosen)
        return chosen

    for parent in root.iter():
        inserted = 0  # headers added to this parent so far
        previous = None
        # Walk a snapshot so insertions do not disturb the traversal;
        # "prior" is always the original preceding sibling.
        for position, child in enumerate(list(parent)):
            prior, previous = previous, child
            if child.tag not in object_types or not self._is_textual_object_tag(child.tag):
                continue
            obj_name = child.attrib.get("name")
            if not obj_name:
                continue
            if (
                prior is not None
                and prior.tag == "Header"
                and self._header_already_present(prior, obj_name)
            ):
                continue

            header = ET.Element("Header")
            header.set("name", allocate_header_name(obj_name))
            header.set("value", obj_name)

            # Snapshot index shifted by earlier insertions = live index.
            parent.insert(position + inserted, header)
            inserted += 1

    return ET.tostring(root, encoding="unicode")
|
||||||
|
|
||||||
|
def _build_task_data(
    self,
    display_text: str,
    parsed_payload: Optional[Dict[str, Any]],
    label_config: Optional[str],
    file_record: DatasetFiles,
    dataset_id: str,
    file_id: str,
) -> Dict[str, Any]:
    """Assemble the ``data`` dict of an editor task.

    Starts from a copy of ``parsed_payload`` (or an empty dict), sets the
    text entry to ``display_text`` when it is missing or blank, adds the
    file id, dataset id and file name only where the payload does not
    already define them, and finally fills placeholders for textual keys
    referenced by ``label_config``.
    """
    data: Dict[str, Any] = {**parsed_payload} if parsed_payload else {}

    current_text = data.get(TEXT_DATA_KEY)
    if self._needs_placeholder(current_text):
        data[TEXT_DATA_KEY] = display_text

    # Payload-provided values win; these are only fallbacks.
    fallbacks = (
        (FILE_ID_KEY, file_id),
        (DATASET_ID_KEY, dataset_id),
        (FILE_NAME_KEY, getattr(file_record, "file_name", "")),
    )
    for key, fallback in fallbacks:
        data.setdefault(key, fallback)

    self._apply_text_placeholders(data, label_config)
    return data
|
||||||
|
|
||||||
async def get_project_info(self, project_id: str) -> EditorProjectInfo:
|
async def get_project_info(self, project_id: str) -> EditorProjectInfo:
|
||||||
project = await self._get_project_or_404(project_id)
|
project = await self._get_project_or_404(project_id)
|
||||||
|
|
||||||
@@ -102,11 +269,7 @@ class AnnotationEditorService:
|
|||||||
unsupported_reason = f"当前仅支持 TEXT,项目数据类型为: {dataset_type or 'UNKNOWN'}"
|
unsupported_reason = f"当前仅支持 TEXT,项目数据类型为: {dataset_type or 'UNKNOWN'}"
|
||||||
|
|
||||||
# 优先使用项目配置中的label_config(用户编辑版本),其次使用模板默认配置
|
# 优先使用项目配置中的label_config(用户编辑版本),其次使用模板默认配置
|
||||||
label_config = None
|
label_config = await self._resolve_project_label_config(project)
|
||||||
if project.configuration and isinstance(project.configuration, dict):
|
|
||||||
label_config = project.configuration.get("label_config")
|
|
||||||
if not label_config:
|
|
||||||
label_config = await self._get_label_config(project.template_id)
|
|
||||||
|
|
||||||
return EditorProjectInfo(
|
return EditorProjectInfo(
|
||||||
projectId=project.id,
|
projectId=project.id,
|
||||||
@@ -198,6 +361,9 @@ class AnnotationEditorService:
|
|||||||
raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
|
raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
|
||||||
|
|
||||||
text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
|
text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
|
||||||
|
assert isinstance(text_content, str)
|
||||||
|
label_config = await self._resolve_project_label_config(project)
|
||||||
|
parsed_payload = self._try_parse_json_payload(text_content)
|
||||||
|
|
||||||
# 获取现有标注
|
# 获取现有标注
|
||||||
ann_result = await self.db.execute(
|
ann_result = await self.db.execute(
|
||||||
@@ -244,14 +410,17 @@ class AnnotationEditorService:
|
|||||||
display_text = raw_segments[current_segment_index]["text"]
|
display_text = raw_segments[current_segment_index]["text"]
|
||||||
|
|
||||||
# 构造 task 对象
|
# 构造 task 对象
|
||||||
|
task_data = self._build_task_data(
|
||||||
|
display_text=display_text,
|
||||||
|
parsed_payload=parsed_payload,
|
||||||
|
label_config=label_config,
|
||||||
|
file_record=file_record,
|
||||||
|
dataset_id=project.dataset_id,
|
||||||
|
file_id=file_id,
|
||||||
|
)
|
||||||
task: Dict[str, Any] = {
|
task: Dict[str, Any] = {
|
||||||
"id": ls_task_id,
|
"id": ls_task_id,
|
||||||
"data": {
|
"data": task_data,
|
||||||
"text": display_text,
|
|
||||||
"file_id": file_id,
|
|
||||||
"dataset_id": project.dataset_id,
|
|
||||||
"file_name": getattr(file_record, "file_name", ""),
|
|
||||||
},
|
|
||||||
"annotations": [],
|
"annotations": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user