From c5ace0c4cce4c99a7c94005e606cca231b4f42ea Mon Sep 17 00:00:00 2001
From: Jerry Yan <792602257@qq.com>
Date: Sun, 25 Jan 2026 17:25:44 +0800
Subject: [PATCH] feat(annotation): support embedded annotation editor for image datasets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add a file preview API that serves a specified dataset file inline
- Implement image task building to support the data structure of image annotation tasks
- Extend the annotation editor service to support TEXT and IMAGE dataset types
- Add media object category support for parsing image labeling configurations
- Implement preview URL construction for image files
- Streamline project info retrieval and task response building
- Fix an incorrect project ID reference in database queries
---
 backend/openapi/specs/data-management.yaml   |  66 +++++--
 .../rest/DatasetFileController.java          |  53 ++++--
 .../app/module/annotation/service/editor.py | 170 ++++++++++++++++--
 3 files changed, 240 insertions(+), 49 deletions(-)

diff --git a/backend/openapi/specs/data-management.yaml b/backend/openapi/specs/data-management.yaml
index 4a0fddf..2e38bc7 100644
--- a/backend/openapi/specs/data-management.yaml
+++ b/backend/openapi/specs/data-management.yaml
@@ -308,12 +308,12 @@ paths:
       '204':
         description: 删除成功
 
-  /data-management/datasets/{datasetId}/files/{fileId}/download:
-    get:
-      tags: [DatasetFile]
-      operationId: downloadDatasetFile
-      summary: 下载文件
-      description: 下载数据集中的指定文件
+  /data-management/datasets/{datasetId}/files/{fileId}/download:
+    get:
+      tags: [DatasetFile]
+      operationId: downloadDatasetFile
+      summary: 下载文件
+      description: 下载数据集中的指定文件
       parameters:
         - name: datasetId
           in: path
@@ -328,19 +328,47 @@ paths:
             type: string
           description: 文件ID
       responses:
-        '200':
-          description: 文件内容
-          content:
-            application/octet-stream:
-              schema:
-                type: string
-                format: binary
-
-  /data-management/datasets/{datasetId}/files/download:
-    get:
-      tags: [ DatasetFile ]
-      operationId: downloadDatasetFileAsZip
-      summary: 下载文件
+        '200':
+          description: 文件内容
+          content:
+            application/octet-stream:
+              schema:
+                type: string
+                format: binary
+
+  /data-management/datasets/{datasetId}/files/{fileId}/preview:
+    get:
+      tags: [DatasetFile]
+      operationId: previewDatasetFile
+      summary: 预览文件
+      description: 以 inline 方式预览数据集中的指定文件
+      parameters:
+        - name: datasetId
+          in: path
+          required: true
+          schema:
+            type: string
+          description: 数据集ID
+        - name: fileId
+          in: path
+          required: true
+          schema:
+            type: string
+          description: 文件ID
+      responses:
+        '200':
+          description: 文件内容
+          content:
+            application/octet-stream:
+              schema:
+                type: string
+                format: binary
+
+  /data-management/datasets/{datasetId}/files/download:
+    get:
+      tags: [ DatasetFile ]
+      operationId: downloadDatasetFileAsZip
+      summary: 下载文件
       description: 下载数据集中全部文件
       parameters:
         - name: datasetId
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
index e1b61d5..bdd41f4 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
@@ -19,11 +19,12 @@ import jakarta.validation.Valid;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.core.io.Resource;
-import org.springframework.http.HttpHeaders;
-import org.springframework.http.HttpStatus;
-import org.springframework.http.MediaType;
-import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.*; +import org.springframework.http.HttpHeaders; +import org.springframework.http.HttpStatus; +import org.springframework.http.MediaType; +import org.springframework.http.MediaTypeFactory; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; import java.util.List; @@ -83,10 +84,10 @@ public class DatasetFileController { } } - @IgnoreResponseWrap - @GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8") - public ResponseEntity downloadDatasetFileById(@PathVariable("datasetId") String datasetId, - @PathVariable("fileId") String fileId) { + @IgnoreResponseWrap + @GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8") + public ResponseEntity downloadDatasetFileById(@PathVariable("datasetId") String datasetId, + @PathVariable("fileId") String fileId) { try { DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId); Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId); @@ -100,12 +101,34 @@ public class DatasetFileController { return ResponseEntity.status(HttpStatus.NOT_FOUND).build(); } catch (Exception e) { return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); - } - } - - @IgnoreResponseWrap - @GetMapping(value = "/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE) - public void downloadDatasetFileAsZip(@PathVariable("datasetId") String datasetId, HttpServletResponse response) { + } + } + + @IgnoreResponseWrap + @GetMapping(value = "/{fileId}/preview", produces = MediaType.ALL_VALUE) + public ResponseEntity previewDatasetFileById(@PathVariable("datasetId") String datasetId, + @PathVariable("fileId") String fileId) { + try { + DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId); + Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId); + MediaType mediaType = MediaTypeFactory.getMediaType(resource) + .orElse(MediaType.APPLICATION_OCTET_STREAM); + + return ResponseEntity.ok() + .contentType(mediaType) + .header(HttpHeaders.CONTENT_DISPOSITION, + "inline; filename=\"" + datasetFile.getFileName() + "\"") + .body(resource); + } catch (IllegalArgumentException e) { + return ResponseEntity.status(HttpStatus.NOT_FOUND).build(); + } catch (Exception e) { + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); + } + } + + @IgnoreResponseWrap + @GetMapping(value = "/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE) + public void downloadDatasetFileAsZip(@PathVariable("datasetId") String datasetId, HttpServletResponse response) { datasetFileApplicationService.downloadDatasetFileAsZip(datasetId, response); } diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py index ba79aac..be7f4b9 100644 --- a/runtime/datamate-python/app/module/annotation/service/editor.py +++ b/runtime/datamate-python/app/module/annotation/service/editor.py @@ -12,6 +12,7 @@ from __future__ import annotations import uuid from datetime import datetime from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import urlparse import hashlib import json @@ -20,6 +21,7 @@ from fastapi import HTTPException from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession +from 
app.core.config import settings from app.core.logging import get_logger from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject from app.module.annotation.config import LabelStudioTagConfig @@ -40,6 +42,7 @@ from app.module.annotation.service.text_fetcher import fetch_text_content_via_do logger = get_logger(__name__) TEXT_DATA_KEY = "text" +IMAGE_DATA_KEY = "image" DATASET_ID_KEY = "dataset_id" FILE_ID_KEY = "file_id" FILE_NAME_KEY = "file_name" @@ -50,7 +53,9 @@ SEGMENT_INDEX_KEY = "segment_index" SEGMENT_INDEX_CAMEL_KEY = "segmentIndex" JSONL_EXTENSION = ".jsonl" TEXTUAL_OBJECT_CATEGORIES = {"text", "document"} +MEDIA_OBJECT_CATEGORIES = {"image"} OBJECT_NAME_HEADER_PREFIX = "dm_object_header_" +SUPPORTED_EDITOR_DATASET_TYPES = ("TEXT", "IMAGE") class AnnotationEditorService: @@ -84,6 +89,32 @@ class AnnotationEditorService: # 单人单份最终标签:每个 task 只保留一个 annotation,id 直接与 task 绑定即可 return self._stable_ls_id(f"annotation:{project_id}:{file_id}") + @staticmethod + def _normalize_dataset_type(dataset_type: Optional[str]) -> str: + return (dataset_type or "").upper() + + @staticmethod + def _resolve_public_api_prefix() -> str: + base = (settings.datamate_backend_base_url or "").strip() + if not base: + return "/api" + parsed = urlparse(base) + if parsed.scheme and parsed.netloc: + prefix = parsed.path + else: + prefix = base + prefix = prefix.rstrip("/") + if not prefix: + return "/api" + if not prefix.startswith("/"): + prefix = "/" + prefix + return prefix + + @classmethod + def _build_file_preview_url(cls, dataset_id: str, file_id: str) -> str: + prefix = cls._resolve_public_api_prefix() + return f"{prefix}/data-management/datasets/{dataset_id}/files/{file_id}/preview" + async def _get_project_or_404(self, project_id: str) -> LabelingProject: result = await self.db.execute( select(LabelingProject).where( @@ -129,6 +160,23 @@ class AnnotationEditorService: return TEXT_DATA_KEY return keys[0] + @classmethod + def _resolve_primary_media_key( + cls, + label_config: Optional[str], + default_key: str, + categories: Optional[set[str]] = None, + ) -> str: + if not label_config: + return default_key + target_categories = categories or set() + keys = cls._extract_object_value_keys_by_category(label_config, target_categories) + if not keys: + return default_key + if default_key in keys: + return default_key + return keys[0] + @staticmethod def _try_parse_json_payload(text_content: str) -> Optional[Dict[str, Any]]: if not text_content: @@ -160,7 +208,11 @@ class AnnotationEditorService: return category in TEXTUAL_OBJECT_CATEGORIES @classmethod - def _extract_textual_value_keys(cls, label_config: str) -> List[str]: + def _extract_object_value_keys_by_category( + cls, + label_config: str, + categories: set[str], + ) -> List[str]: try: root = ET.fromstring(label_config) except Exception as exc: @@ -172,7 +224,9 @@ class AnnotationEditorService: for element in root.iter(): if element.tag not in object_types: continue - if not cls._is_textual_object_tag(element.tag): + config = LabelStudioTagConfig.get_object_config(element.tag) or {} + category = config.get("category") + if categories and category not in categories: continue value = element.attrib.get("value", "") if not value.startswith("$"): @@ -183,6 +237,10 @@ class AnnotationEditorService: seen[key] = None return list(seen.keys()) + @classmethod + def _extract_textual_value_keys(cls, label_config: str) -> List[str]: + return cls._extract_object_value_keys_by_category(label_config, TEXTUAL_OBJECT_CATEGORIES) + @staticmethod def 
_needs_placeholder(value: Any) -> bool: if value is None: @@ -310,11 +368,12 @@ class AnnotationEditorService: async def get_project_info(self, project_id: str) -> EditorProjectInfo: project = await self._get_project_or_404(project_id) - dataset_type = await self._get_dataset_type(project.dataset_id) - supported = (dataset_type or "").upper() == "TEXT" + dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id)) + supported = dataset_type in SUPPORTED_EDITOR_DATASET_TYPES unsupported_reason = None if not supported: - unsupported_reason = f"当前仅支持 TEXT,项目数据类型为: {dataset_type or 'UNKNOWN'}" + supported_hint = "/".join(SUPPORTED_EDITOR_DATASET_TYPES) + unsupported_reason = f"当前仅支持 {supported_hint},项目数据类型为: {dataset_type or 'UNKNOWN'}" # 优先使用项目配置中的label_config(用户编辑版本),其次使用模板默认配置 label_config = await self._resolve_project_label_config(project) @@ -393,10 +452,9 @@ class AnnotationEditorService: ) -> EditorTaskResponse: project = await self._get_project_or_404(project_id) - # TEXT 支持校验 - dataset_type = await self._get_dataset_type(project.dataset_id) - if (dataset_type or "").upper() != "TEXT": - raise HTTPException(status_code=400, detail="当前仅支持 TEXT 项目的内嵌编辑器") + dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id)) + if dataset_type not in SUPPORTED_EDITOR_DATASET_TYPES: + raise HTTPException(status_code=400, detail="当前仅支持 TEXT/IMAGE 项目的内嵌编辑器") file_result = await self.db.execute( select(DatasetFiles).where( @@ -408,6 +466,18 @@ class AnnotationEditorService: if not file_record: raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}") + if dataset_type == "IMAGE": + return await self._build_image_task(project, file_record, file_id) + + return await self._build_text_task(project, file_record, file_id, segment_index) + + async def _build_text_task( + self, + project: LabelingProject, + file_record: DatasetFiles, + file_id: str, + segment_index: Optional[int], + ) -> EditorTaskResponse: text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id) assert isinstance(text_content, str) label_config = await self._resolve_project_label_config(project) @@ -434,13 +504,13 @@ class AnnotationEditorService: # 获取现有标注 ann_result = await self.db.execute( select(AnnotationResult).where( - AnnotationResult.project_id == project_id, + AnnotationResult.project_id == project.id, AnnotationResult.file_id == file_id, ) ) ann = ann_result.scalar_one_or_none() - ls_task_id = self._make_ls_task_id(project_id, file_id) + ls_task_id = self._make_ls_task_id(project.id, file_id) # 判断是否需要分段(JSONL 多行或主文本超过阈值) needs_segmentation = len(records) > 1 or any( @@ -529,7 +599,7 @@ class AnnotationEditorService: segment_annotations = ann.annotation.get("segments", {}) seg_ann = segment_annotations.get(str(current_segment_index), {}) stored = { - "id": self._make_ls_annotation_id(project_id, file_id) + current_segment_index, + "id": self._make_ls_annotation_id(project.id, file_id) + current_segment_index, "task": ls_task_id, "result": seg_ann.get("result", []), "created_at": seg_ann.get("created_at", datetime.utcnow().isoformat() + "Z"), @@ -541,11 +611,11 @@ class AnnotationEditorService: stored = dict(ann.annotation or {}) stored["task"] = ls_task_id if not isinstance(stored.get("id"), int): - stored["id"] = self._make_ls_annotation_id(project_id, file_id) + stored["id"] = self._make_ls_annotation_id(project.id, file_id) task["annotations"] = [stored] else: # 首次从非分段切换到分段:提供空标注 - empty_ann_id = 
self._make_ls_annotation_id(project_id, file_id) + current_segment_index + empty_ann_id = self._make_ls_annotation_id(project.id, file_id) + current_segment_index task["annotations"] = [ { "id": empty_ann_id, @@ -557,7 +627,7 @@ class AnnotationEditorService: ] else: # 提供一个空 annotation,避免前端在没有选中 annotation 时无法产生 result - empty_ann_id = self._make_ls_annotation_id(project_id, file_id) + empty_ann_id = self._make_ls_annotation_id(project.id, file_id) if needs_segmentation: empty_ann_id += current_segment_index task["annotations"] = [ @@ -579,6 +649,76 @@ class AnnotationEditorService: currentSegmentIndex=current_segment_index, ) + async def _build_image_task( + self, + project: LabelingProject, + file_record: DatasetFiles, + file_id: str, + ) -> EditorTaskResponse: + label_config = await self._resolve_project_label_config(project) + image_key = self._resolve_primary_media_key( + label_config, + IMAGE_DATA_KEY, + MEDIA_OBJECT_CATEGORIES, + ) + preview_url = self._build_file_preview_url(project.dataset_id, file_id) + file_name = str(getattr(file_record, "file_name", "")) + + task_data: Dict[str, Any] = { + image_key: preview_url, + FILE_ID_KEY: file_id, + FILE_ID_CAMEL_KEY: file_id, + DATASET_ID_KEY: project.dataset_id, + DATASET_ID_CAMEL_KEY: project.dataset_id, + FILE_NAME_KEY: file_name, + FILE_NAME_CAMEL_KEY: file_name, + } + + # 获取现有标注 + ann_result = await self.db.execute( + select(AnnotationResult).where( + AnnotationResult.project_id == project.id, + AnnotationResult.file_id == file_id, + ) + ) + ann = ann_result.scalar_one_or_none() + ls_task_id = self._make_ls_task_id(project.id, file_id) + + task: Dict[str, Any] = { + "id": ls_task_id, + "data": task_data, + "annotations": [], + } + + annotation_updated_at = None + if ann and not (ann.annotation or {}).get("segmented"): + annotation_updated_at = ann.updated_at + stored = dict(ann.annotation or {}) + stored["task"] = ls_task_id + if not isinstance(stored.get("id"), int): + stored["id"] = self._make_ls_annotation_id(project.id, file_id) + task["annotations"] = [stored] + else: + empty_ann_id = self._make_ls_annotation_id(project.id, file_id) + task["annotations"] = [ + { + "id": empty_ann_id, + "task": ls_task_id, + "result": [], + "created_at": datetime.utcnow().isoformat() + "Z", + "updated_at": datetime.utcnow().isoformat() + "Z", + } + ] + + return EditorTaskResponse( + task=task, + annotationUpdatedAt=annotation_updated_at, + segmented=False, + segments=None, + totalSegments=1, + currentSegmentIndex=0, + ) + async def upsert_annotation(self, project_id: str, file_id: str, request: UpsertAnnotationRequest) -> UpsertAnnotationResponse: project = await self._get_project_or_404(project_id)
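
Note on the preview URL that _build_image_task hands to the editor: _resolve_public_api_prefix keeps only the path component of the configured backend base URL, so the task data carries a relative URL that resolves through the gateway rather than an absolute host. Below is a minimal standalone sketch of that resolution logic for reference; the base URL values in the example are hypothetical and not part of this patch.

from urllib.parse import urlparse


def resolve_public_api_prefix(base_url: str) -> str:
    """Mirror of _resolve_public_api_prefix: keep only the path of an absolute
    URL, fall back to "/api" when nothing usable is configured."""
    base = (base_url or "").strip()
    if not base:
        return "/api"
    parsed = urlparse(base)
    prefix = parsed.path if (parsed.scheme and parsed.netloc) else base
    prefix = prefix.rstrip("/")
    if not prefix:
        return "/api"
    if not prefix.startswith("/"):
        prefix = "/" + prefix
    return prefix


def build_file_preview_url(base_url: str, dataset_id: str, file_id: str) -> str:
    # Same shape as _build_file_preview_url in editor.py.
    prefix = resolve_public_api_prefix(base_url)
    return f"{prefix}/data-management/datasets/{dataset_id}/files/{file_id}/preview"


# Hypothetical configuration values, for illustration only:
print(build_file_preview_url("http://datamate-backend:8080/api", "ds-1", "f-1"))
# -> /api/data-management/datasets/ds-1/files/f-1/preview
print(build_file_preview_url("", "ds-1", "f-1"))
# -> /api/data-management/datasets/ds-1/files/f-1/preview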