feat(annotation): 实现标注结果同步到知识管理功能

- 在知识条目实体中新增来源数据集ID和文件ID字段
- 实现标注编辑器中同步标注结果到知识管理的服务逻辑
- 添加知识同步服务类处理标注到知识条目的转换和同步
- 实现通过下载接口获取文本内容的独立服务模块
- 更新知识条目查询接口支持按来源数据集和文件ID过滤
- 自动创建和关联标注项目对应的知识集
- 支持文本和Markdown文件的内容合并标注结果
- 添加同步过程中的错误处理和日志记录机制
2026-01-21 16:09:34 +08:00
parent 6baf66b304
commit 73f0ab65fa
7 changed files with 422 additions and 46 deletions
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -14,12 +14,10 @@ from datetime import datetime
 from typing import Any, Dict, List, Optional, Tuple

 import hashlib
-import httpx
 from fastapi import HTTPException
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.core.config import settings
 from app.core.logging import get_logger
 from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
 from app.module.annotation.schema.editor import (
@@ -32,7 +30,9 @@ from app.module.annotation.schema.editor import (
    UpsertAnnotationResponse,
 )
 from app.module.annotation.service.template import AnnotationTemplateService
+from app.module.annotation.service.knowledge_sync import KnowledgeSyncService
 from app.module.annotation.service.annotation_text_splitter import AnnotationTextSplitter
+from app.module.annotation.service.text_fetcher import fetch_text_content_via_download_api

 logger = get_logger(__name__)

@@ -172,44 +172,7 @@ class AnnotationEditorService:
        )

    async def _fetch_text_content_via_download_api(self, dataset_id: str, file_id: str) -> str:
-        base = settings.datamate_backend_base_url.rstrip("/")
-        url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"
-
-        try:
-            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
-                resp = await client.get(url)
-                resp.raise_for_status()
-
-                content_length = resp.headers.get("content-length")
-                if content_length:
-                    try:
-                        if int(content_length) > settings.editor_max_text_bytes:
-                            raise HTTPException(
-                                status_code=413,
-                                detail=f"文本文件过大，限制 {settings.editor_max_text_bytes} 字节",
-                            )
-                    except ValueError:
-                        # content-length 非法则忽略，走实际长度判断
-                        pass
-
-                data = resp.content
-                if len(data) > settings.editor_max_text_bytes:
-                    raise HTTPException(
-                        status_code=413,
-                        detail=f"文本文件过大，限制 {settings.editor_max_text_bytes} 字节",
-                    )
-
-                # TEXT POC：默认按 UTF-8 解码，不可解码字符用替换符处理
-                return data.decode("utf-8", errors="replace")
-
-        except HTTPException:
-            raise
-        except httpx.HTTPStatusError as e:
-            logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, http={e.response.status_code}")
-            raise HTTPException(status_code=502, detail="读取文本失败（下载接口返回错误）")
-        except Exception as e:
-            logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, err={e}")
-            raise HTTPException(status_code=502, detail="读取文本失败（下载接口调用异常）")
+        return await fetch_text_content_via_download_api(dataset_id, file_id)

    async def get_task(
        self,
@@ -355,13 +318,14 @@ class AnnotationEditorService:
        project = await self._get_project_or_404(project_id)

        # 校验文件归属
-        file_check = await self.db.execute(
-            select(DatasetFiles.id).where(
+        file_result = await self.db.execute(
+            select(DatasetFiles).where(
                DatasetFiles.id == file_id,
                DatasetFiles.dataset_id == project.dataset_id,
            )
        )
-        if not file_check.scalar_one_or_none():
+        file_record = file_result.scalar_one_or_none()
+        if not file_record:
            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")

        annotation_payload = dict(request.annotation or {})
@@ -406,10 +370,12 @@ class AnnotationEditorService:
            await self.db.commit()
            await self.db.refresh(existing)

-            return UpsertAnnotationResponse(
+            response = UpsertAnnotationResponse(
                annotationId=existing.id,
                updatedAt=existing.updated_at or now,
            )
+            await self._sync_annotation_to_knowledge(project, file_record, final_payload, existing.updated_at)
+            return response

        new_id = str(uuid.uuid4())
        record = AnnotationResult(
@@ -424,10 +390,12 @@ class AnnotationEditorService:
        await self.db.commit()
        await self.db.refresh(record)

-        return UpsertAnnotationResponse(
+        response = UpsertAnnotationResponse(
            annotationId=record.id,
            updatedAt=record.updated_at or now,
        )
+        await self._sync_annotation_to_knowledge(project, file_record, final_payload, record.updated_at)
+        return response

    def _merge_segment_annotation(
        self,
@@ -465,3 +433,21 @@ class AnnotationEditorService:

        return base

+    async def _sync_annotation_to_knowledge(
+        self,
+        project: LabelingProject,
+        file_record: DatasetFiles,
+        annotation: Dict[str, Any],
+        annotation_updated_at: Optional[datetime],
+    ) -> None:
+        """同步标注结果到知识管理（失败不影响标注保存）"""
+        try:
+            await KnowledgeSyncService(self.db).sync_annotation_to_knowledge(
+                project=project,
+                file_record=file_record,
+                annotation=annotation,
+                annotation_updated_at=annotation_updated_at,
+            )
+        except Exception as exc:
+            logger.warning("标注同步知识管理失败：%s", exc)
+