feat(annotation): 实现标注结果同步到知识管理功能

- 在知识条目实体中新增来源数据集ID和文件ID字段
- 实现标注编辑器中同步标注结果到知识管理的服务逻辑
- 添加知识同步服务类处理标注到知识条目的转换和同步
- 实现通过下载接口获取文本内容的独立服务模块
- 更新知识条目查询接口支持按来源数据集和文件ID过滤
- 自动创建和关联标注项目对应的知识集
- 支持文本和Markdown文件的内容合并标注结果
- 添加同步过程中的错误处理和日志记录机制
This commit is contained in:
2026-01-21 16:09:34 +08:00
parent 6baf66b304
commit 73f0ab65fa
7 changed files with 422 additions and 46 deletions

View File

@@ -14,12 +14,10 @@ from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
import hashlib
import httpx
from fastapi import HTTPException
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings
from app.core.logging import get_logger
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
from app.module.annotation.schema.editor import (
@@ -32,7 +30,9 @@ from app.module.annotation.schema.editor import (
UpsertAnnotationResponse,
)
from app.module.annotation.service.template import AnnotationTemplateService
from app.module.annotation.service.knowledge_sync import KnowledgeSyncService
from app.module.annotation.service.annotation_text_splitter import AnnotationTextSplitter
from app.module.annotation.service.text_fetcher import fetch_text_content_via_download_api
logger = get_logger(__name__)
@@ -172,44 +172,7 @@ class AnnotationEditorService:
)
async def _fetch_text_content_via_download_api(self, dataset_id: str, file_id: str) -> str:
# NOTE(review): this span looks like a rendered diff with the +/- markers and
# indentation stripped. The try/except body below appears to be the previous
# inline implementation, and the final `return await ...` line appears to be
# its replacement (delegation to the imported helper of the same name).
# Confirm against the repository before relying on either reading.
#
# Build the download URL from the configured backend base URL (trailing "/" removed).
base = settings.datamate_backend_base_url.rstrip("/")
url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"
try:
# 30-second timeout; redirects are followed.
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
resp = await client.get(url)
# Error responses surface as httpx.HTTPStatusError, handled further down.
resp.raise_for_status()
# First size check: use the declared content-length header when one is present.
content_length = resp.headers.get("content-length")
if content_length:
try:
if int(content_length) > settings.editor_max_text_bytes:
raise HTTPException(
status_code=413,
detail=f"文本文件过大,限制 {settings.editor_max_text_bytes} 字节",
)
except ValueError:
# An invalid content-length is ignored; the actual-length check below still applies.
pass
# Second size check: the number of bytes actually received.
data = resp.content
if len(data) > settings.editor_max_text_bytes:
raise HTTPException(
status_code=413,
detail=f"文本文件过大,限制 {settings.editor_max_text_bytes} 字节",
)
# TEXT POC: decode as UTF-8 by default; undecodable bytes become the replacement character.
return data.decode("utf-8", errors="replace")
except HTTPException:
# Let the 413 errors raised above pass through unchanged instead of
# being converted to 502 by the generic handler below.
raise
except httpx.HTTPStatusError as e:
# The download endpoint answered with an error status: log it and report 502.
logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, http={e.response.status_code}")
raise HTTPException(status_code=502, detail="读取文本失败(下载接口返回错误)")
except Exception as e:
# Any other failure while calling the download endpoint is logged and reported as 502.
logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, err={e}")
raise HTTPException(status_code=502, detail="读取文本失败(下载接口调用异常)")
# Delegates to the module-level helper imported from
# app.module.annotation.service.text_fetcher.
return await fetch_text_content_via_download_api(dataset_id, file_id)
async def get_task(
self,
@@ -355,13 +318,14 @@ class AnnotationEditorService:
project = await self._get_project_or_404(project_id)
# 校验文件归属
file_check = await self.db.execute(
select(DatasetFiles.id).where(
file_result = await self.db.execute(
select(DatasetFiles).where(
DatasetFiles.id == file_id,
DatasetFiles.dataset_id == project.dataset_id,
)
)
if not file_check.scalar_one_or_none():
file_record = file_result.scalar_one_or_none()
if not file_record:
raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
annotation_payload = dict(request.annotation or {})
@@ -406,10 +370,12 @@ class AnnotationEditorService:
await self.db.commit()
await self.db.refresh(existing)
return UpsertAnnotationResponse(
response = UpsertAnnotationResponse(
annotationId=existing.id,
updatedAt=existing.updated_at or now,
)
await self._sync_annotation_to_knowledge(project, file_record, final_payload, existing.updated_at)
return response
new_id = str(uuid.uuid4())
record = AnnotationResult(
@@ -424,10 +390,12 @@ class AnnotationEditorService:
await self.db.commit()
await self.db.refresh(record)
return UpsertAnnotationResponse(
response = UpsertAnnotationResponse(
annotationId=record.id,
updatedAt=record.updated_at or now,
)
await self._sync_annotation_to_knowledge(project, file_record, final_payload, record.updated_at)
return response
def _merge_segment_annotation(
self,
@@ -465,3 +433,21 @@ class AnnotationEditorService:
return base
async def _sync_annotation_to_knowledge(
    self,
    project: LabelingProject,
    file_record: DatasetFiles,
    annotation: Dict[str, Any],
    annotation_updated_at: Optional[datetime],
) -> None:
    """Sync an annotation result to knowledge management, best effort.

    Delegates to ``KnowledgeSyncService.sync_annotation_to_knowledge`` using
    this service's database session. Any exception raised by the sync is
    logged and swallowed, so a sync failure never propagates to the caller.

    Args:
        project: Labeling project, forwarded unchanged to the sync service.
        file_record: Dataset file row, forwarded unchanged to the sync service.
        annotation: Annotation payload, forwarded unchanged to the sync service.
        annotation_updated_at: Update timestamp of the annotation, or ``None``;
            forwarded unchanged to the sync service.
    """
    try:
        await KnowledgeSyncService(self.db).sync_annotation_to_knowledge(
            project=project,
            file_record=file_record,
            annotation=annotation,
            annotation_updated_at=annotation_updated_at,
        )
    except Exception as exc:
        # Deliberate best-effort boundary: the failure is only logged. The
        # traceback is attached because the exception is swallowed here and
        # would otherwise be reduced to its message text.
        logger.warning("标注同步知识管理失败:%s", exc, exc_info=True)