You've already forked DataMate
feat(annotation): 实现标注结果同步到知识管理功能
- 在知识条目实体中新增来源数据集ID和文件ID字段
- 实现标注编辑器中同步标注结果到知识管理的服务逻辑
- 添加知识同步服务类处理标注到知识条目的转换和同步
- 实现通过下载接口获取文本内容的独立服务模块
- 更新知识条目查询接口支持按来源数据集和文件ID过滤
- 自动创建和关联标注项目对应的知识集
- 支持文本和Markdown文件的内容合并标注结果
- 添加同步过程中的错误处理和日志记录机制
This commit is contained in:
@@ -14,12 +14,10 @@ from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import hashlib
|
||||
import httpx
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.logging import get_logger
|
||||
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
|
||||
from app.module.annotation.schema.editor import (
|
||||
@@ -32,7 +30,9 @@ from app.module.annotation.schema.editor import (
|
||||
UpsertAnnotationResponse,
|
||||
)
|
||||
from app.module.annotation.service.template import AnnotationTemplateService
|
||||
from app.module.annotation.service.knowledge_sync import KnowledgeSyncService
|
||||
from app.module.annotation.service.annotation_text_splitter import AnnotationTextSplitter
|
||||
from app.module.annotation.service.text_fetcher import fetch_text_content_via_download_api
|
||||
|
||||
# Module-level logger obtained from the project's logging helper, keyed by this module's name.
logger = get_logger(__name__)
|
||||
|
||||
@@ -172,44 +172,7 @@ class AnnotationEditorService:
|
||||
)
|
||||
|
||||
async def _fetch_text_content_via_download_api(self, dataset_id: str, file_id: str) -> str:
    """Return the text content of a dataset file via the download API.

    Thin wrapper that delegates to the module-level
    ``fetch_text_content_via_download_api`` helper (imported from
    ``app.module.annotation.service.text_fetcher``), so callers of this
    method keep working while the fetching logic lives in one place.

    Args:
        dataset_id: ID of the dataset that owns the file.
        file_id: ID of the file whose text should be fetched.

    Returns:
        Whatever text the shared helper returns for the given file.
    """
    # NOTE(review): the inline httpx implementation that used to live here
    # rejected payloads larger than settings.editor_max_text_bytes with
    # HTTP 413 and mapped download failures to HTTP 502 -- confirm the
    # shared helper preserves that contract.
    return await fetch_text_content_via_download_api(dataset_id, file_id)
|
||||
|
||||
async def get_task(
|
||||
self,
|
||||
@@ -355,13 +318,14 @@ class AnnotationEditorService:
|
||||
project = await self._get_project_or_404(project_id)
|
||||
|
||||
# 校验文件归属
|
||||
file_check = await self.db.execute(
|
||||
select(DatasetFiles.id).where(
|
||||
file_result = await self.db.execute(
|
||||
select(DatasetFiles).where(
|
||||
DatasetFiles.id == file_id,
|
||||
DatasetFiles.dataset_id == project.dataset_id,
|
||||
)
|
||||
)
|
||||
if not file_check.scalar_one_or_none():
|
||||
file_record = file_result.scalar_one_or_none()
|
||||
if not file_record:
|
||||
raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
|
||||
|
||||
annotation_payload = dict(request.annotation or {})
|
||||
@@ -406,10 +370,12 @@ class AnnotationEditorService:
|
||||
await self.db.commit()
|
||||
await self.db.refresh(existing)
|
||||
|
||||
return UpsertAnnotationResponse(
|
||||
response = UpsertAnnotationResponse(
|
||||
annotationId=existing.id,
|
||||
updatedAt=existing.updated_at or now,
|
||||
)
|
||||
await self._sync_annotation_to_knowledge(project, file_record, final_payload, existing.updated_at)
|
||||
return response
|
||||
|
||||
new_id = str(uuid.uuid4())
|
||||
record = AnnotationResult(
|
||||
@@ -424,10 +390,12 @@ class AnnotationEditorService:
|
||||
await self.db.commit()
|
||||
await self.db.refresh(record)
|
||||
|
||||
return UpsertAnnotationResponse(
|
||||
response = UpsertAnnotationResponse(
|
||||
annotationId=record.id,
|
||||
updatedAt=record.updated_at or now,
|
||||
)
|
||||
await self._sync_annotation_to_knowledge(project, file_record, final_payload, record.updated_at)
|
||||
return response
|
||||
|
||||
def _merge_segment_annotation(
|
||||
self,
|
||||
@@ -465,3 +433,21 @@ class AnnotationEditorService:
|
||||
|
||||
return base
|
||||
|
||||
async def _sync_annotation_to_knowledge(
    self,
    project: LabelingProject,
    file_record: DatasetFiles,
    annotation: Dict[str, Any],
    annotation_updated_at: Optional[datetime],
) -> None:
    """Sync a saved annotation to knowledge management (best effort).

    Delegates to ``KnowledgeSyncService(self.db).sync_annotation_to_knowledge``.
    Any ``Exception`` raised by the sync is logged and swallowed, so a sync
    failure does not affect the annotation save.

    Args:
        project: Labeling project the annotation belongs to.
        file_record: Dataset file record the annotation was made on.
        annotation: Annotation payload to sync.
        annotation_updated_at: Update timestamp of the annotation record;
            may be ``None``.
    """
    try:
        await KnowledgeSyncService(self.db).sync_annotation_to_knowledge(
            project=project,
            file_record=file_record,
            annotation=annotation,
            annotation_updated_at=annotation_updated_at,
        )
    except Exception as exc:
        # Deliberate best-effort boundary: never propagate sync failures.
        # exc_info keeps the traceback so swallowed errors stay diagnosable.
        logger.warning("标注同步知识管理失败:%s", exc, exc_info=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user