Files
DataMate/runtime/datamate-python/app/module/annotation/service/text_fetcher.py
Jerry Yan 73f0ab65fa feat(annotation): 实现标注结果同步到知识管理功能
- 在知识条目实体中新增来源数据集ID和文件ID字段
- 实现标注编辑器中同步标注结果到知识管理的服务逻辑
- 添加知识同步服务类处理标注到知识条目的转换和同步
- 实现通过下载接口获取文本内容的独立服务模块
- 更新知识条目查询接口支持按来源数据集和文件ID过滤
- 自动创建和关联标注项目对应的知识集
- 支持文本和Markdown文件的内容合并标注结果
- 添加同步过程中的错误处理和日志记录机制
2026-01-21 16:09:34 +08:00

57 lines
2.2 KiB
Python

from __future__ import annotations
import httpx
from fastapi import HTTPException
from app.core.config import settings
from app.core.logging import get_logger
# Module-level logger obtained from the project's logging helper, keyed by this module's name.
logger = get_logger(__name__)
async def fetch_text_content_via_download_api(dataset_id: str, file_id: str) -> str:
    """Read a file's text content through the backend download endpoint.

    The response body is streamed rather than buffered in one shot, and reading
    stops as soon as the accumulated size exceeds
    ``settings.editor_max_text_bytes``. This keeps an oversized file from being
    loaded into memory before the size limit is enforced.

    Args:
        dataset_id: Dataset identifier interpolated into the download URL.
        file_id: File identifier interpolated into the download URL.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced
        with the Unicode replacement character.

    Raises:
        HTTPException: 413 when the declared ``content-length`` or the actual
            body size exceeds ``settings.editor_max_text_bytes``; 502 when the
            download endpoint returns an error status or the request fails
            for any other reason.
    """
    base = settings.datamate_backend_base_url.rstrip("/")
    url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"
    max_bytes = settings.editor_max_text_bytes

    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            # Stream so that headers can be inspected (and the transfer aborted)
            # before the whole body has been pulled into memory.
            async with client.stream("GET", url) as resp:
                resp.raise_for_status()

                declared = resp.headers.get("content-length")
                if declared:
                    try:
                        declared_size = int(declared)
                    except ValueError:
                        # Malformed content-length: ignore it and rely on the
                        # actual byte count enforced while streaming below.
                        declared_size = None
                    if declared_size is not None and declared_size > max_bytes:
                        raise HTTPException(
                            status_code=413,
                            detail=f"文本文件过大,限制 {settings.editor_max_text_bytes} 字节",
                        )

                buffer = bytearray()
                async for chunk in resp.aiter_bytes():
                    buffer.extend(chunk)
                    # The header may be missing or wrong, so enforce the limit
                    # on the bytes actually received and stop reading early.
                    if len(buffer) > max_bytes:
                        raise HTTPException(
                            status_code=413,
                            detail=f"文本文件过大,限制 {settings.editor_max_text_bytes} 字节",
                        )

        # TEXT POC: decode as UTF-8 by default; undecodable bytes are replaced.
        return buffer.decode("utf-8", errors="replace")
    except HTTPException:
        # Size-limit errors raised above must reach the caller unchanged
        # instead of being rewritten to 502 by the generic handler below.
        raise
    except httpx.HTTPStatusError as exc:
        logger.error(
            "读取文本失败: dataset=%s, file=%s, http=%s",
            dataset_id,
            file_id,
            exc.response.status_code,
        )
        raise HTTPException(status_code=502, detail="读取文本失败(下载接口返回错误)") from exc
    except Exception as exc:
        logger.error("读取文本失败: dataset=%s, file=%s, err=%s", dataset_id, file_id, exc)
        raise HTTPException(status_code=502, detail="读取文本失败(下载接口调用异常)") from exc