You've already forked DataMate
- 在知识条目实体中新增来源数据集ID和文件ID字段 - 实现标注编辑器中同步标注结果到知识管理的服务逻辑 - 添加知识同步服务类处理标注到知识条目的转换和同步 - 实现通过下载接口获取文本内容的独立服务模块 - 更新知识条目查询接口支持按来源数据集和文件ID过滤 - 自动创建和关联标注项目对应的知识集 - 支持文本和Markdown文件的内容合并标注结果 - 添加同步过程中的错误处理和日志记录机制
57 lines
2.2 KiB
Python
57 lines
2.2 KiB
Python
from __future__ import annotations
|
|
|
|
import httpx
|
|
from fastapi import HTTPException
|
|
|
|
from app.core.config import settings
|
|
from app.core.logging import get_logger
|
|
|
|
# Logger for this module, obtained from the project's logging helper and keyed by the module name.
logger = get_logger(__name__)
|
|
|
|
|
|
async def fetch_text_content_via_download_api(dataset_id: str, file_id: str) -> str:
    """Fetch a dataset file through the backend download endpoint and return it as text.

    The response body is streamed and reading stops as soon as it grows past
    ``settings.editor_max_text_bytes``, so an oversized file is rejected
    without first being buffered in full.

    Args:
        dataset_id: Identifier of the dataset, interpolated into the URL path.
        file_id: Identifier of the file, interpolated into the URL path.

    Returns:
        The file content decoded as UTF-8; bytes that cannot be decoded are
        replaced with the Unicode replacement character.

    Raises:
        HTTPException: 413 when the declared ``Content-Length`` or the actual
            body size exceeds ``settings.editor_max_text_bytes``; 502 when the
            download endpoint returns an error status or the call fails for
            any other reason.
    """
    base = settings.datamate_backend_base_url.rstrip("/")
    url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"

    try:
        max_bytes = settings.editor_max_text_bytes

        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            # Stream instead of client.get(): get() would load the whole body
            # into memory before any size check could run.
            async with client.stream("GET", url) as resp:
                resp.raise_for_status()

                # Cheap early rejection based on the declared size.
                content_length = resp.headers.get("content-length")
                if content_length:
                    try:
                        declared_size = int(content_length)
                    except ValueError:
                        # Malformed Content-Length: ignore it and rely on the
                        # actual byte count below.
                        declared_size = None
                    if declared_size is not None and declared_size > max_bytes:
                        raise HTTPException(
                            status_code=413,
                            detail=f"文本文件过大,限制 {max_bytes} 字节",
                        )

                # Accumulate the body chunk by chunk, bailing out as soon as
                # the limit is crossed.
                buffer = bytearray()
                async for chunk in resp.aiter_bytes():
                    buffer.extend(chunk)
                    if len(buffer) > max_bytes:
                        raise HTTPException(
                            status_code=413,
                            detail=f"文本文件过大,限制 {max_bytes} 字节",
                        )

        # TEXT POC: decode as UTF-8 by default; undecodable bytes are replaced.
        return buffer.decode("utf-8", errors="replace")

    except HTTPException:
        # Our own 413 must reach the caller unchanged rather than become a 502.
        raise
    except httpx.HTTPStatusError as exc:
        logger.error(
            "读取文本失败: dataset=%s, file=%s, http=%s",
            dataset_id,
            file_id,
            exc.response.status_code,
        )
        raise HTTPException(status_code=502, detail="读取文本失败(下载接口返回错误)") from exc
    except Exception as exc:
        logger.error("读取文本失败: dataset=%s, file=%s, err=%s", dataset_id, file_id, exc)
        raise HTTPException(status_code=502, detail="读取文本失败(下载接口调用异常)") from exc
|