You've already forked DataMate
- 将editor_max_text_bytes默认值从2MB改为0,表示不限制 - 更新文本获取服务中的大小检查逻辑,只在max_bytes大于0时进行限制 - 修改错误提示信息中的字节限制显示 - 优化配置参数的条件判断流程
58 lines
2.2 KiB
Python
58 lines
2.2 KiB
Python
from __future__ import annotations
|
|
|
|
import httpx
|
|
from fastapi import HTTPException
|
|
|
|
from app.core.config import settings
|
|
from app.core.logging import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
async def fetch_text_content_via_download_api(dataset_id: str, file_id: str) -> str:
    """Read a dataset file through the backend download API and return it as text.

    The response body is streamed so that the configured size limit
    (``settings.editor_max_text_bytes``) is enforced while downloading,
    instead of after the whole payload has already been buffered in memory.
    A limit of ``0`` (or any non-positive value) disables the size check.

    Args:
        dataset_id: Identifier of the dataset, interpolated into the download URL.
        file_id: Identifier of the file, interpolated into the download URL.

    Returns:
        The file content decoded as UTF-8; undecodable bytes are replaced
        with the Unicode replacement character.

    Raises:
        HTTPException: 413 when the declared or actual body size exceeds the
            configured limit; 502 when the download API answers with an
            error status or the call fails for any other reason.
    """
    base = settings.datamate_backend_base_url.rstrip("/")
    url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"

    try:
        max_bytes = settings.editor_max_text_bytes
        limited = max_bytes > 0

        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            # Stream instead of client.get(): get() buffers the full body
            # before any size check could run, which defeats the limit.
            async with client.stream("GET", url) as resp:
                resp.raise_for_status()

                if limited:
                    content_length = resp.headers.get("content-length")
                    try:
                        declared = int(content_length) if content_length else None
                    except ValueError:
                        # Malformed Content-Length: ignore it and rely on the
                        # running byte count below.
                        declared = None
                    if declared is not None and declared > max_bytes:
                        # Reject before reading any of the body.
                        raise HTTPException(
                            status_code=413,
                            detail=f"文本文件过大,限制 {max_bytes} 字节",
                        )

                buffer = bytearray()
                async for chunk in resp.aiter_bytes():
                    buffer.extend(chunk)
                    # Stop as soon as the limit is crossed; the header may be
                    # absent or may not reflect the real body size.
                    if limited and len(buffer) > max_bytes:
                        raise HTTPException(
                            status_code=413,
                            detail=f"文本文件过大,限制 {max_bytes} 字节",
                        )

        # TEXT POC: decode as UTF-8 by default, substituting undecodable bytes.
        return buffer.decode("utf-8", errors="replace")

    except HTTPException:
        # Already a well-formed API error (e.g. the 413 above); pass it through.
        raise
    except httpx.HTTPStatusError as exc:
        logger.error(
            "读取文本失败: dataset=%s, file=%s, http=%s",
            dataset_id,
            file_id,
            exc.response.status_code,
        )
        raise HTTPException(status_code=502, detail="读取文本失败(下载接口返回错误)") from exc
    except Exception as exc:
        # Service boundary: map any other failure to a 502 for the caller.
        logger.error("读取文本失败: dataset=%s, file=%s, err=%s", dataset_id, file_id, exc)
        raise HTTPException(status_code=502, detail="读取文本失败(下载接口调用异常)") from exc
|