# DataMate/runtime/datamate-python/app/module/annotation/service/text_fetcher.py

from __future__ import annotations

import httpx
from fastapi import HTTPException

from app.core.config import settings
from app.core.logging import get_logger

logger = get_logger(__name__)


async def fetch_text_content_via_download_api(dataset_id: str, file_id: str) -> str:
    """Read a dataset file through the backend download API and return it as text.

    The response body is streamed and the transfer is aborted as soon as it
    exceeds ``settings.editor_max_text_bytes``, so an oversized file is never
    buffered in full.

    Args:
        dataset_id: Dataset identifier interpolated into the download URL.
        file_id: File identifier interpolated into the download URL.

    Returns:
        The downloaded bytes decoded as UTF-8; undecodable bytes are replaced
        with the Unicode replacement character.

    Raises:
        HTTPException: 413 when the declared or actual body size exceeds
            ``settings.editor_max_text_bytes``; 502 when the download API
            answers with an error status or the request fails for any other
            reason.
    """
    base = settings.datamate_backend_base_url.rstrip("/")
    url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"
    max_bytes = settings.editor_max_text_bytes

    def too_large() -> HTTPException:
        # Single place that builds the "payload too large" error.
        return HTTPException(
            status_code=413,
            detail=f"文本文件过大，限制 {max_bytes} 字节",
        )

    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            # Stream instead of client.get(): get() would download the whole
            # body before any size check could run.
            async with client.stream("GET", url) as resp:
                resp.raise_for_status()

                # Fast path: reject on the declared size before reading anything.
                declared = resp.headers.get("content-length")
                if declared:
                    try:
                        declared_size = int(declared)
                    except ValueError:
                        # Malformed content-length: ignore it and rely on the
                        # actual byte count below.
                        declared_size = None
                    if declared_size is not None and declared_size > max_bytes:
                        raise too_large()

                # The header may be absent or wrong, so enforce the limit on
                # the bytes actually received and stop reading once exceeded.
                buffer = bytearray()
                async for chunk in resp.aiter_bytes():
                    buffer.extend(chunk)
                    if len(buffer) > max_bytes:
                        raise too_large()

            # TEXT POC: decode as UTF-8 by default; undecodable bytes are
            # substituted with the replacement character.
            return buffer.decode("utf-8", errors="replace")

    except HTTPException:
        # Our own 413 must pass through untouched rather than become a 502.
        raise
    except httpx.HTTPStatusError as exc:
        logger.error(
            "读取文本失败: dataset=%s, file=%s, http=%s",
            dataset_id,
            file_id,
            exc.response.status_code,
        )
        raise HTTPException(status_code=502, detail="读取文本失败（下载接口返回错误）") from exc
    except Exception as exc:
        # Boundary handler: any transport/timeout/unknown failure maps to 502.
        logger.error("读取文本失败: dataset=%s, file=%s, err=%s", dataset_id, file_id, exc)
        raise HTTPException(status_code=502, detail="读取文本失败（下载接口调用异常）") from exc