feat(dataset): 添加PDF文本提取功能支持

- 新增dataset模块路由配置
- 添加PdfTextExtractRequest和PdfTextExtractResponse数据传输对象
- 实现PDF文本提取接口，支持从PDF文件中提取文本内容
- 集成数据库会话管理和异步处理能力
This commit is contained in:
2026-01-29 11:44:50 +08:00
parent 5eafcf0145
commit ab957ab53d
7 changed files with 198 additions and 52 deletions

View File

@@ -6,6 +6,7 @@ from .ratio.interface import router as ratio_router
from .generation.interface import router as generation_router
from .evaluation.interface import router as evaluation_router
from .collection.interface import router as collection_route
from .dataset.interface import router as dataset_router
router = APIRouter(
prefix="/api"
@@ -17,5 +18,6 @@ router.include_router(ratio_router)
router.include_router(generation_router)
router.include_router(evaluation_router)
router.include_router(collection_route)
router.include_router(dataset_router)
__all__ = ["router"]

View File

@@ -0,0 +1,22 @@
from fastapi import APIRouter, Depends
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.session import get_db
from app.module.dataset.schema.pdf_extract import PdfTextExtractRequest, PdfTextExtractResponse
from app.module.dataset.service.pdf_extract import PdfTextExtractService
from app.module.shared.schema.common import StandardResponse
# Router collecting the dataset module's endpoints. No path prefix is added
# at this level; every route registered here is tagged "dataset".
router = APIRouter(prefix="", tags=["dataset"])
@router.post("/pdf-text-extract", response_model=StandardResponse[PdfTextExtractResponse])
async def extract_pdf_text(
    request: PdfTextExtractRequest,
    db: AsyncSession = Depends(get_db),
):
    """Handle POST requests to ``/pdf-text-extract``.

    Builds a ``PdfTextExtractService`` on the injected database session and
    awaits its ``extract_pdf_to_text`` call with the ``dataset_id`` and
    ``file_id`` taken from the request body.

    Args:
        request: Request body carrying ``dataset_id`` and ``file_id``.
        db: Async database session supplied by the ``get_db`` dependency.

    Returns:
        A ``StandardResponse`` with ``code=200``, ``message="Success"`` and
        the service result as ``data``.
    """
    extractor = PdfTextExtractService(db)
    extracted = await extractor.extract_pdf_to_text(
        request.dataset_id,
        request.file_id,
    )
    return StandardResponse(code=200, message="Success", data=extracted)