You've already forked DataMate
feat(dataset): 添加PDF文本提取功能支持
- 新增dataset模块路由配置 - 添加PdfTextExtractRequest和PdfTextExtractResponse数据传输对象 - 实现PDF文本提取接口,支持从PDF文件中提取文本内容 - 集成数据库会话管理和异步处理能力
This commit is contained in:
@@ -14,6 +14,7 @@ import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
|||||||
import com.datamate.common.interfaces.PagedResponse;
|
import com.datamate.common.interfaces.PagedResponse;
|
||||||
import com.datamate.common.interfaces.PagingQuery;
|
import com.datamate.common.interfaces.PagingQuery;
|
||||||
import com.datamate.datamanagement.common.enums.DuplicateMethod;
|
import com.datamate.datamanagement.common.enums.DuplicateMethod;
|
||||||
|
import com.datamate.datamanagement.common.enums.DatasetType;
|
||||||
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
||||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||||
@@ -66,10 +67,12 @@ import java.util.stream.Stream;
|
|||||||
@Service
|
@Service
|
||||||
@Transactional
|
@Transactional
|
||||||
public class DatasetFileApplicationService {
|
public class DatasetFileApplicationService {
|
||||||
|
private static final String PDF_FILE_TYPE = "pdf";
|
||||||
|
|
||||||
private final DatasetFileRepository datasetFileRepository;
|
private final DatasetFileRepository datasetFileRepository;
|
||||||
private final DatasetRepository datasetRepository;
|
private final DatasetRepository datasetRepository;
|
||||||
private final FileService fileService;
|
private final FileService fileService;
|
||||||
|
private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
|
||||||
|
|
||||||
@Value("${datamate.data-management.base-path:/dataset}")
|
@Value("${datamate.data-management.base-path:/dataset}")
|
||||||
private String datasetBasePath;
|
private String datasetBasePath;
|
||||||
@@ -79,10 +82,13 @@ public class DatasetFileApplicationService {
|
|||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
|
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
|
||||||
DatasetRepository datasetRepository, FileService fileService) {
|
DatasetRepository datasetRepository,
|
||||||
|
FileService fileService,
|
||||||
|
PdfTextExtractAsyncService pdfTextExtractAsyncService) {
|
||||||
this.datasetFileRepository = datasetFileRepository;
|
this.datasetFileRepository = datasetFileRepository;
|
||||||
this.datasetRepository = datasetRepository;
|
this.datasetRepository = datasetRepository;
|
||||||
this.fileService = fileService;
|
this.fileService = fileService;
|
||||||
|
this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -418,6 +424,7 @@ public class DatasetFileApplicationService {
|
|||||||
setDatasetFileId(datasetFile, dataset);
|
setDatasetFileId(datasetFile, dataset);
|
||||||
datasetFileRepository.saveOrUpdate(datasetFile);
|
datasetFileRepository.saveOrUpdate(datasetFile);
|
||||||
dataset.addFile(datasetFile);
|
dataset.addFile(datasetFile);
|
||||||
|
triggerPdfTextExtraction(dataset, datasetFile);
|
||||||
}
|
}
|
||||||
dataset.active();
|
dataset.active();
|
||||||
datasetRepository.updateById(dataset);
|
datasetRepository.updateById(dataset);
|
||||||
@@ -701,6 +708,11 @@ public class DatasetFileApplicationService {
|
|||||||
try {
|
try {
|
||||||
Files.createDirectories(Path.of(dataset.getPath()));
|
Files.createDirectories(Path.of(dataset.getPath()));
|
||||||
Files.copy(sourceFilePath, targetFilePath);
|
Files.copy(sourceFilePath, targetFilePath);
|
||||||
|
DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
|
||||||
|
dataset.getId(),
|
||||||
|
sourceFilePath.getFileName().toString()
|
||||||
|
);
|
||||||
|
triggerPdfTextExtraction(dataset, datasetFile);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
|
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
|
||||||
}
|
}
|
||||||
@@ -753,6 +765,7 @@ public class DatasetFileApplicationService {
|
|||||||
setDatasetFileId(datasetFile, dataset);
|
setDatasetFileId(datasetFile, dataset);
|
||||||
dataset.addFile(datasetFile);
|
dataset.addFile(datasetFile);
|
||||||
addedFiles.add(datasetFile);
|
addedFiles.add(datasetFile);
|
||||||
|
triggerPdfTextExtraction(dataset, datasetFile);
|
||||||
}
|
}
|
||||||
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
|
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
|
||||||
dataset.active();
|
dataset.active();
|
||||||
@@ -761,4 +774,18 @@ public class DatasetFileApplicationService {
|
|||||||
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
|
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
|
||||||
return addedFiles;
|
return addedFiles;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
|
||||||
|
if (dataset == null || datasetFile == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (dataset.getDatasetType() != DatasetType.TEXT) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
String fileType = datasetFile.getFileType();
|
||||||
|
if (fileType == null || !fileType.equalsIgnoreCase(PDF_FILE_TYPE)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
package com.datamate.datamanagement.application;
|
||||||
|
|
||||||
|
import com.datamate.common.infrastructure.common.Response;
|
||||||
|
import com.datamate.datamanagement.infrastructure.client.PdfTextExtractClient;
|
||||||
|
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
|
||||||
|
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.scheduling.annotation.Async;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class PdfTextExtractAsyncService {
|
||||||
|
private final PdfTextExtractClient pdfTextExtractClient;
|
||||||
|
|
||||||
|
@Async
|
||||||
|
public void extractPdfText(String datasetId, String fileId) {
|
||||||
|
try {
|
||||||
|
Response<PdfTextExtractResponse> response = pdfTextExtractClient.extractPdfText(
|
||||||
|
new PdfTextExtractRequest(datasetId, fileId)
|
||||||
|
);
|
||||||
|
if (response == null) {
|
||||||
|
log.warn("PdfTextExtract returned null response, datasetId={}, fileId={}", datasetId, fileId);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
String code = response.getCode();
|
||||||
|
if (!"0".equals(code) && !"200".equals(code)) {
|
||||||
|
log.warn(
|
||||||
|
"PdfTextExtract failed, datasetId={}, fileId={}, code={}, message={}",
|
||||||
|
datasetId,
|
||||||
|
fileId,
|
||||||
|
code,
|
||||||
|
response.getMessage()
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
PdfTextExtractResponse data = response.getData();
|
||||||
|
if (data != null) {
|
||||||
|
log.info(
|
||||||
|
"PdfTextExtract succeeded, datasetId={}, fileId={}, textFileId={}",
|
||||||
|
datasetId,
|
||||||
|
fileId,
|
||||||
|
data.getTextFileId()
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
log.info("PdfTextExtract succeeded, datasetId={}, fileId={}", datasetId, fileId);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("PdfTextExtract call failed, datasetId={}, fileId={}", datasetId, fileId, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
package com.datamate.datamanagement.infrastructure.client;
|
||||||
|
|
||||||
|
import com.datamate.common.infrastructure.common.Response;
|
||||||
|
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
|
||||||
|
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
|
||||||
|
import org.springframework.cloud.openfeign.FeignClient;
|
||||||
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestBody;
|
||||||
|
|
||||||
|
@FeignClient(name = "pdf-text-extract-service", url = "${pdf.text.extract.service.url:http://datamate-backend-python:18000}")
|
||||||
|
public interface PdfTextExtractClient {
|
||||||
|
|
||||||
|
@PostMapping("/api/dataset/pdf-text-extract")
|
||||||
|
Response<PdfTextExtractResponse> extractPdfText(@RequestBody PdfTextExtractRequest request);
|
||||||
|
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
package com.datamate.datamanagement.infrastructure.client.dto;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class PdfTextExtractRequest {
|
||||||
|
private String datasetId;
|
||||||
|
private String fileId;
|
||||||
|
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
package com.datamate.datamanagement.infrastructure.client.dto;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class PdfTextExtractResponse {
|
||||||
|
private String datasetId;
|
||||||
|
private String sourceFileId;
|
||||||
|
private String textFileId;
|
||||||
|
private String textFileName;
|
||||||
|
private String textFilePath;
|
||||||
|
private Long textFileSize;
|
||||||
|
}
|
||||||
@@ -6,6 +6,7 @@ from .ratio.interface import router as ratio_router
|
|||||||
from .generation.interface import router as generation_router
|
from .generation.interface import router as generation_router
|
||||||
from .evaluation.interface import router as evaluation_router
|
from .evaluation.interface import router as evaluation_router
|
||||||
from .collection.interface import router as collection_route
|
from .collection.interface import router as collection_route
|
||||||
|
from .dataset.interface import router as dataset_router
|
||||||
|
|
||||||
router = APIRouter(
|
router = APIRouter(
|
||||||
prefix="/api"
|
prefix="/api"
|
||||||
@@ -17,5 +18,6 @@ router.include_router(ratio_router)
|
|||||||
router.include_router(generation_router)
|
router.include_router(generation_router)
|
||||||
router.include_router(evaluation_router)
|
router.include_router(evaluation_router)
|
||||||
router.include_router(collection_route)
|
router.include_router(collection_route)
|
||||||
|
router.include_router(dataset_router)
|
||||||
|
|
||||||
__all__ = ["router"]
|
__all__ = ["router"]
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.db.session import get_db
|
||||||
|
from app.module.dataset.schema.pdf_extract import PdfTextExtractRequest, PdfTextExtractResponse
|
||||||
|
from app.module.dataset.service.pdf_extract import PdfTextExtractService
|
||||||
|
from app.module.shared.schema.common import StandardResponse
|
||||||
|
|
||||||
|
# Router that groups the dataset endpoints under the "dataset" tag.
# NOTE(review): the prefix is empty here, while the Java Feign client posts to
# /api/dataset/pdf-text-extract. Confirm that a "/dataset" prefix is applied
# wherever this router is mounted; otherwise the paths will not match.
router = APIRouter(prefix="", tags=["dataset"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/pdf-text-extract", response_model=StandardResponse[PdfTextExtractResponse])
async def extract_pdf_text(
    request: PdfTextExtractRequest,
    db: AsyncSession = Depends(get_db),
):
    """Extract text from a dataset PDF file and return the result in a standard envelope.

    Args:
        request: Body carrying the dataset id and file id of the PDF to process.
        db: Async database session injected through ``get_db``.

    Returns:
        A ``StandardResponse`` with code 200, message "Success" and the value
        returned by ``PdfTextExtractService.extract_pdf_to_text`` as data.
    """
    extracted = await PdfTextExtractService(db).extract_pdf_to_text(
        request.dataset_id,
        request.file_id,
    )
    return StandardResponse(code=200, message="Success", data=extracted)
|
||||||
Reference in New Issue
Block a user