You've already forked DataMate
feat(dataset): 添加PDF文本提取功能支持
- 新增 dataset 模块路由配置
- 添加 PdfTextExtractRequest 和 PdfTextExtractResponse 数据传输对象
- 实现 PDF 文本提取接口，支持从 PDF 文件中提取文本内容
- 集成数据库会话管理和异步处理能力
This commit is contained in:
@@ -14,6 +14,7 @@ import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
||||
import com.datamate.common.interfaces.PagedResponse;
|
||||
import com.datamate.common.interfaces.PagingQuery;
|
||||
import com.datamate.datamanagement.common.enums.DuplicateMethod;
|
||||
import com.datamate.datamanagement.common.enums.DatasetType;
|
||||
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||
@@ -66,10 +67,12 @@ import java.util.stream.Stream;
|
||||
@Service
|
||||
@Transactional
|
||||
public class DatasetFileApplicationService {
|
||||
private static final String PDF_FILE_TYPE = "pdf";
|
||||
|
||||
private final DatasetFileRepository datasetFileRepository;
|
||||
private final DatasetRepository datasetRepository;
|
||||
private final FileService fileService;
|
||||
private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
|
||||
|
||||
@Value("${datamate.data-management.base-path:/dataset}")
|
||||
private String datasetBasePath;
|
||||
@@ -79,10 +82,13 @@ public class DatasetFileApplicationService {
|
||||
|
||||
/**
 * Creates the service and stores each collaborator in the field of the same name.
 *
 * <p>The pre-change parameter line ({@code DatasetRepository datasetRepository, FileService
 * fileService) {}) that the diff rendering left next to its replacement has been dropped; keeping
 * both declared {@code datasetRepository} twice and closed the parameter list early.
 *
 * @param datasetFileRepository repository used for dataset file records
 * @param datasetRepository repository used for dataset records
 * @param fileService file service collaborator
 * @param pdfTextExtractAsyncService asynchronous PDF text extraction trigger
 */
@Autowired
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
                                     DatasetRepository datasetRepository,
                                     FileService fileService,
                                     PdfTextExtractAsyncService pdfTextExtractAsyncService) {
    this.datasetFileRepository = datasetFileRepository;
    this.datasetRepository = datasetRepository;
    this.fileService = fileService;
    this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
}
|
||||
|
||||
/**
|
||||
@@ -418,6 +424,7 @@ public class DatasetFileApplicationService {
|
||||
setDatasetFileId(datasetFile, dataset);
|
||||
datasetFileRepository.saveOrUpdate(datasetFile);
|
||||
dataset.addFile(datasetFile);
|
||||
triggerPdfTextExtraction(dataset, datasetFile);
|
||||
}
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
@@ -701,6 +708,11 @@ public class DatasetFileApplicationService {
|
||||
try {
|
||||
Files.createDirectories(Path.of(dataset.getPath()));
|
||||
Files.copy(sourceFilePath, targetFilePath);
|
||||
DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
|
||||
dataset.getId(),
|
||||
sourceFilePath.getFileName().toString()
|
||||
);
|
||||
triggerPdfTextExtraction(dataset, datasetFile);
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
|
||||
}
|
||||
@@ -753,6 +765,7 @@ public class DatasetFileApplicationService {
|
||||
setDatasetFileId(datasetFile, dataset);
|
||||
dataset.addFile(datasetFile);
|
||||
addedFiles.add(datasetFile);
|
||||
triggerPdfTextExtraction(dataset, datasetFile);
|
||||
}
|
||||
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
|
||||
dataset.active();
|
||||
@@ -761,4 +774,18 @@ public class DatasetFileApplicationService {
|
||||
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
|
||||
return addedFiles;
|
||||
}
|
||||
|
||||
private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
|
||||
if (dataset == null || datasetFile == null) {
|
||||
return;
|
||||
}
|
||||
if (dataset.getDatasetType() != DatasetType.TEXT) {
|
||||
return;
|
||||
}
|
||||
String fileType = datasetFile.getFileType();
|
||||
if (fileType == null || !fileType.equalsIgnoreCase(PDF_FILE_TYPE)) {
|
||||
return;
|
||||
}
|
||||
pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
package com.datamate.datamanagement.application;
|
||||
|
||||
import com.datamate.common.infrastructure.common.Response;
|
||||
import com.datamate.datamanagement.infrastructure.client.PdfTextExtractClient;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class PdfTextExtractAsyncService {
|
||||
private final PdfTextExtractClient pdfTextExtractClient;
|
||||
|
||||
@Async
|
||||
public void extractPdfText(String datasetId, String fileId) {
|
||||
try {
|
||||
Response<PdfTextExtractResponse> response = pdfTextExtractClient.extractPdfText(
|
||||
new PdfTextExtractRequest(datasetId, fileId)
|
||||
);
|
||||
if (response == null) {
|
||||
log.warn("PdfTextExtract returned null response, datasetId={}, fileId={}", datasetId, fileId);
|
||||
return;
|
||||
}
|
||||
String code = response.getCode();
|
||||
if (!"0".equals(code) && !"200".equals(code)) {
|
||||
log.warn(
|
||||
"PdfTextExtract failed, datasetId={}, fileId={}, code={}, message={}",
|
||||
datasetId,
|
||||
fileId,
|
||||
code,
|
||||
response.getMessage()
|
||||
);
|
||||
return;
|
||||
}
|
||||
PdfTextExtractResponse data = response.getData();
|
||||
if (data != null) {
|
||||
log.info(
|
||||
"PdfTextExtract succeeded, datasetId={}, fileId={}, textFileId={}",
|
||||
datasetId,
|
||||
fileId,
|
||||
data.getTextFileId()
|
||||
);
|
||||
} else {
|
||||
log.info("PdfTextExtract succeeded, datasetId={}, fileId={}", datasetId, fileId);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("PdfTextExtract call failed, datasetId={}, fileId={}", datasetId, fileId, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
package com.datamate.datamanagement.infrastructure.client;
|
||||
|
||||
import com.datamate.common.infrastructure.common.Response;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
@FeignClient(name = "pdf-text-extract-service", url = "${pdf.text.extract.service.url:http://datamate-backend-python:18000}")
|
||||
public interface PdfTextExtractClient {
|
||||
|
||||
@PostMapping("/api/dataset/pdf-text-extract")
|
||||
Response<PdfTextExtractResponse> extractPdfText(@RequestBody PdfTextExtractRequest request);
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.datamate.datamanagement.infrastructure.client.dto;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class PdfTextExtractRequest {
|
||||
private String datasetId;
|
||||
private String fileId;
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.datamate.datamanagement.infrastructure.client.dto;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class PdfTextExtractResponse {
|
||||
private String datasetId;
|
||||
private String sourceFileId;
|
||||
private String textFileId;
|
||||
private String textFileName;
|
||||
private String textFilePath;
|
||||
private Long textFileSize;
|
||||
}
|
||||
@@ -6,6 +6,7 @@ from .ratio.interface import router as ratio_router
|
||||
from .generation.interface import router as generation_router
|
||||
from .evaluation.interface import router as evaluation_router
|
||||
from .collection.interface import router as collection_route
|
||||
from .dataset.interface import router as dataset_router
|
||||
|
||||
router = APIRouter(
|
||||
prefix="/api"
|
||||
@@ -17,5 +18,6 @@ router.include_router(ratio_router)
|
||||
router.include_router(generation_router)
|
||||
router.include_router(evaluation_router)
|
||||
router.include_router(collection_route)
|
||||
router.include_router(dataset_router)
|
||||
|
||||
__all__ = ["router"]
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db.session import get_db
|
||||
from app.module.dataset.schema.pdf_extract import PdfTextExtractRequest, PdfTextExtractResponse
|
||||
from app.module.dataset.service.pdf_extract import PdfTextExtractService
|
||||
from app.module.shared.schema.common import StandardResponse
|
||||
|
||||
# Router for the dataset endpoints: no path prefix of its own, grouped under the "dataset" tag.
router = APIRouter(prefix="", tags=["dataset"])
|
||||
|
||||
|
||||
@router.post("/pdf-text-extract", response_model=StandardResponse[PdfTextExtractResponse])
async def extract_pdf_text(
    request: PdfTextExtractRequest,
    db: AsyncSession = Depends(get_db),
):
    """Extract text from a PDF file and wrap the result in a standard response.

    Args:
        request: Request body; its ``dataset_id`` and ``file_id`` are passed to the service.
        db: Async database session supplied by the ``get_db`` dependency.

    Returns:
        A ``StandardResponse`` with ``code=200``, message ``"Success"`` and the
        service result as ``data``.
    """
    # NOTE(review): with this router's empty prefix the route path is "/pdf-text-extract"
    # below whatever prefix the parent router adds; confirm callers use the same path.
    extracted = await PdfTextExtractService(db).extract_pdf_to_text(
        request.dataset_id,
        request.file_id,
    )
    return StandardResponse(code=200, message="Success", data=extracted)
|
||||
Reference in New Issue
Block a user