feat(dataset): 添加PDF文本提取功能支持

- 新增dataset模块路由配置
- 添加PdfTextExtractRequest和PdfTextExtractResponse数据传输对象
- 实现PDF文本提取接口,支持从PDF文件中提取文本内容
- 集成数据库会话管理和异步处理能力
This commit is contained in:
2026-01-29 11:44:50 +08:00
parent 5eafcf0145
commit ab957ab53d
7 changed files with 198 additions and 52 deletions

View File

@@ -13,11 +13,12 @@ import com.datamate.common.infrastructure.exception.CommonErrorCode;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.common.interfaces.PagingQuery;
import com.datamate.datamanagement.common.enums.DuplicateMethod;
import com.datamate.datamanagement.domain.contants.DatasetConstant;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
import com.datamate.datamanagement.common.enums.DuplicateMethod;
import com.datamate.datamanagement.common.enums.DatasetType;
import com.datamate.datamanagement.domain.contants.DatasetConstant;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
@@ -65,11 +66,13 @@ import java.util.stream.Stream;
@Slf4j
@Service
@Transactional
public class DatasetFileApplicationService {
private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository;
private final FileService fileService;
public class DatasetFileApplicationService {
private static final String PDF_FILE_TYPE = "pdf";
private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository;
private final FileService fileService;
private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
@Value("${datamate.data-management.base-path:/dataset}")
private String datasetBasePath;
@@ -77,13 +80,16 @@ public class DatasetFileApplicationService {
@Value("${datamate.data-management.file.duplicate:COVER}")
private DuplicateMethod duplicateMethod;
@Autowired
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
DatasetRepository datasetRepository, FileService fileService) {
this.datasetFileRepository = datasetFileRepository;
this.datasetRepository = datasetRepository;
this.fileService = fileService;
}
/**
 * Constructor injection of the repositories, file service, and the async
 * PDF-text-extraction service added by this change.
 *
 * @param datasetFileRepository    persistence for individual dataset files
 * @param datasetRepository        persistence for datasets
 * @param fileService              file-system level operations
 * @param pdfTextExtractAsyncService fire-and-forget PDF text extraction trigger
 */
@Autowired
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
DatasetRepository datasetRepository,
FileService fileService,
PdfTextExtractAsyncService pdfTextExtractAsyncService) {
this.datasetFileRepository = datasetFileRepository;
this.datasetRepository = datasetRepository;
this.fileService = fileService;
this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
}
/**
* 获取数据集文件列表
@@ -405,23 +411,24 @@ public class DatasetFileApplicationService {
for (FileUploadResult file : unpacked) {
File savedFile = file.getSavedFile();
LocalDateTime currentTime = LocalDateTime.now();
DatasetFile datasetFile = DatasetFile.builder()
.id(UUID.randomUUID().toString())
.datasetId(datasetId)
.fileSize(savedFile.length())
DatasetFile datasetFile = DatasetFile.builder()
.id(UUID.randomUUID().toString())
.datasetId(datasetId)
.fileSize(savedFile.length())
.uploadTime(currentTime)
.lastAccessTime(currentTime)
.fileName(file.getFileName())
.filePath(savedFile.getPath())
.fileType(AnalyzerUtils.getExtension(file.getFileName()))
.build();
setDatasetFileId(datasetFile, dataset);
datasetFileRepository.saveOrUpdate(datasetFile);
dataset.addFile(datasetFile);
}
dataset.active();
datasetRepository.updateById(dataset);
}
setDatasetFileId(datasetFile, dataset);
datasetFileRepository.saveOrUpdate(datasetFile);
dataset.addFile(datasetFile);
triggerPdfTextExtraction(dataset, datasetFile);
}
dataset.active();
datasetRepository.updateById(dataset);
}
/**
* 在数据集下创建子目录
@@ -687,24 +694,29 @@ public class DatasetFileApplicationService {
dataset.addFile(datasetFile);
copiedFiles.add(datasetFile);
}
datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
dataset.active();
datasetRepository.updateById(dataset);
CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
return copiedFiles;
datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
dataset.active();
datasetRepository.updateById(dataset);
CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
return copiedFiles;
}
private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
for (String sourcePath : sourcePaths) {
Path sourceFilePath = Paths.get(sourcePath);
Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
try {
Files.createDirectories(Path.of(dataset.getPath()));
Files.copy(sourceFilePath, targetFilePath);
} catch (IOException e) {
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
}
}
try {
Files.createDirectories(Path.of(dataset.getPath()));
Files.copy(sourceFilePath, targetFilePath);
DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
dataset.getId(),
sourceFilePath.getFileName().toString()
);
triggerPdfTextExtraction(dataset, datasetFile);
} catch (IOException e) {
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
}
}
}
/**
@@ -750,15 +762,30 @@ public class DatasetFileApplicationService {
.lastAccessTime(currentTime)
.metadata(metadata)
.build();
setDatasetFileId(datasetFile, dataset);
dataset.addFile(datasetFile);
addedFiles.add(datasetFile);
}
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
dataset.active();
datasetRepository.updateById(dataset);
// Note: addFilesToDataset only creates DB records, no file system operations
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
return addedFiles;
}
}
setDatasetFileId(datasetFile, dataset);
dataset.addFile(datasetFile);
addedFiles.add(datasetFile);
triggerPdfTextExtraction(dataset, datasetFile);
}
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
dataset.active();
datasetRepository.updateById(dataset);
// Note: addFilesToDataset only creates DB records, no file system operations
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
return addedFiles;
}
/**
 * Kicks off asynchronous PDF text extraction for a newly registered dataset file.
 * <p>
 * Extraction is only triggered when both arguments are non-null, the dataset is of
 * type {@code TEXT}, and the file's type is "pdf" (case-insensitive); every other
 * combination is silently ignored.
 *
 * @param dataset     the owning dataset; may be {@code null} (no-op)
 * @param datasetFile the file candidate for extraction; may be {@code null} (no-op)
 */
private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
    // Single null-safe eligibility check: PDF_FILE_TYPE.equalsIgnoreCase(null) is false,
    // so the explicit fileType null guard from before is folded in.
    boolean eligible = dataset != null
            && datasetFile != null
            && dataset.getDatasetType() == DatasetType.TEXT
            && PDF_FILE_TYPE.equalsIgnoreCase(datasetFile.getFileType());
    if (!eligible) {
        return;
    }
    pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
}
}

View File

@@ -0,0 +1,54 @@
package com.datamate.datamanagement.application;
import com.datamate.common.infrastructure.common.Response;
import com.datamate.datamanagement.infrastructure.client.PdfTextExtractClient;
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
/**
 * Fire-and-forget bridge to the remote PDF text extraction service.
 * <p>
 * All failures — null responses, non-success codes, and thrown exceptions — are
 * logged and swallowed so that callers are never affected by extraction problems.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class PdfTextExtractAsyncService {
    private final PdfTextExtractClient pdfTextExtractClient;

    /**
     * Invokes the remote extraction endpoint off the caller's thread.
     *
     * @param datasetId id of the dataset that owns the PDF file
     * @param fileId    id of the PDF file to extract text from
     */
    @Async
    public void extractPdfText(String datasetId, String fileId) {
        try {
            PdfTextExtractRequest request = new PdfTextExtractRequest(datasetId, fileId);
            logOutcome(datasetId, fileId, pdfTextExtractClient.extractPdfText(request));
        } catch (Exception e) {
            log.error("PdfTextExtract call failed, datasetId={}, fileId={}", datasetId, fileId, e);
        }
    }

    // Logs the result of a completed call. Codes "0" and "200" are both treated as
    // success — NOTE(review): assumed from the original check; confirm against the
    // Response contract.
    private void logOutcome(String datasetId, String fileId, Response<PdfTextExtractResponse> response) {
        if (response == null) {
            log.warn("PdfTextExtract returned null response, datasetId={}, fileId={}", datasetId, fileId);
            return;
        }
        String code = response.getCode();
        if (!"0".equals(code) && !"200".equals(code)) {
            log.warn(
                "PdfTextExtract failed, datasetId={}, fileId={}, code={}, message={}",
                datasetId, fileId, code, response.getMessage());
            return;
        }
        PdfTextExtractResponse data = response.getData();
        if (data == null) {
            log.info("PdfTextExtract succeeded, datasetId={}, fileId={}", datasetId, fileId);
        } else {
            log.info(
                "PdfTextExtract succeeded, datasetId={}, fileId={}, textFileId={}",
                datasetId, fileId, data.getTextFileId());
        }
    }
}

View File

@@ -0,0 +1,15 @@
package com.datamate.datamanagement.infrastructure.client;
import com.datamate.common.infrastructure.common.Response;
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
/**
 * Feign client for the PDF text extraction endpoint exposed by the Python backend.
 * The target base URL is configurable via {@code pdf.text.extract.service.url}
 * and defaults to {@code http://datamate-backend-python:18000}.
 */
@FeignClient(name = "pdf-text-extract-service", url = "${pdf.text.extract.service.url:http://datamate-backend-python:18000}")
public interface PdfTextExtractClient {
/**
 * Requests text extraction for a PDF file identified by the request's
 * dataset id and file id.
 *
 * @param request dataset/file identifiers of the PDF to extract
 * @return wrapped extraction result; see {@link PdfTextExtractResponse}
 */
@PostMapping("/api/dataset/pdf-text-extract")
Response<PdfTextExtractResponse> extractPdfText(@RequestBody PdfTextExtractRequest request);
}

View File

@@ -0,0 +1,13 @@
package com.datamate.datamanagement.infrastructure.client.dto;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request body for the PDF text extraction endpoint: identifies the PDF
 * by its owning dataset and its file id.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class PdfTextExtractRequest {
// id of the dataset that owns the PDF file
private String datasetId;
// id of the PDF file to extract text from
private String fileId;
}

View File

@@ -0,0 +1,13 @@
package com.datamate.datamanagement.infrastructure.client.dto;
import lombok.Data;
/**
 * Result payload returned by the PDF text extraction endpoint, describing
 * the text file produced from the source PDF.
 */
@Data
public class PdfTextExtractResponse {
// dataset that owns both the source PDF and the produced text file
private String datasetId;
// id of the original PDF file
private String sourceFileId;
// id of the generated text file
private String textFileId;
// name of the generated text file
private String textFileName;
// storage path of the generated text file
private String textFilePath;
// size of the generated text file in bytes — presumably; confirm with producer
private Long textFileSize;
}

View File

@@ -6,6 +6,7 @@ from .ratio.interface import router as ratio_router
from .generation.interface import router as generation_router
from .evaluation.interface import router as evaluation_router
from .collection.interface import router as collection_route
from .dataset.interface import router as dataset_router
router = APIRouter(
prefix="/api"
@@ -17,5 +18,6 @@ router.include_router(ratio_router)
router.include_router(generation_router)
router.include_router(evaluation_router)
router.include_router(collection_route)
router.include_router(dataset_router)
__all__ = ["router"]

View File

@@ -0,0 +1,22 @@
from fastapi import APIRouter, Depends
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.session import get_db
from app.module.dataset.schema.pdf_extract import PdfTextExtractRequest, PdfTextExtractResponse
from app.module.dataset.service.pdf_extract import PdfTextExtractService
from app.module.shared.schema.common import StandardResponse
router = APIRouter(
prefix="",
tags=["dataset"],
)
@router.post("/pdf-text-extract", response_model=StandardResponse[PdfTextExtractResponse])
async def extract_pdf_text(
    request: PdfTextExtractRequest,
    db: AsyncSession = Depends(get_db),
):
    """Extract text content from a PDF file belonging to a dataset.

    Delegates to PdfTextExtractService.extract_pdf_to_text with the dataset id
    and file id from the request body, and wraps the result in a
    StandardResponse with code 200.
    """
    service = PdfTextExtractService(db)
    result = await service.extract_pdf_to_text(request.dataset_id, request.file_id)
    return StandardResponse(code=200, message="Success", data=result)