From ab957ab53d84e2296d48e38dda1b81df9d012ca2 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Thu, 29 Jan 2026 11:44:50 +0800 Subject: [PATCH] =?UTF-8?q?feat(dataset):=20=E6=B7=BB=E5=8A=A0PDF=E6=96=87?= =?UTF-8?q?=E6=9C=AC=E6=8F=90=E5=8F=96=E5=8A=9F=E8=83=BD=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增dataset模块路由配置 - 添加PdfTextExtractRequest和PdfTextExtractResponse数据传输对象 - 实现PDF文本提取接口,支持从PDF文件中提取文本内容 - 集成数据库会话管理和异步处理能力 --- .../DatasetFileApplicationService.java | 131 +++++++++++------- .../PdfTextExtractAsyncService.java | 54 ++++++++ .../client/PdfTextExtractClient.java | 15 ++ .../client/dto/PdfTextExtractRequest.java | 13 ++ .../client/dto/PdfTextExtractResponse.java | 13 ++ .../datamate-python/app/module/__init__.py | 2 + .../module/dataset/interface/pdf_extract.py | 22 +++ 7 files changed, 198 insertions(+), 52 deletions(-) create mode 100644 backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/PdfTextExtractAsyncService.java create mode 100644 backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/PdfTextExtractClient.java create mode 100644 backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractRequest.java create mode 100644 backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractResponse.java create mode 100644 runtime/datamate-python/app/module/dataset/interface/pdf_extract.py diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java index c4da21e..03dc0cd 100644 --- 
a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java @@ -13,11 +13,12 @@ import com.datamate.common.infrastructure.exception.CommonErrorCode; import com.datamate.common.infrastructure.exception.SystemErrorCode; import com.datamate.common.interfaces.PagedResponse; import com.datamate.common.interfaces.PagingQuery; -import com.datamate.datamanagement.common.enums.DuplicateMethod; -import com.datamate.datamanagement.domain.contants.DatasetConstant; -import com.datamate.datamanagement.domain.model.dataset.Dataset; -import com.datamate.datamanagement.domain.model.dataset.DatasetFile; -import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo; +import com.datamate.datamanagement.common.enums.DuplicateMethod; +import com.datamate.datamanagement.common.enums.DatasetType; +import com.datamate.datamanagement.domain.contants.DatasetConstant; +import com.datamate.datamanagement.domain.model.dataset.Dataset; +import com.datamate.datamanagement.domain.model.dataset.DatasetFile; +import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo; import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository; @@ -65,11 +66,13 @@ import java.util.stream.Stream; @Slf4j @Service @Transactional -public class DatasetFileApplicationService { - - private final DatasetFileRepository datasetFileRepository; - private final DatasetRepository datasetRepository; - private final FileService fileService; +public class DatasetFileApplicationService { + private static final String PDF_FILE_TYPE = "pdf"; + + private final 
DatasetFileRepository datasetFileRepository; + private final DatasetRepository datasetRepository; + private final FileService fileService; + private final PdfTextExtractAsyncService pdfTextExtractAsyncService; @Value("${datamate.data-management.base-path:/dataset}") private String datasetBasePath; @@ -77,13 +80,16 @@ public class DatasetFileApplicationService { @Value("${datamate.data-management.file.duplicate:COVER}") private DuplicateMethod duplicateMethod; - @Autowired - public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository, - DatasetRepository datasetRepository, FileService fileService) { - this.datasetFileRepository = datasetFileRepository; - this.datasetRepository = datasetRepository; - this.fileService = fileService; - } + @Autowired + public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository, + DatasetRepository datasetRepository, + FileService fileService, + PdfTextExtractAsyncService pdfTextExtractAsyncService) { + this.datasetFileRepository = datasetFileRepository; + this.datasetRepository = datasetRepository; + this.fileService = fileService; + this.pdfTextExtractAsyncService = pdfTextExtractAsyncService; + } /** * 获取数据集文件列表 @@ -405,23 +411,24 @@ public class DatasetFileApplicationService { for (FileUploadResult file : unpacked) { File savedFile = file.getSavedFile(); LocalDateTime currentTime = LocalDateTime.now(); - DatasetFile datasetFile = DatasetFile.builder() - .id(UUID.randomUUID().toString()) - .datasetId(datasetId) - .fileSize(savedFile.length()) + DatasetFile datasetFile = DatasetFile.builder() + .id(UUID.randomUUID().toString()) + .datasetId(datasetId) + .fileSize(savedFile.length()) .uploadTime(currentTime) .lastAccessTime(currentTime) .fileName(file.getFileName()) .filePath(savedFile.getPath()) .fileType(AnalyzerUtils.getExtension(file.getFileName())) .build(); - setDatasetFileId(datasetFile, dataset); - datasetFileRepository.saveOrUpdate(datasetFile); - dataset.addFile(datasetFile); - } - 
dataset.active(); - datasetRepository.updateById(dataset); - } + setDatasetFileId(datasetFile, dataset); + datasetFileRepository.saveOrUpdate(datasetFile); + dataset.addFile(datasetFile); + triggerPdfTextExtraction(dataset, datasetFile); + } + dataset.active(); + datasetRepository.updateById(dataset); + } /** * 在数据集下创建子目录 @@ -687,24 +694,29 @@ public class DatasetFileApplicationService { dataset.addFile(datasetFile); copiedFiles.add(datasetFile); } - datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100); - dataset.active(); - datasetRepository.updateById(dataset); - CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset)); - return copiedFiles; + datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100); + dataset.active(); + datasetRepository.updateById(dataset); + CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset)); + return copiedFiles; } private void copyFilesToDatasetDir(List sourcePaths, Dataset dataset) { for (String sourcePath : sourcePaths) { Path sourceFilePath = Paths.get(sourcePath); Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString()); - try { - Files.createDirectories(Path.of(dataset.getPath())); - Files.copy(sourceFilePath, targetFilePath); - } catch (IOException e) { - log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e); - } - } + try { + Files.createDirectories(Path.of(dataset.getPath())); + Files.copy(sourceFilePath, targetFilePath); + DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName( + dataset.getId(), + sourceFilePath.getFileName().toString() + ); + triggerPdfTextExtraction(dataset, datasetFile); + } catch (IOException e) { + log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e); + } + } } /** @@ -750,15 +762,30 @@ public class DatasetFileApplicationService { .lastAccessTime(currentTime) .metadata(metadata) .build(); - setDatasetFileId(datasetFile, dataset); - 
dataset.addFile(datasetFile); - addedFiles.add(datasetFile); - } - datasetFileRepository.saveOrUpdateBatch(addedFiles, 100); - dataset.active(); - datasetRepository.updateById(dataset); - // Note: addFilesToDataset only creates DB records, no file system operations - // If file copy is needed, use copyFilesToDatasetDir endpoint instead - return addedFiles; - } -} + setDatasetFileId(datasetFile, dataset); + dataset.addFile(datasetFile); + addedFiles.add(datasetFile); + triggerPdfTextExtraction(dataset, datasetFile); + } + datasetFileRepository.saveOrUpdateBatch(addedFiles, 100); + dataset.active(); + datasetRepository.updateById(dataset); + // Note: addFilesToDataset only creates DB records, no file system operations + // If file copy is needed, use copyFilesToDatasetDir endpoint instead + return addedFiles; + } + + private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) { + if (dataset == null || datasetFile == null) { + return; + } + if (dataset.getDatasetType() != DatasetType.TEXT) { + return; + } + String fileType = datasetFile.getFileType(); + if (fileType == null || !fileType.equalsIgnoreCase(PDF_FILE_TYPE)) { + return; + } + pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId()); + } +} diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/PdfTextExtractAsyncService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/PdfTextExtractAsyncService.java new file mode 100644 index 0000000..69bf2c6 --- /dev/null +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/PdfTextExtractAsyncService.java @@ -0,0 +1,54 @@ +package com.datamate.datamanagement.application; + +import com.datamate.common.infrastructure.common.Response; +import com.datamate.datamanagement.infrastructure.client.PdfTextExtractClient; +import 
com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest; +import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +@Slf4j +public class PdfTextExtractAsyncService { + private final PdfTextExtractClient pdfTextExtractClient; + + @Async + public void extractPdfText(String datasetId, String fileId) { + try { + Response<PdfTextExtractResponse> response = pdfTextExtractClient.extractPdfText( + new PdfTextExtractRequest(datasetId, fileId) + ); + if (response == null) { + log.warn("PdfTextExtract returned null response, datasetId={}, fileId={}", datasetId, fileId); + return; + } + String code = response.getCode(); + if (!"0".equals(code) && !"200".equals(code)) { + log.warn( + "PdfTextExtract failed, datasetId={}, fileId={}, code={}, message={}", + datasetId, + fileId, + code, + response.getMessage() + ); + return; + } + PdfTextExtractResponse data = response.getData(); + if (data != null) { + log.info( + "PdfTextExtract succeeded, datasetId={}, fileId={}, textFileId={}", + datasetId, + fileId, + data.getTextFileId() + ); + } else { + log.info("PdfTextExtract succeeded, datasetId={}, fileId={}", datasetId, fileId); + } + } catch (Exception e) { + log.error("PdfTextExtract call failed, datasetId={}, fileId={}", datasetId, fileId, e); + } + } +} diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/PdfTextExtractClient.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/PdfTextExtractClient.java new file mode 100644 index 0000000..370fd0f --- /dev/null +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/PdfTextExtractClient.java @@ -0,0 +1,15 @@ +package 
com.datamate.datamanagement.infrastructure.client; + +import com.datamate.common.infrastructure.common.Response; +import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest; +import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse; +import org.springframework.cloud.openfeign.FeignClient; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; + +@FeignClient(name = "pdf-text-extract-service", url = "${pdf.text.extract.service.url:http://datamate-backend-python:18000}") +public interface PdfTextExtractClient { + + @PostMapping("/api/dataset/pdf-text-extract") + Response<PdfTextExtractResponse> extractPdfText(@RequestBody PdfTextExtractRequest request); +} diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractRequest.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractRequest.java new file mode 100644 index 0000000..f5ca359 --- /dev/null +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractRequest.java @@ -0,0 +1,13 @@ +package com.datamate.datamanagement.infrastructure.client.dto; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class PdfTextExtractRequest { + private String datasetId; + private String fileId; +} diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractResponse.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractResponse.java new file mode 100644 index 0000000..ee07291 --- /dev/null +++ 
b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/client/dto/PdfTextExtractResponse.java @@ -0,0 +1,13 @@ +package com.datamate.datamanagement.infrastructure.client.dto; + +import lombok.Data; + +@Data +public class PdfTextExtractResponse { + private String datasetId; + private String sourceFileId; + private String textFileId; + private String textFileName; + private String textFilePath; + private Long textFileSize; +} diff --git a/runtime/datamate-python/app/module/__init__.py b/runtime/datamate-python/app/module/__init__.py index 74ae7c2..2dab088 100644 --- a/runtime/datamate-python/app/module/__init__.py +++ b/runtime/datamate-python/app/module/__init__.py @@ -6,6 +6,7 @@ from .ratio.interface import router as ratio_router from .generation.interface import router as generation_router from .evaluation.interface import router as evaluation_router from .collection.interface import router as collection_route +from .dataset.interface import router as dataset_router router = APIRouter( prefix="/api" @@ -17,5 +18,6 @@ router.include_router(ratio_router) router.include_router(generation_router) router.include_router(evaluation_router) router.include_router(collection_route) +router.include_router(dataset_router) __all__ = ["router"] diff --git a/runtime/datamate-python/app/module/dataset/interface/pdf_extract.py b/runtime/datamate-python/app/module/dataset/interface/pdf_extract.py new file mode 100644 index 0000000..c42f3c2 --- /dev/null +++ b/runtime/datamate-python/app/module/dataset/interface/pdf_extract.py @@ -0,0 +1,22 @@ +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.session import get_db +from app.module.dataset.schema.pdf_extract import PdfTextExtractRequest, PdfTextExtractResponse +from app.module.dataset.service.pdf_extract import PdfTextExtractService +from app.module.shared.schema.common import StandardResponse + +router = APIRouter( + prefix="", + 
tags=["dataset"], +) + + +@router.post("/pdf-text-extract", response_model=StandardResponse[PdfTextExtractResponse]) +async def extract_pdf_text( + request: PdfTextExtractRequest, + db: AsyncSession = Depends(get_db), +): + service = PdfTextExtractService(db) + result = await service.extract_pdf_to_text(request.dataset_id, request.file_id) + return StandardResponse(code=200, message="Success", data=result)