You've already forked DataMate
feat(dataset): 添加PDF文本提取功能支持
- 新增dataset模块路由配置 - 添加PdfTextExtractRequest和PdfTextExtractResponse数据传输对象 - 实现PDF文本提取接口,支持从PDF文件中提取文本内容 - 集成数据库会话管理和异步处理能力
This commit is contained in:
@@ -13,11 +13,12 @@ import com.datamate.common.infrastructure.exception.CommonErrorCode;
|
||||
import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
||||
import com.datamate.common.interfaces.PagedResponse;
|
||||
import com.datamate.common.interfaces.PagingQuery;
|
||||
import com.datamate.datamanagement.common.enums.DuplicateMethod;
|
||||
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
|
||||
import com.datamate.datamanagement.common.enums.DuplicateMethod;
|
||||
import com.datamate.datamanagement.common.enums.DatasetType;
|
||||
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
|
||||
import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
|
||||
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
|
||||
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
|
||||
@@ -65,11 +66,13 @@ import java.util.stream.Stream;
|
||||
@Slf4j
|
||||
@Service
|
||||
@Transactional
|
||||
public class DatasetFileApplicationService {
|
||||
|
||||
private final DatasetFileRepository datasetFileRepository;
|
||||
private final DatasetRepository datasetRepository;
|
||||
private final FileService fileService;
|
||||
public class DatasetFileApplicationService {
|
||||
private static final String PDF_FILE_TYPE = "pdf";
|
||||
|
||||
private final DatasetFileRepository datasetFileRepository;
|
||||
private final DatasetRepository datasetRepository;
|
||||
private final FileService fileService;
|
||||
private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
|
||||
|
||||
@Value("${datamate.data-management.base-path:/dataset}")
|
||||
private String datasetBasePath;
|
||||
@@ -77,13 +80,16 @@ public class DatasetFileApplicationService {
|
||||
@Value("${datamate.data-management.file.duplicate:COVER}")
|
||||
private DuplicateMethod duplicateMethod;
|
||||
|
||||
@Autowired
|
||||
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
|
||||
DatasetRepository datasetRepository, FileService fileService) {
|
||||
this.datasetFileRepository = datasetFileRepository;
|
||||
this.datasetRepository = datasetRepository;
|
||||
this.fileService = fileService;
|
||||
}
|
||||
/**
 * Creates the application service with its collaborators injected.
 *
 * @param datasetFileRepository      repository used for dataset file records
 * @param datasetRepository          repository used for dataset records
 * @param fileService                file service collaborator
 * @param pdfTextExtractAsyncService service that dispatches PDF text extraction requests
 */
@Autowired
public DatasetFileApplicationService(
        DatasetFileRepository datasetFileRepository,
        DatasetRepository datasetRepository,
        FileService fileService,
        PdfTextExtractAsyncService pdfTextExtractAsyncService) {
    // Persistence collaborators.
    this.datasetRepository = datasetRepository;
    this.datasetFileRepository = datasetFileRepository;

    // Service collaborators.
    this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
    this.fileService = fileService;
}
|
||||
|
||||
/**
|
||||
* 获取数据集文件列表
|
||||
@@ -405,23 +411,24 @@ public class DatasetFileApplicationService {
|
||||
for (FileUploadResult file : unpacked) {
|
||||
File savedFile = file.getSavedFile();
|
||||
LocalDateTime currentTime = LocalDateTime.now();
|
||||
DatasetFile datasetFile = DatasetFile.builder()
|
||||
.id(UUID.randomUUID().toString())
|
||||
.datasetId(datasetId)
|
||||
.fileSize(savedFile.length())
|
||||
DatasetFile datasetFile = DatasetFile.builder()
|
||||
.id(UUID.randomUUID().toString())
|
||||
.datasetId(datasetId)
|
||||
.fileSize(savedFile.length())
|
||||
.uploadTime(currentTime)
|
||||
.lastAccessTime(currentTime)
|
||||
.fileName(file.getFileName())
|
||||
.filePath(savedFile.getPath())
|
||||
.fileType(AnalyzerUtils.getExtension(file.getFileName()))
|
||||
.build();
|
||||
setDatasetFileId(datasetFile, dataset);
|
||||
datasetFileRepository.saveOrUpdate(datasetFile);
|
||||
dataset.addFile(datasetFile);
|
||||
}
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
}
|
||||
setDatasetFileId(datasetFile, dataset);
|
||||
datasetFileRepository.saveOrUpdate(datasetFile);
|
||||
dataset.addFile(datasetFile);
|
||||
triggerPdfTextExtraction(dataset, datasetFile);
|
||||
}
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
}
|
||||
|
||||
/**
|
||||
* 在数据集下创建子目录
|
||||
@@ -687,24 +694,29 @@ public class DatasetFileApplicationService {
|
||||
dataset.addFile(datasetFile);
|
||||
copiedFiles.add(datasetFile);
|
||||
}
|
||||
datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
|
||||
return copiedFiles;
|
||||
datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
|
||||
return copiedFiles;
|
||||
}
|
||||
|
||||
private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
|
||||
for (String sourcePath : sourcePaths) {
|
||||
Path sourceFilePath = Paths.get(sourcePath);
|
||||
Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
|
||||
try {
|
||||
Files.createDirectories(Path.of(dataset.getPath()));
|
||||
Files.copy(sourceFilePath, targetFilePath);
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
|
||||
}
|
||||
}
|
||||
try {
|
||||
Files.createDirectories(Path.of(dataset.getPath()));
|
||||
Files.copy(sourceFilePath, targetFilePath);
|
||||
DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
|
||||
dataset.getId(),
|
||||
sourceFilePath.getFileName().toString()
|
||||
);
|
||||
triggerPdfTextExtraction(dataset, datasetFile);
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -750,15 +762,30 @@ public class DatasetFileApplicationService {
|
||||
.lastAccessTime(currentTime)
|
||||
.metadata(metadata)
|
||||
.build();
|
||||
setDatasetFileId(datasetFile, dataset);
|
||||
dataset.addFile(datasetFile);
|
||||
addedFiles.add(datasetFile);
|
||||
}
|
||||
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
// Note: addFilesToDataset only creates DB records, no file system operations
|
||||
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
|
||||
return addedFiles;
|
||||
}
|
||||
}
|
||||
setDatasetFileId(datasetFile, dataset);
|
||||
dataset.addFile(datasetFile);
|
||||
addedFiles.add(datasetFile);
|
||||
triggerPdfTextExtraction(dataset, datasetFile);
|
||||
}
|
||||
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
// Note: addFilesToDataset only creates DB records, no file system operations
|
||||
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
|
||||
return addedFiles;
|
||||
}
|
||||
|
||||
private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
|
||||
if (dataset == null || datasetFile == null) {
|
||||
return;
|
||||
}
|
||||
if (dataset.getDatasetType() != DatasetType.TEXT) {
|
||||
return;
|
||||
}
|
||||
String fileType = datasetFile.getFileType();
|
||||
if (fileType == null || !fileType.equalsIgnoreCase(PDF_FILE_TYPE)) {
|
||||
return;
|
||||
}
|
||||
pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
package com.datamate.datamanagement.application;
|
||||
|
||||
import com.datamate.common.infrastructure.common.Response;
|
||||
import com.datamate.datamanagement.infrastructure.client.PdfTextExtractClient;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class PdfTextExtractAsyncService {
|
||||
private final PdfTextExtractClient pdfTextExtractClient;
|
||||
|
||||
@Async
|
||||
public void extractPdfText(String datasetId, String fileId) {
|
||||
try {
|
||||
Response<PdfTextExtractResponse> response = pdfTextExtractClient.extractPdfText(
|
||||
new PdfTextExtractRequest(datasetId, fileId)
|
||||
);
|
||||
if (response == null) {
|
||||
log.warn("PdfTextExtract returned null response, datasetId={}, fileId={}", datasetId, fileId);
|
||||
return;
|
||||
}
|
||||
String code = response.getCode();
|
||||
if (!"0".equals(code) && !"200".equals(code)) {
|
||||
log.warn(
|
||||
"PdfTextExtract failed, datasetId={}, fileId={}, code={}, message={}",
|
||||
datasetId,
|
||||
fileId,
|
||||
code,
|
||||
response.getMessage()
|
||||
);
|
||||
return;
|
||||
}
|
||||
PdfTextExtractResponse data = response.getData();
|
||||
if (data != null) {
|
||||
log.info(
|
||||
"PdfTextExtract succeeded, datasetId={}, fileId={}, textFileId={}",
|
||||
datasetId,
|
||||
fileId,
|
||||
data.getTextFileId()
|
||||
);
|
||||
} else {
|
||||
log.info("PdfTextExtract succeeded, datasetId={}, fileId={}", datasetId, fileId);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("PdfTextExtract call failed, datasetId={}, fileId={}", datasetId, fileId, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
package com.datamate.datamanagement.infrastructure.client;
|
||||
|
||||
import com.datamate.common.infrastructure.common.Response;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractRequest;
|
||||
import com.datamate.datamanagement.infrastructure.client.dto.PdfTextExtractResponse;
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
@FeignClient(name = "pdf-text-extract-service", url = "${pdf.text.extract.service.url:http://datamate-backend-python:18000}")
|
||||
public interface PdfTextExtractClient {
|
||||
|
||||
@PostMapping("/api/dataset/pdf-text-extract")
|
||||
Response<PdfTextExtractResponse> extractPdfText(@RequestBody PdfTextExtractRequest request);
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.datamate.datamanagement.infrastructure.client.dto;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class PdfTextExtractRequest {
|
||||
private String datasetId;
|
||||
private String fileId;
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.datamate.datamanagement.infrastructure.client.dto;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class PdfTextExtractResponse {
|
||||
private String datasetId;
|
||||
private String sourceFileId;
|
||||
private String textFileId;
|
||||
private String textFileName;
|
||||
private String textFilePath;
|
||||
private Long textFileSize;
|
||||
}
|
||||
Reference in New Issue
Block a user