fix(dataset): resolve the exception caused by a missing directory when querying dataset files

- Add a directory-existence check to avoid file-system access exceptions
- Return an empty paged result instead of throwing when the directory does not exist
- Improve the experience right after a dataset is created
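The core of the fix is the guard visible in the hunk starting at line 123 below: check whether the query directory exists before calling Files.list, and return an empty page when it does not (a freshly created dataset may not have a directory yet). The following is a minimal, self-contained sketch of that guard using only JDK APIs; the class and method names are illustrative and not part of DataMate.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public final class DirectoryListingGuard {

    private DirectoryListingGuard() {
    }

    /**
     * Lists the entries of {@code dir}, or returns an empty list when the
     * directory has not been created yet, instead of letting Files.list(...)
     * fail with NoSuchFileException.
     */
    public static List<Path> listOrEmpty(Path dir) throws IOException {
        if (!Files.exists(dir)) {
            // Directory not created yet: report "no files" rather than an error.
            return Collections.emptyList();
        }
        try (Stream<Path> entries = Files.list(dir)) {
            return entries.collect(Collectors.toList());
        }
    }

    public static void main(String[] args) throws IOException {
        // A path that does not exist resolves to an empty listing rather than an exception.
        System.out.println(listOrEmpty(Path.of("/tmp/dataset-not-created-yet")));
    }
}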
@@ -22,16 +22,16 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn
 import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
 import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
 import com.datamate.datamanagement.interfaces.dto.AddFilesRequest;
 import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
 import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
 import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
 import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
 import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import jakarta.servlet.http.HttpServletResponse;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
@@ -40,24 +40,24 @@ import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.core.io.Resource;
 import org.springframework.core.io.UrlResource;
 import org.springframework.http.HttpHeaders;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
 import org.springframework.transaction.support.TransactionSynchronization;
 import org.springframework.transaction.support.TransactionSynchronizationManager;

 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.time.LocalDateTime;
 import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
 import java.util.concurrent.CompletableFuture;
 import java.util.function.Function;
 import java.util.stream.Collectors;
@@ -70,24 +70,24 @@ import java.util.stream.Stream;
 @Service
 @Transactional
 public class DatasetFileApplicationService {
     private static final String PDF_FILE_TYPE = "pdf";
     private static final String DOC_FILE_TYPE = "doc";
     private static final String DOCX_FILE_TYPE = "docx";
     private static final String XLS_FILE_TYPE = "xls";
     private static final String XLSX_FILE_TYPE = "xlsx";
     private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(
             PDF_FILE_TYPE,
             DOC_FILE_TYPE,
             DOCX_FILE_TYPE,
             XLS_FILE_TYPE,
             XLSX_FILE_TYPE
     );
     private static final String DERIVED_METADATA_KEY = "derived_from_file_id";

     private final DatasetFileRepository datasetFileRepository;
     private final DatasetRepository datasetRepository;
     private final FileService fileService;
     private final PdfTextExtractAsyncService pdfTextExtractAsyncService;

     @Value("${datamate.data-management.base-path:/dataset}")
     private String datasetBasePath;
@@ -123,61 +123,65 @@ public class DatasetFileApplicationService {
      * @param status 状态过滤
      * @param name 文件名模糊查询
      * @param hasAnnotation 是否有标注
      * @param excludeSourceDocuments 是否排除源文档(PDF/DOC/DOCX/XLS/XLSX)
      * @param pagingQuery 分页参数
      * @return 分页文件列表
      */
     @Transactional(readOnly = true)
     public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
                                                       Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
         IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
         IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);

         if (excludeSourceDocuments) {
             // 过滤掉源文档文件(PDF/DOC/DOCX/XLS/XLSX),用于标注场景只展示派生文件
             List<DatasetFile> filteredRecords = files.getRecords().stream()
                     .filter(file -> !isSourceDocument(file))
                     .collect(Collectors.toList());

             // 重新构建分页结果
             Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
             filteredPage.setRecords(filteredRecords);
             return PagedResponse.of(filteredPage);
         }

         return PagedResponse.of(files);
     }

     /**
      * 获取数据集文件列表
      */
     @Transactional(readOnly = true)
     public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) {
         Dataset dataset = datasetRepository.getById(datasetId);
         int page = Math.max(pagingQuery.getPage(), 1);
         int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
         if (dataset == null) {
             return PagedResponse.of(new Page<>(page, size));
         }
         String datasetPath = dataset.getPath();
         Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
         Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
                 .stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
         Set<String> derivedFilePaths = excludeDerivedFiles
                 ? datasetFilesMap.values().stream()
                         .filter(this::isDerivedFile)
                         .map(DatasetFile::getFilePath)
                         .filter(Objects::nonNull)
                         .collect(Collectors.toSet())
                 : Collections.emptySet();
+        // 如果目录不存在,直接返回空结果(数据集刚创建时目录可能还未生成)
+        if (!Files.exists(queryPath)) {
+            return new PagedResponse<>(page, size, 0, 0, Collections.emptyList());
+        }
         try (Stream<Path> pathStream = Files.list(queryPath)) {
             List<Path> allFiles = pathStream
                     .filter(path -> path.toString().startsWith(datasetPath))
                     .filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString()))
                     .sorted(Comparator
                             .comparing((Path path) -> !Files.isDirectory(path))
                             .thenComparing(path -> path.getFileName().toString()))
                     .collect(Collectors.toList());

             // 计算分页
             int total = allFiles.size();
@@ -195,15 +199,15 @@ public class DatasetFileApplicationService {
             List<DatasetFile> datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList();

             return new PagedResponse<>(page, size, total, totalPages, datasetFiles);
         } catch (IOException e) {
             log.error("list dataset path error", e);
             return PagedResponse.of(new Page<>(page, size));
         }
     }

     private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
         DatasetFile datasetFile = new DatasetFile();
         LocalDateTime localDateTime = LocalDateTime.now();
         try {
             localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime();
         } catch (IOException e) {
@@ -254,37 +258,37 @@ public class DatasetFileApplicationService {
                 datasetFile = exist;
             }
         }
         return datasetFile;
     }

     private boolean isSourceDocument(DatasetFile datasetFile) {
         if (datasetFile == null) {
             return false;
         }
         String fileType = datasetFile.getFileType();
         if (fileType == null || fileType.isBlank()) {
             return false;
         }
         return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT));
     }

     private boolean isDerivedFile(DatasetFile datasetFile) {
         if (datasetFile == null) {
             return false;
         }
         String metadata = datasetFile.getMetadata();
         if (metadata == null || metadata.isBlank()) {
             return false;
         }
         try {
             ObjectMapper mapper = new ObjectMapper();
             Map<String, Object> metadataMap = mapper.readValue(metadata, new TypeReference<Map<String, Object>>() {});
             return metadataMap.get(DERIVED_METADATA_KEY) != null;
         } catch (Exception e) {
             log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e);
             return false;
         }
     }

     /**
      * 获取文件详情
@@ -740,17 +744,17 @@ public class DatasetFileApplicationService {
         }
     }

     /**
      * 复制文件到数据集目录
      *
      * @param datasetId 数据集id
      * @param req 复制文件请求
      * @return 复制的文件列表
      */
     @Transactional
     public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
         Dataset dataset = datasetRepository.getById(datasetId);
         BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
         List<DatasetFile> copiedFiles = new ArrayList<>();
         List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
         dataset.setFiles(existDatasetFiles);
@@ -780,80 +784,80 @@ public class DatasetFileApplicationService {
         datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
         dataset.active();
         datasetRepository.updateById(dataset);
         CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
         return copiedFiles;
     }

     /**
      * 复制文件到数据集目录(保留相对路径,适用于数据源导入)
      *
      * @param datasetId 数据集id
      * @param sourceRoot 数据源根目录
      * @param sourcePaths 源文件路径列表
      * @return 复制的文件列表
      */
     @Transactional
     public List<DatasetFile> copyFilesToDatasetDirWithSourceRoot(String datasetId, Path sourceRoot, List<String> sourcePaths) {
         Dataset dataset = datasetRepository.getById(datasetId);
         BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);

         Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
         List<DatasetFile> copiedFiles = new ArrayList<>();
         List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
         dataset.setFiles(existDatasetFiles);
         Map<String, DatasetFile> copyTargets = new LinkedHashMap<>();

         for (String sourceFilePath : sourcePaths) {
             if (sourceFilePath == null || sourceFilePath.isBlank()) {
                 continue;
             }
             Path sourcePath = Paths.get(sourceFilePath).toAbsolutePath().normalize();
             if (!sourcePath.startsWith(normalizedRoot)) {
                 log.warn("Source file path is out of root: {}", sourceFilePath);
                 continue;
             }
             if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
                 log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
                 continue;
             }

             Path relativePath = normalizedRoot.relativize(sourcePath);
             String fileName = sourcePath.getFileName().toString();
             File sourceFile = sourcePath.toFile();
             LocalDateTime currentTime = LocalDateTime.now();
             Path targetPath = Paths.get(dataset.getPath(), relativePath.toString());

             DatasetFile datasetFile = DatasetFile.builder()
                     .id(UUID.randomUUID().toString())
                     .datasetId(datasetId)
                     .fileName(fileName)
                     .fileType(AnalyzerUtils.getExtension(fileName))
                     .fileSize(sourceFile.length())
                     .filePath(targetPath.toString())
                     .uploadTime(currentTime)
                     .lastAccessTime(currentTime)
                     .build();
             setDatasetFileId(datasetFile, dataset);
             dataset.addFile(datasetFile);
             copiedFiles.add(datasetFile);
             copyTargets.put(sourceFilePath, datasetFile);
         }

         if (copiedFiles.isEmpty()) {
             return copiedFiles;
         }
         datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
         dataset.active();
         datasetRepository.updateById(dataset);
         CompletableFuture.runAsync(() -> copyFilesToDatasetDirWithRelativePath(copyTargets, dataset, normalizedRoot));
         return copiedFiles;
     }

     private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
         for (String sourcePath : sourcePaths) {
             Path sourceFilePath = Paths.get(sourcePath);
             Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
             try {
                 Files.createDirectories(Path.of(dataset.getPath()));
                 Files.copy(sourceFilePath, targetFilePath);
                 DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
@@ -863,39 +867,39 @@ public class DatasetFileApplicationService {
                 triggerPdfTextExtraction(dataset, datasetFile);
             } catch (IOException e) {
                 log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
             }
         }
     }

     private void copyFilesToDatasetDirWithRelativePath(
             Map<String, DatasetFile> copyTargets,
             Dataset dataset,
             Path sourceRoot
     ) {
         Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize();
         Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
         for (Map.Entry<String, DatasetFile> entry : copyTargets.entrySet()) {
             Path sourcePath = Paths.get(entry.getKey()).toAbsolutePath().normalize();
             if (!sourcePath.startsWith(normalizedRoot)) {
                 log.warn("Source file path is out of root: {}", sourcePath);
                 continue;
             }
             Path relativePath = normalizedRoot.relativize(sourcePath);
             Path targetFilePath = datasetRoot.resolve(relativePath).normalize();
             if (!targetFilePath.startsWith(datasetRoot)) {
                 log.warn("Target file path is out of dataset path: {}", targetFilePath);
                 continue;
             }
             try {
                 Files.createDirectories(targetFilePath.getParent());
                 Files.copy(sourcePath, targetFilePath);
                 triggerPdfTextExtraction(dataset, entry.getValue());
             } catch (IOException e) {
                 log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
             }
         }
     }

     /**
      * 添加文件到数据集(仅创建数据库记录,不执行文件系统操作)
      *
@@ -952,31 +956,31 @@ public class DatasetFileApplicationService {
         return addedFiles;
     }

     private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
         if (dataset == null || datasetFile == null) {
             return;
         }
         if (dataset.getDatasetType() != DatasetType.TEXT) {
             return;
         }
         String fileType = datasetFile.getFileType();
         if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
             return;
         }
         String datasetId = dataset.getId();
         String fileId = datasetFile.getId();
         if (datasetId == null || fileId == null) {
             return;
         }
         if (TransactionSynchronizationManager.isSynchronizationActive()) {
             TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() {
                 @Override
                 public void afterCommit() {
                     pdfTextExtractAsyncService.extractPdfText(datasetId, fileId);
                 }
             });
             return;
         }
         pdfTextExtractAsyncService.extractPdfText(datasetId, fileId);
     }
 }
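The context lines of the last hunk show triggerPdfTextExtraction deferring the asynchronous PDF extraction until the surrounding transaction has committed. For readers unfamiliar with that pattern, here is a minimal standalone sketch; only the Spring TransactionSynchronization/TransactionSynchronizationManager API is real, the helper class name is illustrative and not part of DataMate.

import org.springframework.transaction.support.TransactionSynchronization;
import org.springframework.transaction.support.TransactionSynchronizationManager;

// Illustrative sketch: run a side effect only after the current transaction
// commits, falling back to immediate execution when no synchronization is active.
public final class AfterCommitRunner {

    private AfterCommitRunner() {
    }

    public static void runAfterCommit(Runnable sideEffect) {
        if (TransactionSynchronizationManager.isSynchronizationActive()) {
            // Defer until commit; the callback is skipped if the transaction rolls back.
            TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() {
                @Override
                public void afterCommit() {
                    sideEffect.run();
                }
            });
            return;
        }
        // No active transaction synchronization: run immediately.
        sideEffect.run();
    }
}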