feat(data-management): 扩展文本数据集支持Excel文件类型

- 在DatasetFileApplicationService中添加XLS和XLSX文件类型到文档文本文件类型集合
- 更新DatasetTypeController中的TEXT数据集类型，使其支持xls和xlsx扩展名
- 在pdf_extract.py中添加XLS和XLSX文件类型的常量定义和解析器配置
- 实现Excel文件转CSV的功能，支持单个工作表和多工作表的解析
- 添加对Excel文件的依赖检查和错误处理机制
- 修改目标文件路径构建逻辑以支持不同文件类型的派生扩展名
- 更新文本文件记录创建逻辑以使用派生文件类型而不是固定文本类型
This commit is contained in:
2026-01-31 11:11:24 +08:00
parent 85d7141a91
commit 498f23a0c4
3 changed files with 109 additions and 15 deletions

View File

@@ -67,10 +67,18 @@ import java.util.stream.Stream;
@Service
@Transactional
public class DatasetFileApplicationService {
private static final String PDF_FILE_TYPE = "pdf";
private static final String DOC_FILE_TYPE = "doc";
private static final String DOCX_FILE_TYPE = "docx";
private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE);
private static final String PDF_FILE_TYPE = "pdf";
private static final String DOC_FILE_TYPE = "doc";
private static final String DOCX_FILE_TYPE = "docx";
private static final String XLS_FILE_TYPE = "xls";
private static final String XLSX_FILE_TYPE = "xlsx";
private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(
PDF_FILE_TYPE,
DOC_FILE_TYPE,
DOCX_FILE_TYPE,
XLS_FILE_TYPE,
XLSX_FILE_TYPE
);
private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository;

View File

@@ -23,7 +23,7 @@ public class DatasetTypeController {
public List<DatasetTypeResponse> getDatasetTypes() {
return Arrays.asList(
createDatasetType("IMAGE", "图像数据集", "用于机器学习的图像数据集", Arrays.asList("jpg", "jpeg", "png", "bmp", "gif")),
createDatasetType("TEXT", "文本数据集", "用于文本分析的文本数据集", Arrays.asList("txt", "csv", "json", "xml")),
createDatasetType("TEXT", "文本数据集", "用于文本分析的文本数据集", Arrays.asList("txt", "csv", "xls", "xlsx", "json", "xml")),
createDatasetType("AUDIO", "音频数据集", "用于音频处理的音频数据集", Arrays.asList("wav", "mp3", "flac", "aac")),
createDatasetType("VIDEO", "视频数据集", "用于视频分析的视频数据集", Arrays.asList("mp4", "avi", "mov", "mkv")),
createDatasetType("MULTIMODAL", "多模态数据集", "包含多种数据类型的数据集", List.of("*"))