From 498f23a0c4e4f96814dcca4960aa6b567b20a8fb Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sat, 31 Jan 2026 11:11:24 +0800 Subject: [PATCH] =?UTF-8?q?feat(data-management):=20=E6=89=A9=E5=B1=95?= =?UTF-8?q?=E6=96=87=E6=9C=AC=E6=95=B0=E6=8D=AE=E9=9B=86=E6=94=AF=E6=8C=81?= =?UTF-8?q?Excel=E6=96=87=E4=BB=B6=E7=B1=BB=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在DatasetFileApplicationService中添加XLS和XLSX文件类型到文档文本文件类型集合 - 更新DatasetTypeController中的TEXT数据集类型支持xls和xlsx扩展名 - 在pdf_extract.py中添加XLS和XLSX文件类型的常量定义和解析器配置 - 实现Excel文件转CSV的功能,支持单个工作表和多工作表的解析 - 添加对Excel文件的依赖检查和错误处理机制 - 修改目标文件路径构建逻辑以支持不同文件类型的派生扩展名 - 更新文本文件记录创建逻辑以使用派生文件类型而不是固定文本类型 --- .../DatasetFileApplicationService.java | 16 ++- .../rest/DatasetTypeController.java | 2 +- .../app/module/dataset/service/pdf_extract.py | 106 ++++++++++++++++-- 3 files changed, 109 insertions(+), 15 deletions(-) diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java index 2fcb373..72be4f7 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java @@ -67,10 +67,18 @@ import java.util.stream.Stream; @Service @Transactional public class DatasetFileApplicationService { - private static final String PDF_FILE_TYPE = "pdf"; - private static final String DOC_FILE_TYPE = "doc"; - private static final String DOCX_FILE_TYPE = "docx"; - private static final Set DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE); + private static final String PDF_FILE_TYPE = "pdf"; + private static final String DOC_FILE_TYPE = "doc"; + private static final String DOCX_FILE_TYPE = "docx"; + private static final String XLS_FILE_TYPE = "xls"; + private static final String XLSX_FILE_TYPE = "xlsx"; + private static final Set DOCUMENT_TEXT_FILE_TYPES = Set.of( + PDF_FILE_TYPE, + DOC_FILE_TYPE, + DOCX_FILE_TYPE, + XLS_FILE_TYPE, + XLSX_FILE_TYPE + ); private final DatasetFileRepository datasetFileRepository; private final DatasetRepository datasetRepository; diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetTypeController.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetTypeController.java index dfc3600..05fa65e 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetTypeController.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetTypeController.java @@ -23,7 +23,7 @@ public class DatasetTypeController { public List getDatasetTypes() { return Arrays.asList( createDatasetType("IMAGE", "图像数据集", "用于机器学习的图像数据集", Arrays.asList("jpg", "jpeg", "png", "bmp", "gif")), - createDatasetType("TEXT", "文本数据集", "用于文本分析的文本数据集", Arrays.asList("txt", "csv", "json", "xml")), + createDatasetType("TEXT", "文本数据集", "用于文本分析的文本数据集", Arrays.asList("txt", "csv", "xls", "xlsx", "json", "xml")), createDatasetType("AUDIO", "音频数据集", "用于音频处理的音频数据集", Arrays.asList("wav", "mp3", "flac", "aac")), createDatasetType("VIDEO", "视频数据集", "用于视频分析的视频数据集", Arrays.asList("mp4", "avi", "mov", "mkv")), createDatasetType("MULTIMODAL", "多模态数据集", "包含多种数据类型的数据集", List.of("*")) diff --git a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py index c6438b9..aec3c87 100644 --- a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py +++ b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py @@ -1,5 +1,8 @@ +import csv +import csv import datetime import os +from io import StringIO from pathlib import Path from fastapi import HTTPException @@ -16,18 +19,41 @@ logger = get_logger(__name__) PDF_FILE_TYPE = "pdf" DOC_FILE_TYPE = "doc" DOCX_FILE_TYPE = "docx" +XLS_FILE_TYPE = "xls" +XLSX_FILE_TYPE = "xlsx" +CSV_FILE_TYPE = "csv" TEXT_FILE_TYPE = "txt" +CSV_FILE_EXTENSION = ".csv" TEXT_FILE_EXTENSION = ".txt" -SUPPORTED_FILE_TYPES = {PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE} +EXCEL_FILE_TYPES = {XLS_FILE_TYPE, XLSX_FILE_TYPE} +SUPPORTED_FILE_TYPES = {PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE, XLS_FILE_TYPE, XLSX_FILE_TYPE} PARSER_BY_FILE_TYPE = { PDF_FILE_TYPE: "PyPDFLoader", DOC_FILE_TYPE: "Docx2txtLoader", DOCX_FILE_TYPE: "Docx2txtLoader", + XLS_FILE_TYPE: "xlrd", + XLSX_FILE_TYPE: "openpyxl", } DEFAULT_EXTENSION_BY_TYPE = { PDF_FILE_TYPE: ".pdf", DOC_FILE_TYPE: ".doc", DOCX_FILE_TYPE: ".docx", + XLS_FILE_TYPE: ".xls", + XLSX_FILE_TYPE: ".xlsx", +} +DERIVED_EXTENSION_BY_TYPE = { + PDF_FILE_TYPE: TEXT_FILE_EXTENSION, + DOC_FILE_TYPE: TEXT_FILE_EXTENSION, + DOCX_FILE_TYPE: TEXT_FILE_EXTENSION, + XLS_FILE_TYPE: CSV_FILE_EXTENSION, + XLSX_FILE_TYPE: CSV_FILE_EXTENSION, +} +DERIVED_FILE_TYPE_BY_SOURCE = { + PDF_FILE_TYPE: TEXT_FILE_TYPE, + DOC_FILE_TYPE: TEXT_FILE_TYPE, + DOCX_FILE_TYPE: TEXT_FILE_TYPE, + XLS_FILE_TYPE: CSV_FILE_TYPE, + XLSX_FILE_TYPE: CSV_FILE_TYPE, } DERIVED_METADATA_KEY = "derived_from_file_id" DERIVED_METADATA_NAME_KEY = "derived_from_file_name" @@ -45,9 +71,11 @@ class PdfTextExtractService: self._validate_dataset_and_file(dataset, file_record) file_type = str(getattr(file_record, "file_type", "") or "").lower() + derived_file_type = DERIVED_FILE_TYPE_BY_SOURCE.get(file_type, TEXT_FILE_TYPE) + assert derived_file_type source_path = self._resolve_source_path(file_record) dataset_path = self._resolve_dataset_path(dataset) - target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id) + target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type) existing_record = await self._find_existing_text_record(dataset_id, target_path) if existing_record: @@ -56,14 +84,18 @@ class PdfTextExtractService: if target_path.exists(): file_size = self._get_file_size(target_path) parser_name = PARSER_BY_FILE_TYPE.get(file_type, "") - record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name) + record = await self._create_text_file_record( + dataset, file_record, target_path, file_size, parser_name, derived_file_type + ) return self._build_response(dataset_id, file_id, record) text_content, parser_name = self._parse_document(source_path, file_type) assert isinstance(text_content, str) self._write_text_file(target_path, text_content) file_size = self._get_file_size(target_path) - record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name) + record = await self._create_text_file_record( + dataset, file_record, target_path, file_size, parser_name, derived_file_type + ) return self._build_response(dataset_id, file_id, record) async def _get_dataset(self, dataset_id: str) -> Dataset: @@ -92,7 +124,7 @@ class PdfTextExtractService: raise HTTPException(status_code=400, detail="仅支持文本类型数据集") file_type = str(getattr(file_record, "file_type", "") or "").lower() if file_type not in SUPPORTED_FILE_TYPES: - raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX 文件解析") + raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX/XLS/XLSX 文件解析") @staticmethod def _resolve_source_path(file_record: DatasetFiles) -> Path: @@ -111,13 +143,13 @@ class PdfTextExtractService: return dataset_path @staticmethod - def _build_output_filename(file_record: DatasetFiles, file_id: str) -> str: + def _build_output_filename(file_record: DatasetFiles, file_id: str, file_type: str) -> str: original_name = str(getattr(file_record, "file_name", "") or "").strip() if not original_name: - file_type = str(getattr(file_record, "file_type", "") or "").lower() default_extension = DEFAULT_EXTENSION_BY_TYPE.get(file_type, f".{file_type}") original_name = f"{file_id}{default_extension}" - return f"{original_name}{TEXT_FILE_EXTENSION}" + derived_extension = DERIVED_EXTENSION_BY_TYPE.get(file_type, TEXT_FILE_EXTENSION) + return f"{original_name}{derived_extension}" def _resolve_target_path( self, @@ -125,8 +157,9 @@ class PdfTextExtractService: source_path: Path, file_record: DatasetFiles, file_id: str, + file_type: str, ) -> Path: - output_name = self._build_output_filename(file_record, file_id) + output_name = self._build_output_filename(file_record, file_id, file_type) if dataset_path in source_path.parents: target_dir = source_path.parent else: @@ -151,6 +184,10 @@ class PdfTextExtractService: if file_type == PDF_FILE_TYPE: loader = PyPDFLoader(str(source_path)) parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE] + elif file_type in EXCEL_FILE_TYPES: + parser_name = PARSER_BY_FILE_TYPE.get(file_type, "excel") + csv_content = PdfTextExtractService._parse_excel_to_csv(source_path, file_type) + return csv_content, parser_name else: loader = Docx2txtLoader(str(source_path)) parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader") @@ -158,6 +195,53 @@ class PdfTextExtractService: contents = [doc.page_content for doc in docs if doc.page_content] return "\n\n".join(contents), parser_name + @staticmethod + def _parse_excel_to_csv(source_path: Path, file_type: str) -> str: + output = StringIO(newline="") + writer = csv.writer(output) + try: + if file_type == XLSX_FILE_TYPE: + try: + from openpyxl import load_workbook + except ImportError as exc: + raise HTTPException(status_code=500, detail="缺少 openpyxl 依赖") from exc + workbook = load_workbook(filename=str(source_path), read_only=True, data_only=True) + try: + sheet_names = workbook.sheetnames + include_sheet_name = len(sheet_names) > 1 + for sheet_name in sheet_names: + sheet = workbook[sheet_name] + for row in sheet.iter_rows(values_only=True): + row_values = list(row) + if include_sheet_name: + row_values.insert(0, sheet_name) + writer.writerow(row_values) + finally: + workbook.close() + elif file_type == XLS_FILE_TYPE: + try: + import xlrd + except ImportError as exc: + raise HTTPException(status_code=500, detail="缺少 xlrd 依赖") from exc + workbook = xlrd.open_workbook(str(source_path)) + sheet_names = workbook.sheet_names() + include_sheet_name = len(sheet_names) > 1 + for sheet_index in range(workbook.nsheets): + sheet = workbook.sheet_by_index(sheet_index) + for row_index in range(sheet.nrows): + row_values = sheet.row_values(row_index) + if include_sheet_name: + row_values = [sheet.name, *row_values] + writer.writerow(row_values) + else: + raise HTTPException(status_code=400, detail="不支持的 Excel 文件格式") + except HTTPException: + raise + except Exception as exc: + logger.error("Excel 转 CSV 失败: %s", exc) + raise HTTPException(status_code=500, detail="Excel 转 CSV 失败") from exc + return output.getvalue() + @staticmethod def _write_text_file(target_path: Path, content: str) -> None: with open(target_path, "w", encoding="utf-8") as handle: @@ -177,8 +261,10 @@ class PdfTextExtractService: target_path: Path, file_size: int, parser_name: str, + derived_file_type: str, ) -> DatasetFiles: assert parser_name + assert derived_file_type metadata = { DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")), DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")), @@ -189,7 +275,7 @@ class PdfTextExtractService: dataset_id=dataset.id, # type: ignore[arg-type] file_name=target_path.name, file_path=str(target_path), - file_type=TEXT_FILE_TYPE, + file_type=derived_file_type, file_size=file_size, dataset_filemetadata=metadata, last_access_time=datetime.datetime.now(datetime.UTC),