Compare commits

...

2 Commits

Author SHA1 Message Date
153066a95f fix(frontend): hide action dropdown in CardView when operations list is empty 2026-01-31 11:14:26 +08:00
498f23a0c4 feat(data-management): 扩展文本数据集支持Excel文件类型
- 在DatasetFileApplicationService中添加XLS和XLSX文件类型到文档文本文件类型集合
- 更新DatasetTypeController中的TEXT数据集类型支持xls和xlsx扩展名
- 在pdf_extract.py中添加XLS和XLSX文件类型的常量定义和解析器配置
- 实现Excel文件转CSV的功能,支持单个工作表和多工作表的解析
- 添加对Excel文件的依赖检查和错误处理机制
- 修改目标文件路径构建逻辑以支持不同文件类型的派生扩展名
- 更新文本文件记录创建逻辑以使用派生文件类型而不是固定文本类型
2026-01-31 11:11:24 +08:00
4 changed files with 110 additions and 16 deletions

View File

@@ -67,10 +67,18 @@ import java.util.stream.Stream;
@Service @Service
@Transactional @Transactional
public class DatasetFileApplicationService { public class DatasetFileApplicationService {
private static final String PDF_FILE_TYPE = "pdf"; private static final String PDF_FILE_TYPE = "pdf";
private static final String DOC_FILE_TYPE = "doc"; private static final String DOC_FILE_TYPE = "doc";
private static final String DOCX_FILE_TYPE = "docx"; private static final String DOCX_FILE_TYPE = "docx";
private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE); private static final String XLS_FILE_TYPE = "xls";
private static final String XLSX_FILE_TYPE = "xlsx";
private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(
PDF_FILE_TYPE,
DOC_FILE_TYPE,
DOCX_FILE_TYPE,
XLS_FILE_TYPE,
XLSX_FILE_TYPE
);
private final DatasetFileRepository datasetFileRepository; private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository; private final DatasetRepository datasetRepository;

View File

@@ -23,7 +23,7 @@ public class DatasetTypeController {
public List<DatasetTypeResponse> getDatasetTypes() { public List<DatasetTypeResponse> getDatasetTypes() {
return Arrays.asList( return Arrays.asList(
createDatasetType("IMAGE", "图像数据集", "用于机器学习的图像数据集", Arrays.asList("jpg", "jpeg", "png", "bmp", "gif")), createDatasetType("IMAGE", "图像数据集", "用于机器学习的图像数据集", Arrays.asList("jpg", "jpeg", "png", "bmp", "gif")),
createDatasetType("TEXT", "文本数据集", "用于文本分析的文本数据集", Arrays.asList("txt", "csv", "json", "xml")), createDatasetType("TEXT", "文本数据集", "用于文本分析的文本数据集", Arrays.asList("txt", "csv", "xls", "xlsx", "json", "xml")),
createDatasetType("AUDIO", "音频数据集", "用于音频处理的音频数据集", Arrays.asList("wav", "mp3", "flac", "aac")), createDatasetType("AUDIO", "音频数据集", "用于音频处理的音频数据集", Arrays.asList("wav", "mp3", "flac", "aac")),
createDatasetType("VIDEO", "视频数据集", "用于视频分析的视频数据集", Arrays.asList("mp4", "avi", "mov", "mkv")), createDatasetType("VIDEO", "视频数据集", "用于视频分析的视频数据集", Arrays.asList("mp4", "avi", "mov", "mkv")),
createDatasetType("MULTIMODAL", "多模态数据集", "包含多种数据类型的数据集", List.of("*")) createDatasetType("MULTIMODAL", "多模态数据集", "包含多种数据类型的数据集", List.of("*"))

View File

@@ -276,7 +276,7 @@ function CardView<T extends BaseCardDataType>(props: CardViewProps<T>) {
{formatDateTime(item?.updatedAt)} {formatDateTime(item?.updatedAt)}
</div> </div>
</div> </div>
{operations && ( {operations && ops(item).length > 0 && (
<ActionDropdown <ActionDropdown
actions={ops(item)} actions={ops(item)}
onAction={(key) => { onAction={(key) => {

View File

@@ -1,5 +1,8 @@
import csv
import csv
import datetime import datetime
import os import os
from io import StringIO
from pathlib import Path from pathlib import Path
from fastapi import HTTPException from fastapi import HTTPException
@@ -16,18 +19,41 @@ logger = get_logger(__name__)
PDF_FILE_TYPE = "pdf" PDF_FILE_TYPE = "pdf"
DOC_FILE_TYPE = "doc" DOC_FILE_TYPE = "doc"
DOCX_FILE_TYPE = "docx" DOCX_FILE_TYPE = "docx"
XLS_FILE_TYPE = "xls"
XLSX_FILE_TYPE = "xlsx"
CSV_FILE_TYPE = "csv"
TEXT_FILE_TYPE = "txt" TEXT_FILE_TYPE = "txt"
CSV_FILE_EXTENSION = ".csv"
TEXT_FILE_EXTENSION = ".txt" TEXT_FILE_EXTENSION = ".txt"
SUPPORTED_FILE_TYPES = {PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE} EXCEL_FILE_TYPES = {XLS_FILE_TYPE, XLSX_FILE_TYPE}
SUPPORTED_FILE_TYPES = {PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE, XLS_FILE_TYPE, XLSX_FILE_TYPE}
PARSER_BY_FILE_TYPE = { PARSER_BY_FILE_TYPE = {
PDF_FILE_TYPE: "PyPDFLoader", PDF_FILE_TYPE: "PyPDFLoader",
DOC_FILE_TYPE: "Docx2txtLoader", DOC_FILE_TYPE: "Docx2txtLoader",
DOCX_FILE_TYPE: "Docx2txtLoader", DOCX_FILE_TYPE: "Docx2txtLoader",
XLS_FILE_TYPE: "xlrd",
XLSX_FILE_TYPE: "openpyxl",
} }
DEFAULT_EXTENSION_BY_TYPE = { DEFAULT_EXTENSION_BY_TYPE = {
PDF_FILE_TYPE: ".pdf", PDF_FILE_TYPE: ".pdf",
DOC_FILE_TYPE: ".doc", DOC_FILE_TYPE: ".doc",
DOCX_FILE_TYPE: ".docx", DOCX_FILE_TYPE: ".docx",
XLS_FILE_TYPE: ".xls",
XLSX_FILE_TYPE: ".xlsx",
}
DERIVED_EXTENSION_BY_TYPE = {
PDF_FILE_TYPE: TEXT_FILE_EXTENSION,
DOC_FILE_TYPE: TEXT_FILE_EXTENSION,
DOCX_FILE_TYPE: TEXT_FILE_EXTENSION,
XLS_FILE_TYPE: CSV_FILE_EXTENSION,
XLSX_FILE_TYPE: CSV_FILE_EXTENSION,
}
DERIVED_FILE_TYPE_BY_SOURCE = {
PDF_FILE_TYPE: TEXT_FILE_TYPE,
DOC_FILE_TYPE: TEXT_FILE_TYPE,
DOCX_FILE_TYPE: TEXT_FILE_TYPE,
XLS_FILE_TYPE: CSV_FILE_TYPE,
XLSX_FILE_TYPE: CSV_FILE_TYPE,
} }
DERIVED_METADATA_KEY = "derived_from_file_id" DERIVED_METADATA_KEY = "derived_from_file_id"
DERIVED_METADATA_NAME_KEY = "derived_from_file_name" DERIVED_METADATA_NAME_KEY = "derived_from_file_name"
@@ -45,9 +71,11 @@ class PdfTextExtractService:
self._validate_dataset_and_file(dataset, file_record) self._validate_dataset_and_file(dataset, file_record)
file_type = str(getattr(file_record, "file_type", "") or "").lower() file_type = str(getattr(file_record, "file_type", "") or "").lower()
derived_file_type = DERIVED_FILE_TYPE_BY_SOURCE.get(file_type, TEXT_FILE_TYPE)
assert derived_file_type
source_path = self._resolve_source_path(file_record) source_path = self._resolve_source_path(file_record)
dataset_path = self._resolve_dataset_path(dataset) dataset_path = self._resolve_dataset_path(dataset)
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id) target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type)
existing_record = await self._find_existing_text_record(dataset_id, target_path) existing_record = await self._find_existing_text_record(dataset_id, target_path)
if existing_record: if existing_record:
@@ -56,14 +84,18 @@ class PdfTextExtractService:
if target_path.exists(): if target_path.exists():
file_size = self._get_file_size(target_path) file_size = self._get_file_size(target_path)
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "") parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name) record = await self._create_text_file_record(
dataset, file_record, target_path, file_size, parser_name, derived_file_type
)
return self._build_response(dataset_id, file_id, record) return self._build_response(dataset_id, file_id, record)
text_content, parser_name = self._parse_document(source_path, file_type) text_content, parser_name = self._parse_document(source_path, file_type)
assert isinstance(text_content, str) assert isinstance(text_content, str)
self._write_text_file(target_path, text_content) self._write_text_file(target_path, text_content)
file_size = self._get_file_size(target_path) file_size = self._get_file_size(target_path)
record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name) record = await self._create_text_file_record(
dataset, file_record, target_path, file_size, parser_name, derived_file_type
)
return self._build_response(dataset_id, file_id, record) return self._build_response(dataset_id, file_id, record)
async def _get_dataset(self, dataset_id: str) -> Dataset: async def _get_dataset(self, dataset_id: str) -> Dataset:
@@ -92,7 +124,7 @@ class PdfTextExtractService:
raise HTTPException(status_code=400, detail="仅支持文本类型数据集") raise HTTPException(status_code=400, detail="仅支持文本类型数据集")
file_type = str(getattr(file_record, "file_type", "") or "").lower() file_type = str(getattr(file_record, "file_type", "") or "").lower()
if file_type not in SUPPORTED_FILE_TYPES: if file_type not in SUPPORTED_FILE_TYPES:
raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX 文件解析") raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX/XLS/XLSX 文件解析")
@staticmethod @staticmethod
def _resolve_source_path(file_record: DatasetFiles) -> Path: def _resolve_source_path(file_record: DatasetFiles) -> Path:
@@ -111,13 +143,13 @@ class PdfTextExtractService:
return dataset_path return dataset_path
@staticmethod @staticmethod
def _build_output_filename(file_record: DatasetFiles, file_id: str) -> str: def _build_output_filename(file_record: DatasetFiles, file_id: str, file_type: str) -> str:
original_name = str(getattr(file_record, "file_name", "") or "").strip() original_name = str(getattr(file_record, "file_name", "") or "").strip()
if not original_name: if not original_name:
file_type = str(getattr(file_record, "file_type", "") or "").lower()
default_extension = DEFAULT_EXTENSION_BY_TYPE.get(file_type, f".{file_type}") default_extension = DEFAULT_EXTENSION_BY_TYPE.get(file_type, f".{file_type}")
original_name = f"{file_id}{default_extension}" original_name = f"{file_id}{default_extension}"
return f"{original_name}{TEXT_FILE_EXTENSION}" derived_extension = DERIVED_EXTENSION_BY_TYPE.get(file_type, TEXT_FILE_EXTENSION)
return f"{original_name}{derived_extension}"
def _resolve_target_path( def _resolve_target_path(
self, self,
@@ -125,8 +157,9 @@ class PdfTextExtractService:
source_path: Path, source_path: Path,
file_record: DatasetFiles, file_record: DatasetFiles,
file_id: str, file_id: str,
file_type: str,
) -> Path: ) -> Path:
output_name = self._build_output_filename(file_record, file_id) output_name = self._build_output_filename(file_record, file_id, file_type)
if dataset_path in source_path.parents: if dataset_path in source_path.parents:
target_dir = source_path.parent target_dir = source_path.parent
else: else:
@@ -151,6 +184,10 @@ class PdfTextExtractService:
if file_type == PDF_FILE_TYPE: if file_type == PDF_FILE_TYPE:
loader = PyPDFLoader(str(source_path)) loader = PyPDFLoader(str(source_path))
parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE] parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE]
elif file_type in EXCEL_FILE_TYPES:
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "excel")
csv_content = PdfTextExtractService._parse_excel_to_csv(source_path, file_type)
return csv_content, parser_name
else: else:
loader = Docx2txtLoader(str(source_path)) loader = Docx2txtLoader(str(source_path))
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader") parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader")
@@ -158,6 +195,53 @@ class PdfTextExtractService:
contents = [doc.page_content for doc in docs if doc.page_content] contents = [doc.page_content for doc in docs if doc.page_content]
return "\n\n".join(contents), parser_name return "\n\n".join(contents), parser_name
@staticmethod
def _parse_excel_to_csv(source_path: Path, file_type: str) -> str:
    """Convert every worksheet of an Excel workbook into one CSV string.

    Rows are written in sheet order. When the workbook holds more than one
    worksheet, each row is prefixed with the name of the sheet it came from
    so the origin stays recoverable in the flattened output.

    Args:
        source_path: Location of the Excel file on disk.
        file_type: Lower-case source type; must equal XLS_FILE_TYPE or
            XLSX_FILE_TYPE.

    Returns:
        The CSV text produced by ``csv.writer`` with its default dialect.

    Raises:
        HTTPException: 400 when ``file_type`` is not an Excel type; 500 when
            the required reader library (openpyxl / xlrd) is not installed
            or when reading/converting the workbook fails.
    """
    # newline="" keeps the line terminators emitted by csv.writer untouched.
    output = StringIO(newline="")
    writer = csv.writer(output)
    try:
        if file_type == XLSX_FILE_TYPE:
            # Imported lazily so a missing optional dependency surfaces as
            # an HTTP error instead of breaking module import.
            try:
                from openpyxl import load_workbook
            except ImportError as exc:
                raise HTTPException(status_code=500, detail="缺少 openpyxl 依赖") from exc
            workbook = load_workbook(filename=str(source_path), read_only=True, data_only=True)
            try:
                # ``workbook.sheetnames`` also lists chart sheets, which have
                # no cell grid (no ``iter_rows``); ``worksheets`` contains
                # only real worksheets, so a chart sheet no longer aborts
                # the whole conversion.
                worksheets = workbook.worksheets
                include_sheet_name = len(worksheets) > 1
                for sheet in worksheets:
                    for row in sheet.iter_rows(values_only=True):
                        row_values = list(row)
                        if include_sheet_name:
                            row_values.insert(0, sheet.title)
                        writer.writerow(row_values)
            finally:
                # The workbook is opened with read_only=True; always close it.
                workbook.close()
        elif file_type == XLS_FILE_TYPE:
            try:
                import xlrd
            except ImportError as exc:
                raise HTTPException(status_code=500, detail="缺少 xlrd 依赖") from exc
            workbook = xlrd.open_workbook(str(source_path))
            include_sheet_name = len(workbook.sheet_names()) > 1
            for sheet_index in range(workbook.nsheets):
                sheet = workbook.sheet_by_index(sheet_index)
                for row_index in range(sheet.nrows):
                    row_values = sheet.row_values(row_index)
                    if include_sheet_name:
                        row_values = [sheet.name, *row_values]
                    writer.writerow(row_values)
        else:
            raise HTTPException(status_code=400, detail="不支持的 Excel 文件格式")
    except HTTPException:
        # Already a well-formed API error (missing dependency / bad type).
        raise
    except Exception as exc:
        logger.error("Excel 转 CSV 失败: %s", exc)
        raise HTTPException(status_code=500, detail="Excel 转 CSV 失败") from exc
    return output.getvalue()
@staticmethod @staticmethod
def _write_text_file(target_path: Path, content: str) -> None: def _write_text_file(target_path: Path, content: str) -> None:
with open(target_path, "w", encoding="utf-8") as handle: with open(target_path, "w", encoding="utf-8") as handle:
@@ -177,8 +261,10 @@ class PdfTextExtractService:
target_path: Path, target_path: Path,
file_size: int, file_size: int,
parser_name: str, parser_name: str,
derived_file_type: str,
) -> DatasetFiles: ) -> DatasetFiles:
assert parser_name assert parser_name
assert derived_file_type
metadata = { metadata = {
DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")), DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")),
DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")), DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")),
@@ -189,7 +275,7 @@ class PdfTextExtractService:
dataset_id=dataset.id, # type: ignore[arg-type] dataset_id=dataset.id, # type: ignore[arg-type]
file_name=target_path.name, file_name=target_path.name,
file_path=str(target_path), file_path=str(target_path),
file_type=TEXT_FILE_TYPE, file_type=derived_file_type,
file_size=file_size, file_size=file_size,
dataset_filemetadata=metadata, dataset_filemetadata=metadata,
last_access_time=datetime.datetime.now(datetime.UTC), last_access_time=datetime.datetime.now(datetime.UTC),