diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
index 1d1a8bf..d1ac0bd 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
@@ -46,7 +46,7 @@ public class DatasetFileController {
             @PathVariable("datasetId") String datasetId,
             @RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
             @RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
-            @RequestParam(value = "prefix", required = false) String prefix) {
+            @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix) {
         PagingQuery pagingQuery = new PagingQuery(page, size);
         PagedResponse filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(
                 datasetId, prefix, pagingQuery);
diff --git a/frontend/src/components/business/DatasetFileTransfer.tsx b/frontend/src/components/business/DatasetFileTransfer.tsx
index 24e3642..92284cb 100644
--- a/frontend/src/components/business/DatasetFileTransfer.tsx
+++ b/frontend/src/components/business/DatasetFileTransfer.tsx
@@ -78,6 +78,7 @@ const DatasetFileTransfer: React.FC = ({
   const fetchDatasets = async () => {
     const { data } = await queryDatasetsUsingGet({
+      // Ant Design Table pagination.current is 1-based; make sure the backend also receives a 1-based value
       page: datasetPagination.current,
       size: datasetPagination.pageSize,
       keyword: datasetSearch,
@@ -98,29 +99,49 @@ const DatasetFileTransfer: React.FC = ({
     300
   );
 
-  const fetchFiles = useCallback(async () => {
-    if (!selectedDataset) return;
-    const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
-      page: filesPagination.current - 1,
-      size: filesPagination.pageSize,
-      keyword: filesSearch,
-    });
-    setFiles(
-      (data.content || []).map((item: DatasetFile) => ({
-        ...item,
-        key: item.id,
-        datasetName: selectedDataset.name,
-      }))
-    );
-    setFilesPagination((prev) => ({
-      ...prev,
-      total: data.totalElements,
-    }));
-  }, [filesPagination.current, filesPagination.pageSize, filesSearch, selectedDataset]);
+  const fetchFiles = useCallback(
+    async (
+      options?: Partial<{ page: number; pageSize: number; keyword: string }>
+    ) => {
+      if (!selectedDataset) return;
+      const page = options?.page ?? filesPagination.current;
+      const pageSize = options?.pageSize ?? filesPagination.pageSize;
+      const keyword = options?.keyword ?? filesSearch;
+
+      const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
+        page,
+        size: pageSize,
+        keyword,
+      });
+      setFiles(
+        (data.content || []).map((item: DatasetFile) => ({
+          ...item,
+          key: item.id,
+          datasetName: selectedDataset.name,
+        }))
+      );
+      setFilesPagination((prev) => ({
+        ...prev,
+        current: page,
+        pageSize,
+        total: data.totalElements,
+      }));
+    },
+    [selectedDataset, filesPagination.current, filesPagination.pageSize, filesSearch]
+  );
 
   useEffect(() => {
-    fetchFiles().catch(() => {});
-  }, [fetchFiles]);
+    // When the selected dataset changes, reset the file pagination and fetch page 1, avoiding an extra request loop
+    if (selectedDataset) {
+      setFilesPagination({ current: 1, pageSize: 10, total: 0 });
+      fetchFiles({ page: 1, pageSize: 10 }).catch(() => {});
+    } else {
+      setFiles([]);
+      setFilesPagination({ current: 1, pageSize: 10, total: 0 });
+    }
+    // only re-run when selectedDataset changes
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [selectedDataset]);
 
   useEffect(() => {
     onDatasetSelect?.(selectedDataset);
@@ -238,7 +259,18 @@ const DatasetFileTransfer: React.FC = ({
           size="small"
           dataSource={files}
           columns={fileCols.slice(1, fileCols.length)}
-          pagination={filesPagination}
+          pagination={{
+            ...filesPagination,
+            onChange: (page, pageSize) => {
+              const nextPageSize = pageSize || filesPagination.pageSize;
+              setFilesPagination((prev) => ({
+                ...prev,
+                current: page,
+                pageSize: nextPageSize,
+              }));
+              fetchFiles({ page, pageSize: nextPageSize }).catch(() => {});
+            },
+          }}
           onRow={(record: DatasetFile) => ({
             onClick: () => toggleSelectFile(record),
           })}
@@ -247,7 +279,7 @@ const DatasetFileTransfer: React.FC = ({
             selectedRowKeys: Object.keys(selectedFilesMap),
 
             // single select
-            onSelect: (record: DatasetFile, selected: boolean) => {
+            onSelect: (record: DatasetFile) => {
               toggleSelectFile(record);
             },
 
@@ -255,7 +287,7 @@ const DatasetFileTransfer: React.FC = ({
             onSelectAll: (selected, selectedRows: DatasetFile[]) => {
               if (selected) {
                 // ✔ select all -> add every file in the list to selectedFilesMap
-                const newMap: Record<string, DatasetFile> = {};
+                const newMap: Record<string, DatasetFile> = { ...selectedFilesMap };
                 selectedRows.forEach((f) => {
                   newMap[f.id] = f;
                 });
@@ -264,7 +296,7 @@ const DatasetFileTransfer: React.FC = ({
               // ✘ deselect all -> clear the map
               const newMap = { ...selectedFilesMap };
               Object.keys(newMap).forEach((id) => {
-                if (files.find((f) => f.id === id)) {
+                if (files.some((f) => String(f.id) === id)) {
                   // only remove the files belonging to the current page
                   delete newMap[id];
                 }
@@ -277,15 +309,6 @@ const DatasetFileTransfer: React.FC = ({
               name: record.fileName,
             }),
           }}
-
-          // rowSelection={{
-          //   type: "checkbox",
-          //   selectedRowKeys: Object.keys(selectedFilesMap),
-          //   onSelect: toggleSelectFile,
-          //   getCheckboxProps: (record: DatasetFile) => ({
-          //     name: record.fileName,
-          //   }),
-          // }}
         />
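Reviewer note: to exercise the paging contract by hand, here is a minimal sketch. The host, route shape, and dataset id below are assumptions, not taken from this repo; only the page/size/prefix parameters mirror the controller above, where page is 0-based (defaultValue = "0") and prefix now defaults to "".

```python
# Hypothetical smoke test for the files endpoint; adjust host/route/id to the real deployment.
import requests

BASE_URL = "http://localhost:8080"  # assumption: local gateway
dataset_id = "demo-dataset-id"      # hypothetical id

resp = requests.get(
    f"{BASE_URL}/datasets/{dataset_id}/files",     # hypothetical route shape
    params={"page": 0, "size": 20, "prefix": ""},  # page is 0-based; prefix now defaults to ""
)
resp.raise_for_status()
print(resp.json())
```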
diff --git a/runtime/datamate-python/app/common/__init__.py b/runtime/datamate-python/app/common/__init__.py
new file mode 100644
index 0000000..03cb3be
--- /dev/null
+++ b/runtime/datamate-python/app/common/__init__.py
@@ -0,0 +1,3 @@
+"""
+Common shared modules.
+"""
diff --git a/runtime/datamate-python/app/common/document_loaders.py b/runtime/datamate-python/app/common/document_loaders.py
new file mode 100644
index 0000000..b6f045b
--- /dev/null
+++ b/runtime/datamate-python/app/common/document_loaders.py
@@ -0,0 +1,93 @@
+from typing import List, Union, Optional
+from pathlib import Path
+
+from langchain_core.documents import Document
+from langchain_community.document_loaders import (
+    TextLoader,
+    JSONLoader,
+    CSVLoader,
+    UnstructuredMarkdownLoader,
+    PyPDFLoader,
+    Docx2txtLoader
+)
+
+from app.core.logging import get_logger
+
+log = get_logger(__name__)
+
+
+class UniversalDocLoader:
+    """
+    Universal loader for plain-text-like documents.
+    Supported formats: TXT / JSON / CSV / Markdown / Word (.docx) / PDF
+    """
+    # format -> loader mapping (lightweight loaders preferred)
+    SUPPORTED_FORMATS = {
+        # plain-text formats
+        ".txt": TextLoader,
+        ".json": JSONLoader,
+        ".csv": CSVLoader,
+        ".md": UnstructuredMarkdownLoader,
+        # office documents
+        ".docx": Docx2txtLoader,
+        # note: Docx2txtLoader only parses .docx; legacy .doc files will fail at load time
+        ".doc": Docx2txtLoader,
+        # PDF
+        ".pdf": PyPDFLoader
+    }
+
+    def __init__(self, file_path: Union[str, Path]):
+        self.file_path = Path(file_path).resolve()
+        self.file_suffix = self.file_path.suffix.lower()
+        log.info(f"Initializing document loader: {self.file_path} (format: {self.file_suffix})")
+        self._validate_file()
+
+    def _validate_file(self) -> None:
+        """Validate that the file exists and its format is supported."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File does not exist: {self.file_path}")
+        if self.file_suffix not in self.SUPPORTED_FORMATS:
+            raise ValueError(
+                f"Unsupported format: {self.file_suffix} | supported formats: {list(self.SUPPORTED_FORMATS.keys())}"
+            )
+
+    def load(
+        self,
+        file_format: Optional[str] = None,
+        **loader_kwargs
+    ) -> List[Document]:
+        """
+        Load the document and return a list of LangChain Documents.
+        :param file_format: manually specified format (e.g. ".pdf"); auto-detected from the suffix by default
+        :param loader_kwargs: arguments forwarded to the underlying loader (e.g. jq_schema for JSONLoader)
+        :return: List[Document]
+        """
+        # determine the target format; validate manual overrides too, since
+        # _validate_file only checks the file's own suffix
+        target_format = file_format.lower() if file_format else self.file_suffix
+        if target_format not in self.SUPPORTED_FORMATS:
+            raise ValueError(
+                f"Unsupported format: {target_format} | supported formats: {list(self.SUPPORTED_FORMATS.keys())}"
+            )
+        loader_cls = self.SUPPORTED_FORMATS[target_format]
+
+        # apply sensible per-loader defaults
+        loader_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)
+
+        # instantiate and load
+        loader = loader_cls(str(self.file_path), **loader_kwargs)
+        return loader.load()
+
+    @staticmethod
+    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
+        """Set defaults for the individual loaders to simplify call sites."""
+        if loader_cls is JSONLoader:
+            kwargs.setdefault("jq_schema", ".")
+            kwargs.setdefault("text_content", False)
+        if loader_cls is CSVLoader:
+            kwargs.setdefault("csv_args", {"delimiter": ","})
+        return kwargs
+
+
+# convenience wrapper around UniversalDocLoader
+def load_documents(
+    file_path: Union[str, Path],
+    file_format: Optional[str] = None,
+    **loader_kwargs
+) -> List[Document]:
+    """Convenience function for loading a document in one call."""
+    loader = UniversalDocLoader(file_path)
+    return loader.load(file_format=file_format, **loader_kwargs)
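Reviewer note: a quick usage sketch for the new module. The file paths are illustrative; the jq_schema override is simply a loader kwarg that load() forwards to JSONLoader.

```python
from app.common.document_loaders import UniversalDocLoader, load_documents

# format auto-detected from the suffix (illustrative paths)
docs = load_documents("./samples/report.pdf")

# forward loader-specific kwargs, e.g. one JSON array item per Document
json_docs = load_documents("./samples/records.json", jq_schema=".items[]")

# or drive the class directly and force a format explicitly
loader = UniversalDocLoader("./samples/notes.txt")
txt_docs = loader.load(file_format=".txt")
print(len(txt_docs), txt_docs[0].metadata)
```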
diff --git a/runtime/datamate-python/app/common/text_split.py b/runtime/datamate-python/app/common/text_split.py
new file mode 100644
index 0000000..e69de29
diff --git a/runtime/datamate-python/app/module/generation/service/generation_service.py b/runtime/datamate-python/app/module/generation/service/generation_service.py
index 1c9996b..b216839 100644
--- a/runtime/datamate-python/app/module/generation/service/generation_service.py
+++ b/runtime/datamate-python/app/module/generation/service/generation_service.py
@@ -1,20 +1,8 @@
 import asyncio
-import uuid
 import json
+import uuid
 from pathlib import Path
 
-from langchain_community.document_loaders import (
-    TextLoader,
-    CSVLoader,
-    JSONLoader,
-    UnstructuredMarkdownLoader,
-    UnstructuredHTMLLoader,
-    UnstructuredFileLoader,
-    PyPDFLoader,
-    UnstructuredWordDocumentLoader,
-    UnstructuredPowerPointLoader,
-    UnstructuredExcelLoader,
-)
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -30,6 +18,7 @@ from app.db.models.model_config import get_model_by_id
 from app.db.session import logger
 from app.module.shared.util.model_chat import _extract_json_substring
 from app.module.system.service.common_service import get_chat_client, chat
+from app.common.document_loaders import load_documents
 
 
 class GenerationService:
@@ -250,8 +239,7 @@ class GenerationService:
 
         Preserve each Document's metadata so that information such as the file ID and chunk index can be appended later.
         """
-        loader = self._build_loader(file_path)
-        docs = loader.load()
+        docs = load_documents(file_path)
 
         splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
@@ -262,67 +250,6 @@ class GenerationService:
         split_docs = splitter.split_documents(docs)
         return split_docs
 
-    @staticmethod
-    def _build_loader(file_path: str):
-        """Pick a suitable LangChain loader based on the file extension, covering common plain-text-like formats.
-
-        Prefer a format-specific loader; fall back to TextLoader when nothing matches.
-        """
-        path = Path(file_path)
-        suffix = path.suffix.lower()
-        path_str = str(path)
-
-        # 1. plain text
-        if suffix in {".txt", "", ".log"}:  # "" covers files without an extension
-            return TextLoader(path_str, encoding="utf-8")
-
-        # 2. Markdown
-        if suffix in {".md", ".markdown"}:
-            # UnstructuredMarkdownLoader preserves more structural information
-            return UnstructuredMarkdownLoader(path_str)
-
-        # 3. HTML / HTM
-        if suffix in {".html", ".htm"}:
-            return UnstructuredHTMLLoader(path_str)
-
-        # 4. JSON
-        if suffix == ".json":
-            # use JSONLoader to expand the JSON content into documents
-            # the default jq_schema is used here; adjust it for finer-grained extraction later
-            return JSONLoader(file_path=path_str, jq_schema=".")
-
-        # 5. CSV / TSV
-        if suffix in {".csv", ".tsv"}:
-            # CSVLoader treats each row as one Document by default
-            return CSVLoader(file_path=path_str)
-
-        # 6. YAML
-        if suffix in {".yaml", ".yml"}:
-            # load as plain text for now
-            return TextLoader(path_str, encoding="utf-8")
-
-        # 7. PDF
-        if suffix == ".pdf":
-            return PyPDFLoader(path_str)
-
-        # 8. Word documents
-        if suffix in {".docx", ".doc"}:
-            # UnstructuredWordDocumentLoader supports .docx/.doc text extraction
-            return UnstructuredWordDocumentLoader(path_str)
-
-        # 9. PowerPoint
-        if suffix in {".ppt", ".pptx"}:
-            return UnstructuredPowerPointLoader(path_str)
-
-        # 10. Excel
-        if suffix in {".xls", ".xlsx"}:
-            return UnstructuredExcelLoader(path_str)
-
-        # 11. fallback: UnstructuredFileLoader, else plain TextLoader
-        try:
-            return UnstructuredFileLoader(path_str)
-        except Exception:
-            return TextLoader(path_str, encoding="utf-8")
 
     @staticmethod
     def _build_qa_prompt(chunk: str, synthesis_cfg: dict) -> str:
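Reviewer note: with the loader factory removed, the load-then-split flow reduces to roughly the sketch below. The path and chunk sizes are illustrative; the service passes its own chunk_size/chunk_overlap, and the separators are whatever RecursiveCharacterTextSplitter defaults to.

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app.common.document_loaders import load_documents

docs = load_documents("./samples/report.pdf")  # illustrative path
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,   # illustrative; the real values come from the caller
    chunk_overlap=50,
)
split_docs = splitter.split_documents(docs)
# per-Document metadata survives the split, so file IDs / chunk indices can be appended later
print(len(split_docs), split_docs[0].metadata)
```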
diff --git a/scripts/images/backend-python/Dockerfile b/scripts/images/backend-python/Dockerfile
index 2a25af0..9cd9f3d 100644
--- a/scripts/images/backend-python/Dockerfile
+++ b/scripts/images/backend-python/Dockerfile
@@ -32,7 +32,7 @@ RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
     poetry install --no-root --only main
 
 # Download NLTK data
-RUN python -c "import nltk; nltk.download('punkt_tab', download_dir='/usr/local/nltk_data')"
+RUN python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/usr/local/nltk_data')"
 ENV NLTK_DATA=/usr/local/nltk_data
 
 # Copy the rest of the application
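Reviewer note: nltk.download() accepts a list of package ids, so both datasets land in a single RUN layer. A build-time sanity check along these lines (the resource paths follow the standard NLTK layout for these two packages) would fail the image build early if a download silently breaks:

```python
# sketch: could run as an extra `python -c` step after the download above
import nltk

nltk.data.path.append("/usr/local/nltk_data")  # matches NLTK_DATA in the image
nltk.data.find("tokenizers/punkt_tab")                    # raises LookupError if missing
nltk.data.find("taggers/averaged_perceptron_tagger_eng")  # ditto
print("NLTK data OK")
```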