feat: enhance dataset file fetching with improved pagination and document loading support (#156)
@@ -46,7 +46,7 @@ public class DatasetFileController {
             @PathVariable("datasetId") String datasetId,
             @RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
            @RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
-            @RequestParam(value = "prefix", required = false) String prefix) {
+            @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix) {
        PagingQuery pagingQuery = new PagingQuery(page, size);
        PagedResponse<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(
                datasetId, prefix, pagingQuery);
@@ -78,6 +78,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
 
   const fetchDatasets = async () => {
     const { data } = await queryDatasetsUsingGet({
+      // Ant Design Table pagination.current is 1-based; ensure backend also receives 1-based value
       page: datasetPagination.current,
       size: datasetPagination.pageSize,
       keyword: datasetSearch,
@@ -98,12 +99,19 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     300
   );
 
-  const fetchFiles = useCallback(async () => {
-    if (!selectedDataset) return;
-    const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
-      page: filesPagination.current - 1,
-      size: filesPagination.pageSize,
-      keyword: filesSearch,
-    });
+  const fetchFiles = useCallback(
+    async (
+      options?: Partial<{ page: number; pageSize: number; keyword: string }>
+    ) => {
+      if (!selectedDataset) return;
+      const page = options?.page ?? filesPagination.current;
+      const pageSize = options?.pageSize ?? filesPagination.pageSize;
+      const keyword = options?.keyword ?? filesSearch;
+
+      const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
+        page,
+        size: pageSize,
+        keyword,
+      });
     setFiles(
       (data.content || []).map((item: DatasetFile) => ({
@@ -114,13 +122,26 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     );
     setFilesPagination((prev) => ({
       ...prev,
+      current: page,
+      pageSize,
       total: data.totalElements,
     }));
-  }, [filesPagination.current, filesPagination.pageSize, filesSearch, selectedDataset]);
+    },
+    [selectedDataset, filesPagination.current, filesPagination.pageSize, filesSearch]
+  );
 
   useEffect(() => {
-    fetchFiles().catch(() => {});
-  }, [fetchFiles]);
+    // When the selected dataset changes, reset the file pagination and fetch the first page, avoiding extra request loops
+    if (selectedDataset) {
+      setFilesPagination({ current: 1, pageSize: 10, total: 0 });
+      fetchFiles({ page: 1, pageSize: 10 }).catch(() => {});
+    } else {
+      setFiles([]);
+      setFilesPagination({ current: 1, pageSize: 10, total: 0 });
+    }
+    // Only re-run when selectedDataset changes
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [selectedDataset]);
 
   useEffect(() => {
     onDatasetSelect?.(selectedDataset);
@@ -238,7 +259,18 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
              size="small"
              dataSource={files}
              columns={fileCols.slice(1, fileCols.length)}
-              pagination={filesPagination}
+              pagination={{
+                ...filesPagination,
+                onChange: (page, pageSize) => {
+                  const nextPageSize = pageSize || filesPagination.pageSize;
+                  setFilesPagination((prev) => ({
+                    ...prev,
+                    current: page,
+                    pageSize: nextPageSize,
+                  }));
+                  fetchFiles({ page, pageSize: nextPageSize }).catch(() => {});
+                },
+              }}
              onRow={(record: DatasetFile) => ({
                onClick: () => toggleSelectFile(record),
              })}
@@ -247,7 +279,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
                selectedRowKeys: Object.keys(selectedFilesMap),
 
                // Single select
-                onSelect: (record: DatasetFile, selected: boolean) => {
+                onSelect: (record: DatasetFile) => {
                  toggleSelectFile(record);
                },
 
@@ -255,7 +287,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
                onSelectAll: (selected, selectedRows: DatasetFile[]) => {
                  if (selected) {
                    // ✔ Select all -> add every file in the files list to selectedFilesMap
-                    const newMap: Record<string, DatasetFile> = {};
+                    const newMap: Record<string, DatasetFile> = { ...selectedFilesMap };
                    selectedRows.forEach((f) => {
                      newMap[f.id] = f;
                    });
@@ -264,7 +296,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
                    // ✘ Deselect all -> clear the map
                    const newMap = { ...selectedFilesMap };
                    Object.keys(newMap).forEach((id) => {
-                      if (files.find((f) => f.id === id)) {
+                      if (files.some((f) => String(f.id) === id)) {
                        // Only remove files belonging to the current page
                        delete newMap[id];
                      }
@@ -277,15 +309,6 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
                  name: record.fileName,
                }),
              }}
-
-              // rowSelection={{
-              //   type: "checkbox",
-              //   selectedRowKeys: Object.keys(selectedFilesMap),
-              //   onSelect: toggleSelectFile,
-              //   getCheckboxProps: (record: DatasetFile) => ({
-              //     name: record.fileName,
-              //   }),
-              // }}
            />
          </div>
        </div>
runtime/datamate-python/app/common/__init__.py (new file, +3)
@@ -0,0 +1,3 @@
+"""
+Common module
+"""
runtime/datamate-python/app/common/document_loaders.py (new file, +93)
@@ -0,0 +1,93 @@
+from typing import List, Union, Optional
+from pathlib import Path
+
+from langchain_core.documents import Document
+from langchain_community.document_loaders import (
+    TextLoader,
+    JSONLoader,
+    CSVLoader,
+    UnstructuredMarkdownLoader,
+    PyPDFLoader,
+    Docx2txtLoader
+)
+
+from app.core.logging import get_logger
+
+log = get_logger(__name__)
+
+class UniversalDocLoader:
+    """
+    Universal loader for common plain-text style documents
+    Supported formats: TXT / JSON / CSV / Markdown / Word (.docx/.doc) / PDF
+    """
+    # Format-to-loader mapping (lightweight loaders preferred)
+    SUPPORTED_FORMATS = {
+        # Plain text
+        ".txt": TextLoader,
+        ".json": JSONLoader,
+        ".csv": CSVLoader,
+        ".md": UnstructuredMarkdownLoader,
+        # Office documents
+        ".docx": Docx2txtLoader,
+        ".doc": Docx2txtLoader,
+        # PDF
+        ".pdf": PyPDFLoader
+    }
+
+    def __init__(self, file_path: Union[str, Path]):
+        self.file_path = Path(file_path).resolve()
+        self.file_suffix = self.file_path.suffix.lower()
+        log.info(f"Initializing document loader: {self.file_path} (format: {self.file_suffix})")
+        self._validate_file()
+
+    def _validate_file(self) -> None:
+        """Validate that the file exists and that its format is supported"""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if self.file_suffix not in self.SUPPORTED_FORMATS:
+            raise ValueError(
+                f"Unsupported format: {self.file_suffix} | supported formats: {list(self.SUPPORTED_FORMATS.keys())}"
+            )
+
+    def load(
+        self,
+        file_format: Optional[str] = None,
+        **loader_kwargs
+    ) -> List[Document]:
+        """
+        Load the document and return a list of LangChain Documents
+        :param file_format: manually specified format (e.g. ".pdf"); auto-detected by default
+        :param loader_kwargs: arguments forwarded to the underlying loader (e.g. jq_schema for JSONLoader)
+        :return: List[Document]
+        """
+        # Determine the target format
+        target_format = file_format.lower() if file_format else self.file_suffix
+        loader_cls = self.SUPPORTED_FORMATS[target_format]
+
+        # Fill in sensible default arguments for the chosen loader
+        loader_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)
+
+        # Instantiate the loader and load
+        loader = loader_cls(str(self.file_path), **loader_kwargs)
+        return loader.load()
+
+    @staticmethod
+    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
+        """Set default arguments per loader to simplify calls"""
+        if loader_cls == JSONLoader and "jq_schema" not in kwargs:
+            kwargs.setdefault("jq_schema", ".")
+            kwargs.setdefault("text_content", False)
+        if loader_cls == CSVLoader and "csv_args" not in kwargs:
+            kwargs["csv_args"] = {"delimiter": ","}
+        return kwargs
+
+
+# Convenience function for loading documents
+def load_documents(
+    file_path: Union[str, Path],
+    file_format: Optional[str] = None,
+    **loader_kwargs
+) -> List[Document]:
+    """Convenience helper that loads a document in one call"""
+    loader = UniversalDocLoader(file_path)
+    return loader.load(file_format=file_format, **loader_kwargs)
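For orientation, a minimal usage sketch of the new loader module; the file paths, the encoding argument, and the printed fields are illustrative and not part of the commit:

from app.common.document_loaders import UniversalDocLoader, load_documents

# Convenience helper: the loader class is picked from the file extension (.txt/.json/.csv/.md/.docx/.doc/.pdf)
docs = load_documents("/tmp/sample.md")  # hypothetical path

# Or instantiate the class directly and forward loader-specific kwargs
loader = UniversalDocLoader("/tmp/notes.txt")  # hypothetical path
docs = loader.load(encoding="utf-8")  # kwargs are passed straight to TextLoader

for doc in docs[:3]:
    print(doc.metadata, doc.page_content[:80])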
runtime/datamate-python/app/common/text_split.py (new empty file, +0)
@@ -1,20 +1,8 @@
 import asyncio
-import uuid
 import json
+import uuid
 from pathlib import Path
 
-from langchain_community.document_loaders import (
-    TextLoader,
-    CSVLoader,
-    JSONLoader,
-    UnstructuredMarkdownLoader,
-    UnstructuredHTMLLoader,
-    UnstructuredFileLoader,
-    PyPDFLoader,
-    UnstructuredWordDocumentLoader,
-    UnstructuredPowerPointLoader,
-    UnstructuredExcelLoader,
-)
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -30,6 +18,7 @@ from app.db.models.model_config import get_model_by_id
 from app.db.session import logger
 from app.module.shared.util.model_chat import _extract_json_substring
 from app.module.system.service.common_service import get_chat_client, chat
+from app.common.document_loaders import load_documents
 
 
 class GenerationService:
@@ -250,8 +239,7 @@ class GenerationService:
 
        Preserve each Document's metadata so that information such as file IDs and chunk indexes can be appended later.
        """
-        loader = self._build_loader(file_path)
-        docs = loader.load()
+        docs = load_documents(file_path)
 
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
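The splitting step now delegates loading to the shared module; a rough standalone sketch of the same flow, where the path and chunk parameters are illustrative and the service supplies its own configured values:

from langchain_text_splitters import RecursiveCharacterTextSplitter
from app.common.document_loaders import load_documents

docs = load_documents("/tmp/manual.pdf")      # replaces the old self._build_loader(...).load()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,        # illustrative value; the service passes its own chunk_size
    chunk_overlap=80,
)
split_docs = splitter.split_documents(docs)   # each Document's metadata is preserved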
@@ -262,67 +250,6 @@ class GenerationService:
        split_docs = splitter.split_documents(docs)
        return split_docs
 
-    @staticmethod
-    def _build_loader(file_path: str):
-        """Pick a suitable LangChain text loader based on the file extension, covering the common plain-text formats.
-
-        Prefer a format-specific loader and fall back to TextLoader when nothing matches.
-        """
-        path = Path(file_path)
-        suffix = path.suffix.lower()
-        path_str = str(path)
-
-        # 1. Plain text
-        if suffix in {".txt", "", ".log"}:  # "" covers files without an extension
-            return TextLoader(path_str, encoding="utf-8")
-
-        # 2. Markdown
-        if suffix in {".md", ".markdown"}:
-            # UnstructuredMarkdownLoader preserves more structural information
-            return UnstructuredMarkdownLoader(path_str)
-
-        # 3. HTML / HTM
-        if suffix in {".html", ".htm"}:
-            return UnstructuredHTMLLoader(path_str)
-
-        # 4. JSON
-        if suffix == ".json":
-            # Use JSONLoader to expand the JSON content into documents
-            # The default jq_schema is used here; adjust it for finer-grained extraction later
-            return JSONLoader(file_path=path_str, jq_schema=".")
-
-        # 5. CSV / TSV
-        if suffix in {".csv", ".tsv"}:
-            # CSVLoader turns each row into one Document by default
-            return CSVLoader(file_path=path_str)
-
-        # 6. YAML
-        if suffix in {".yaml", ".yml"}:
-            # Loaded as plain text for now
-            return TextLoader(path_str, encoding="utf-8")
-
-        # 7. PDF
-        if suffix == ".pdf":
-            return PyPDFLoader(path_str)
-
-        # 8. Word documents
-        if suffix in {".docx", ".doc"}:
-            # UnstructuredWordDocumentLoader extracts text from .docx/.doc
-            return UnstructuredWordDocumentLoader(path_str)
-
-        # 9. PowerPoint
-        if suffix in {".ppt", ".pptx"}:
-            return UnstructuredPowerPointLoader(path_str)
-
-        # 10. Excel
-        if suffix in {".xls", ".xlsx"}:
-            return UnstructuredExcelLoader(path_str)
-
-        # 11. Fallback: use UnstructuredFileLoader, or TextLoader for plain text
-        try:
-            return UnstructuredFileLoader(path_str)
-        except Exception:
-            return TextLoader(path_str, encoding="utf-8")
-
    @staticmethod
    def _build_qa_prompt(chunk: str, synthesis_cfg: dict) -> str:
@@ -32,7 +32,7 @@ RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
    poetry install --no-root --only main
 
 # Download NLTK data
-RUN python -c "import nltk; nltk.download('punkt_tab', download_dir='/usr/local/nltk_data')"
+RUN python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/usr/local/nltk_data')"
 ENV NLTK_DATA=/usr/local/nltk_data
 
 # Copy the rest of the application
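Since the tagger corpus is new, a quick sanity check along these lines can be run inside the built image to confirm both resources resolve under NLTK_DATA; this is a sketch and not part of the commit:

import nltk

# Raises LookupError if a resource is missing from /usr/local/nltk_data
nltk.data.find("tokenizers/punkt_tab")
nltk.data.find("taggers/averaged_perceptron_tagger_eng")
print("NLTK data OK")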