Compare commits


2 Commits

Author SHA1 Message Date
b5d7c66240 feat(data-management): extend source-document exclusion to support Excel file types
- Extend the source-document type check in the backend service to support XLS and XLSX files
- Update the filtering logic in DatasetFileApplicationService to handle all source-document types uniformly
- Add isSourceDocument and isDerivedFile helper methods for file-type checks
- Update the explanatory comments in the frontend DatasetFileTransfer component
- Add the openpyxl and xlrd libraries to the Python runtime dependencies for Excel file handling
- Update the source-document type set defined in the annotation-project API
- Update the derived-file exclusion logic in the file-operations hook
2026-01-31 11:30:55 +08:00
6c7ea0c25e chore(deps): update Docker image mirror addresses
- Switch the etcd image source from quay.io to quay.nju.edu.cn
- Switch the vLLM-Ascend image source from quay.io to quay.nju.edu.cn
- Standardize on the Nanjing University mirror registry to improve download speed
2026-01-31 11:21:47 +08:00
10 changed files with 213 additions and 122 deletions
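The first commit's exclusion rule boils down to a case-insensitive membership test against a file-type set that now includes the two Excel extensions. A minimal sketch of that check, with illustrative names (the service itself uses the DOCUMENT_TEXT_FILE_TYPES constant and an isSourceDocument helper, shown in the diffs below):

import java.util.Locale;
import java.util.Set;

public class SourceDocumentTypesSketch {
    // Mirrors the file-type set from the diffs; "xls" and "xlsx" are the new entries
    static final Set<String> SOURCE_DOCUMENT_FILE_TYPES =
            Set.of("pdf", "doc", "docx", "xls", "xlsx");

    static boolean isSourceDocument(String fileType) {
        if (fileType == null || fileType.isBlank()) {
            return false;
        }
        // Case-insensitive match, as in the service's helper
        return SOURCE_DOCUMENT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT));
    }

    public static void main(String[] args) {
        System.out.println(isSourceDocument("XLSX")); // true -- newly excluded
        System.out.println(isSourceDocument("txt"));  // false -- derived text stays visible
    }
}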

DatasetFileApplicationService.java

@@ -28,6 +28,7 @@ import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
 import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
 import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
 import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
+import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import jakarta.servlet.http.HttpServletResponse;
@@ -79,6 +80,7 @@ public class DatasetFileApplicationService {
     XLS_FILE_TYPE,
     XLSX_FILE_TYPE
 );
+private static final String DERIVED_METADATA_KEY = "derived_from_file_id";
 private final DatasetFileRepository datasetFileRepository;
 private final DatasetRepository datasetRepository;
@@ -119,7 +121,7 @@ public class DatasetFileApplicationService {
  * @param status status filter
  * @param name fuzzy file-name query
  * @param hasAnnotation whether the file has annotations
- * @param excludeSourceDocuments whether to exclude source documents already converted to TXT (PDF/DOC/DOCX)
+ * @param excludeSourceDocuments whether to exclude source documents (PDF/DOC/DOCX/XLS/XLSX)
  * @param pagingQuery paging parameters
  * @return paged file list
  */
@@ -130,19 +132,15 @@ public class DatasetFileApplicationService {
 IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
 if (excludeSourceDocuments) {
-    // Look up the IDs of all document files that served as sources for derived TXT files
-    List<String> sourceFileIds = datasetFileRepository.findSourceFileIdsWithDerivedFiles(datasetId);
-    if (!sourceFileIds.isEmpty()) {
-        // Filter out the source files
-        List<DatasetFile> filteredRecords = files.getRecords().stream()
-            .filter(file -> !sourceFileIds.contains(file.getId()))
-            .collect(Collectors.toList());
+    // Filter out source-document files (PDF/DOC/DOCX/XLS/XLSX) so annotation scenarios only show derived files
+    List<DatasetFile> filteredRecords = files.getRecords().stream()
+        .filter(file -> !isSourceDocument(file))
+        .collect(Collectors.toList());
     // Rebuild the paged result
     Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
     filteredPage.setRecords(filteredRecords);
     return PagedResponse.of(filteredPage);
-    }
 }
 return PagedResponse.of(files);
@@ -152,7 +150,7 @@ public class DatasetFileApplicationService {
  * Get the dataset file list
  */
 @Transactional(readOnly = true)
-public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, PagingQuery pagingQuery) {
+public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) {
     Dataset dataset = datasetRepository.getById(datasetId);
     int page = Math.max(pagingQuery.getPage(), 1);
     int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
@@ -163,9 +161,17 @@ public class DatasetFileApplicationService {
 Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
 Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
     .stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
+Set<String> derivedFilePaths = excludeDerivedFiles
+    ? datasetFilesMap.values().stream()
+        .filter(this::isDerivedFile)
+        .map(DatasetFile::getFilePath)
+        .filter(Objects::nonNull)
+        .collect(Collectors.toSet())
+    : Collections.emptySet();
 try (Stream<Path> pathStream = Files.list(queryPath)) {
     List<Path> allFiles = pathStream
         .filter(path -> path.toString().startsWith(datasetPath))
+        .filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString()))
         .sorted(Comparator
             .comparing((Path path) -> !Files.isDirectory(path))
             .thenComparing(path -> path.getFileName().toString()))
@@ -249,6 +255,35 @@ public class DatasetFileApplicationService {
     return datasetFile;
 }

+private boolean isSourceDocument(DatasetFile datasetFile) {
+    if (datasetFile == null) {
+        return false;
+    }
+    String fileType = datasetFile.getFileType();
+    if (fileType == null || fileType.isBlank()) {
+        return false;
+    }
+    return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT));
+}
+
+private boolean isDerivedFile(DatasetFile datasetFile) {
+    if (datasetFile == null) {
+        return false;
+    }
+    String metadata = datasetFile.getMetadata();
+    if (metadata == null || metadata.isBlank()) {
+        return false;
+    }
+    try {
+        ObjectMapper mapper = new ObjectMapper();
+        Map<String, Object> metadataMap = mapper.readValue(metadata, new TypeReference<Map<String, Object>>() {});
+        return metadataMap.get(DERIVED_METADATA_KEY) != null;
+    } catch (Exception e) {
+        log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e);
+        return false;
+    }
+}
 /**
  * Get file details
  */
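The new isDerivedFile helper treats a file as derived when its metadata JSON contains the derived_from_file_id key. A standalone sketch of that check, runnable outside the service (the sample JSON strings are made up; the Jackson calls mirror the helper above):

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Map;

public class DerivedFileCheckSketch {
    private static final String DERIVED_METADATA_KEY = "derived_from_file_id";
    // Reused mapper; the committed helper constructs one per call, which also works
    private static final ObjectMapper MAPPER = new ObjectMapper();

    static boolean hasDerivedMarker(String metadataJson) {
        if (metadataJson == null || metadataJson.isBlank()) {
            return false;
        }
        try {
            Map<String, Object> metadata =
                    MAPPER.readValue(metadataJson, new TypeReference<Map<String, Object>>() {});
            return metadata.get(DERIVED_METADATA_KEY) != null;
        } catch (Exception e) {
            return false; // unparsable metadata is treated as "not derived"
        }
    }

    public static void main(String[] args) {
        System.out.println(hasDerivedMarker("{\"derived_from_file_id\":\"f-123\"}")); // true
        System.out.println(hasDerivedMarker("{\"source\":\"upload\"}"));              // false
        System.out.println(hasDerivedMarker("not json"));                             // false
    }
}

ObjectMapper is thread-safe once configured, so a shared instance avoids per-call construction; the diff's per-invocation mapper is functionally equivalent.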

DatasetFileController.java

@@ -52,11 +52,17 @@ public class DatasetFileController {
     @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix,
     @RequestParam(value = "status", required = false) String status,
     @RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation,
-    @RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments) {
+    @RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments,
+    @RequestParam(value = "excludeDerivedFiles", required = false, defaultValue = "false") Boolean excludeDerivedFiles) {
     PagingQuery pagingQuery = new PagingQuery(page, size);
     PagedResponse<DatasetFile> filesPage;
     if (isWithDirectory) {
-        filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery);
+        filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(
+            datasetId,
+            prefix,
+            Boolean.TRUE.equals(excludeDerivedFiles),
+            pagingQuery
+        );
     } else {
         filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation,
             Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery);
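For reference, a hypothetical client call exercising the new flag. The host, port, dataset id, and the /datasets/{id}/files path are assumptions (the controller's request mapping is not part of this diff); only the query-parameter names come from the signature above:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class ListFilesSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        // Directory listing with derived files filtered out, e.g. for a TEXT dataset
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:8080/datasets/ds-1/files"
                        + "?isWithDirectory=true&prefix=&excludeDerivedFiles=true&page=1&size=20"))
                .GET()
                .build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode());
        System.out.println(response.body());
    }
}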

Docker Compose file (milvus-etcd)

@@ -1,7 +1,7 @@
 services:
   etcd:
     container_name: milvus-etcd
-    image: quay.io/coreos/etcd:v3.5.18
+    image: quay.nju.edu.cn/coreos/etcd:v3.5.18
     environment:
       - ETCD_AUTO_COMPACTION_MODE=revision
       - ETCD_AUTO_COMPACTION_RETENTION=1000

DatasetFileTransfer component (TypeScript)

@@ -23,8 +23,7 @@ interface DatasetFileTransferProps
   datasetTypeFilter?: DatasetType;
   hasAnnotationFilter?: boolean;
   /**
-   * Whether to exclude source-document files (PDF/DOC/DOCX) that have already been converted to TXT.
-   * Defaults to true; enabled automatically when datasetTypeFilter is TEXT.
+   * Whether to exclude source-document files (PDF/DOC/DOCX/XLS/XLSX); enabled by default for text annotation.
    */
   excludeSourceDocuments?: boolean;
 }

CreateAnnotationTask component (TypeScript)

@@ -282,7 +282,7 @@ export default function CreateAnnotationTask({
     }
     setDatasetPreviewLoading(true);
     try {
-      // For text datasets, exclude source-document files (PDF/DOC/DOCX) that have been converted to TXT
+      // For text datasets, exclude source-document files (PDF/DOC/DOCX/XLS/XLSX)
       const params: { page: number; size: number; excludeSourceDocuments?: boolean } = { page: 0, size: 10 };
       if (isTextDataset) {
         params.excludeSourceDocuments = true;

useFilesOperation hook (TypeScript)

@@ -2,6 +2,7 @@ import type {
   Dataset,
   DatasetFile,
 } from "@/pages/DataManagement/dataset.model";
+import { DatasetType } from "@/pages/DataManagement/dataset.model";
 import { App } from "antd";
 import { useState } from "react";
 import {
@@ -51,12 +52,14 @@ export function useFilesOperation(dataset: Dataset) {
   ) => {
     // If prefix was explicitly passed (including the empty string), use it; otherwise use the current pagination.prefix
     const targetPrefix = prefix !== undefined ? prefix : (pagination.prefix || '');
+    const shouldExcludeDerivedFiles = dataset?.datasetType === DatasetType.TEXT;
     const params: DatasetFilesQueryParams = {
       page: current !== undefined ? current : pagination.current,
       size: pageSize !== undefined ? pageSize : pagination.pageSize,
       isWithDirectory: true,
       prefix: targetPrefix,
+      ...(shouldExcludeDerivedFiles ? { excludeDerivedFiles: true } : {}),
     };
     const { data } = await queryDatasetFilesUsingGet(id!, params);
@@ -245,4 +248,5 @@ interface DatasetFilesQueryParams {
   size: number;
   isWithDirectory: boolean;
   prefix: string;
+  excludeDerivedFiles?: boolean;
 }

Label Studio mapping router (Python)

@@ -27,6 +27,7 @@ router = APIRouter(
 )
 logger = get_logger(__name__)
 TEXT_DATASET_TYPE = "TEXT"
+SOURCE_DOCUMENT_FILE_TYPES = {"pdf", "doc", "docx", "xls", "xlsx"}
 @router.get("/{mapping_id}/login")
 async def login_label_studio(
@@ -123,18 +124,14 @@ async def create_mapping(
     file_records = file_result.scalars().all()
     snapshot_file_ids: list[str] = []
     if dataset_type == TEXT_DATASET_TYPE:
-        derived_source_ids = set()
-        for file_record in file_records:
-            metadata = getattr(file_record, "dataset_filemetadata", None)
-            if isinstance(metadata, dict):
-                source_id = metadata.get("derived_from_file_id")
-                if source_id:
-                    derived_source_ids.add(str(source_id))
-        snapshot_file_ids = [
-            str(file_record.id)
-            for file_record in file_records
-            if file_record.id and str(file_record.id) not in derived_source_ids
-        ]
+        snapshot_file_ids = []
+        for file_record in file_records:
+            if not file_record.id:
+                continue
+            file_type = str(getattr(file_record, "file_type", "") or "").lower()
+            if file_type in SOURCE_DOCUMENT_FILE_TYPES:
+                continue
+            snapshot_file_ids.append(str(file_record.id))
     else:
         snapshot_file_ids = [
             str(file_record.id)

poetry.lock

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.3.1 and should not be changed by hand.
 [[package]]
 name = "aiofiles"
@@ -704,6 +704,18 @@ files = [
 [package.extras]
 dev = ["coverage", "pytest (>=7.4.4)"]

+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
+    {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
+]
+
 [[package]]
 name = "fastapi"
 version = "0.124.0"
@@ -1353,7 +1365,7 @@ files = [
 [package.dependencies]
 attrs = ">=22.2.0"
-jsonschema-specifications = ">=2023.03.6"
+jsonschema-specifications = ">=2023.3.6"
 referencing = ">=0.28.4"
 rpds-py = ">=0.7.1"
@@ -2155,6 +2167,21 @@ datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
 realtime = ["websockets (>=13,<16)"]
 voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]

+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
+    {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
+]
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "orjson"
 version = "3.11.4"
@@ -3329,12 +3356,14 @@ optional = false
 python-versions = ">=3.7"
 groups = ["main"]
 files = [
+    {file = "sqlalchemy-2.0.45-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c64772786d9eee72d4d3784c28f0a636af5b0a29f3fe26ff11f55efe90c0bd85"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7ae64ebf7657395824a19bca98ab10eb9a3ecb026bf09524014f1bb81cb598d4"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f02325709d1b1a1489f23a39b318e175a171497374149eae74d612634b234c0"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2c3684fca8a05f0ac1d9a21c1f4a266983a7ea9180efb80ffeb03861ecd01a0"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040f6f0545b3b7da6b9317fc3e922c9a98fc7243b2a1b39f78390fc0942f7826"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-win32.whl", hash = "sha256:830d434d609fe7bfa47c425c445a8b37929f140a7a44cdaf77f6d34df3a7296a"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-win_amd64.whl", hash = "sha256:0209d9753671b0da74da2cfbb9ecf9c02f72a759e4b018b3ab35f244c91842c7"},
+    {file = "sqlalchemy-2.0.45-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e90a344c644a4fa871eb01809c32096487928bd2038bf10f3e4515cb688cc56"},
     {file = "sqlalchemy-2.0.45-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b8c8b41b97fba5f62349aa285654230296829672fc9939cd7f35aab246d1c08b"},
     {file = "sqlalchemy-2.0.45-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12c694ed6468333a090d2f60950e4250b928f457e4962389553d6ba5fe9951ac"},
     {file = "sqlalchemy-2.0.45-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f7d27a1d977a1cfef38a0e2e1ca86f09c4212666ce34e6ae542f3ed0a33bc606"},
@@ -3363,12 +3392,14 @@ files = [
{file = "sqlalchemy-2.0.45-cp314-cp314-win_amd64.whl", hash = "sha256:4748601c8ea959e37e03d13dcda4a44837afcd1b21338e637f7c935b8da06177"}, {file = "sqlalchemy-2.0.45-cp314-cp314-win_amd64.whl", hash = "sha256:4748601c8ea959e37e03d13dcda4a44837afcd1b21338e637f7c935b8da06177"},
{file = "sqlalchemy-2.0.45-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd337d3526ec5298f67d6a30bbbe4ed7e5e68862f0bf6dd21d289f8d37b7d60b"}, {file = "sqlalchemy-2.0.45-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd337d3526ec5298f67d6a30bbbe4ed7e5e68862f0bf6dd21d289f8d37b7d60b"},
{file = "sqlalchemy-2.0.45-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9a62b446b7d86a3909abbcd1cd3cc550a832f99c2bc37c5b22e1925438b9367b"}, {file = "sqlalchemy-2.0.45-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9a62b446b7d86a3909abbcd1cd3cc550a832f99c2bc37c5b22e1925438b9367b"},
{file = "sqlalchemy-2.0.45-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5964f832431b7cdfaaa22a660b4c7eb1dfcd6ed41375f67fd3e3440fd95cb3cc"},
{file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee580ab50e748208754ae8980cec79ec205983d8cf8b3f7c39067f3d9f2c8e22"}, {file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee580ab50e748208754ae8980cec79ec205983d8cf8b3f7c39067f3d9f2c8e22"},
{file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13e27397a7810163440c6bfed6b3fe46f1bfb2486eb540315a819abd2c004128"}, {file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13e27397a7810163440c6bfed6b3fe46f1bfb2486eb540315a819abd2c004128"},
{file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ed3635353e55d28e7f4a95c8eda98a5cdc0a0b40b528433fbd41a9ae88f55b3d"}, {file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ed3635353e55d28e7f4a95c8eda98a5cdc0a0b40b528433fbd41a9ae88f55b3d"},
{file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:db6834900338fb13a9123307f0c2cbb1f890a8656fcd5e5448ae3ad5bbe8d312"}, {file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:db6834900338fb13a9123307f0c2cbb1f890a8656fcd5e5448ae3ad5bbe8d312"},
{file = "sqlalchemy-2.0.45-cp38-cp38-win32.whl", hash = "sha256:1d8b4a7a8c9b537509d56d5cd10ecdcfbb95912d72480c8861524efecc6a3fff"}, {file = "sqlalchemy-2.0.45-cp38-cp38-win32.whl", hash = "sha256:1d8b4a7a8c9b537509d56d5cd10ecdcfbb95912d72480c8861524efecc6a3fff"},
{file = "sqlalchemy-2.0.45-cp38-cp38-win_amd64.whl", hash = "sha256:ebd300afd2b62679203435f596b2601adafe546cb7282d5a0cd3ed99e423720f"}, {file = "sqlalchemy-2.0.45-cp38-cp38-win_amd64.whl", hash = "sha256:ebd300afd2b62679203435f596b2601adafe546cb7282d5a0cd3ed99e423720f"},
{file = "sqlalchemy-2.0.45-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d29b2b99d527dbc66dd87c3c3248a5dd789d974a507f4653c969999fc7c1191b"},
{file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:59a8b8bd9c6bedf81ad07c8bd5543eedca55fe9b8780b2b628d495ba55f8db1e"}, {file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:59a8b8bd9c6bedf81ad07c8bd5543eedca55fe9b8780b2b628d495ba55f8db1e"},
{file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd93c6f5d65f254ceabe97548c709e073d6da9883343adaa51bf1a913ce93f8e"}, {file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd93c6f5d65f254ceabe97548c709e073d6da9883343adaa51bf1a913ce93f8e"},
{file = "sqlalchemy-2.0.45-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6d0beadc2535157070c9c17ecf25ecec31e13c229a8f69196d7590bde8082bf1"}, {file = "sqlalchemy-2.0.45-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6d0beadc2535157070c9c17ecf25ecec31e13c229a8f69196d7590bde8082bf1"},
@@ -4133,6 +4164,23 @@ files = [
 [package.extras]
 dev = ["pytest", "setuptools"]

+[[package]]
+name = "xlrd"
+version = "2.0.2"
+description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
+groups = ["main"]
+files = [
+    {file = "xlrd-2.0.2-py2.py3-none-any.whl", hash = "sha256:ea762c3d29f4cca48d82df517b6d89fbce4db3107f9d78713e48cd321d5c9aa9"},
+    {file = "xlrd-2.0.2.tar.gz", hash = "sha256:08b5e25de58f21ce71dc7db3b3b8106c1fa776f3024c54e45b45b374e89234c9"},
+]
+
+[package.extras]
+build = ["twine", "wheel"]
+docs = ["sphinx"]
+test = ["pytest", "pytest-cov"]
+
 [[package]]
 name = "xxhash"
 version = "3.6.0"
@@ -4538,9 +4586,9 @@ files = [
 ]
 [package.extras]
-cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and python_version < \"3.14\"", "cffi (>=2.0.0b) ; platform_python_implementation != \"PyPy\" and python_version >= \"3.14\""]
+cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and python_version < \"3.14\"", "cffi (>=2.0.0b0) ; platform_python_implementation != \"PyPy\" and python_version >= \"3.14\""]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.12,<4.0.0"
-content-hash = "996ab9a6b957607afb6d493b0a5dd1fec8f65f600f41bb5e99ee1e16fcb1f7b8"
+content-hash = "906ee4a17768bc92cf160032c185fd9a9d530ca56082081c1d85b2311b409df3"

pyproject.toml

@@ -25,6 +25,8 @@ dependencies = [
"jsonschema (>=4.25.1,<5.0.0)", "jsonschema (>=4.25.1,<5.0.0)",
"greenlet (>=3.3.0,<4.0.0)", "greenlet (>=3.3.0,<4.0.0)",
"docx2txt (>=0.9,<0.10)", "docx2txt (>=0.9,<0.10)",
"openpyxl (>=3.1.5,<4.0.0)",
"xlrd (>=2.0.1,<3.0.0)",
"jq (>=1.10.0,<2.0.0)", "jq (>=1.10.0,<2.0.0)",
"openai (>=2.9.0,<3.0.0)", "openai (>=2.9.0,<3.0.0)",
"langchain-openai (>=1.1.1,<2.0.0)", "langchain-openai (>=1.1.1,<2.0.0)",

Dockerfile (vLLM-Ascend base image)

@@ -1,6 +1,6 @@
 # Base image: choose either vLLM or LMDeploy as needed; requires an ARM(AArch64) CPU + Ascend NPU.
 # Base image containing the vLLM inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
-FROM quay.io/ascend/vllm-ascend:v0.11.0rc2
+FROM quay.nju.edu.cn/ascend/vllm-ascend:v0.11.0rc2
 # Base image containing the LMDeploy inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
 # FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:mineru-a2