From b5d7c662403f1ca89107968077960c236d8ba36e Mon Sep 17 00:00:00 2001
From: Jerry Yan <792602257@qq.com>
Date: Sat, 31 Jan 2026 11:30:55 +0800
Subject: [PATCH] feat(data-management): extend source document exclusion to
 support Excel file types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Extend the source document type check in the backend service to add support for XLS and XLSX files
- Rework the filtering logic in DatasetFileApplicationService to handle all source document types uniformly
- Add the isSourceDocument and isDerivedFile helper methods for file type checks
- Update the explanatory comments in the frontend DatasetFileTransfer component
- Add the openpyxl and xlrd libraries to the Python runtime dependencies for Excel file handling
- Revise the source document type set definition in the annotation project interface
- Update the derived-file exclusion logic in the file operations hook
---
 .../DatasetFileApplicationService.java        | 187 +++++++++++-------
 .../rest/DatasetFileController.java           |  42 ++--
 .../business/DatasetFileTransfer.tsx          |   9 +-
 .../components/CreateAnnotationTaskDialog.tsx |   2 +-
 .../Detail/useFilesOperation.ts               |  12 +-
 .../module/annotation/interface/project.py    |  19 +-
 runtime/datamate-python/poetry.lock           |  56 +++++-
 runtime/datamate-python/pyproject.toml        |   2 +
 8 files changed, 210 insertions(+), 119 deletions(-)

diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
index 72be4f7..bed9fa1 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
@@ -22,15 +22,16 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn
 import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
-import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
-import com.datamate.datamanagement.interfaces.dto.AddFilesRequest;
-import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
-import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
-import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
-import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import jakarta.servlet.http.HttpServletResponse;
+import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
+import com.datamate.datamanagement.interfaces.dto.AddFilesRequest;
+import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
+import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
+import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
+import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import jakarta.servlet.http.HttpServletResponse;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
@@ -45,16 +46,16 @@ import org.springframework.transaction.annotation.Transactional;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.net.MalformedURLException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.attribute.BasicFileAttributes;
-import java.time.LocalDateTime;
-import java.time.ZoneId;
-import java.time.format.DateTimeFormatter;
-import java.util.*;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.time.LocalDateTime;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
 import java.util.concurrent.CompletableFuture;
 import java.util.function.Function;
 import java.util.stream.Collectors;
@@ -79,11 +80,12 @@ public class DatasetFileApplicationService {
             XLS_FILE_TYPE, XLSX_FILE_TYPE
     );
-
-    private final DatasetFileRepository datasetFileRepository;
-    private final DatasetRepository datasetRepository;
-    private final FileService fileService;
-    private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
+    private static final String DERIVED_METADATA_KEY = "derived_from_file_id";
+
+    private final DatasetFileRepository datasetFileRepository;
+    private final DatasetRepository datasetRepository;
+    private final FileService fileService;
+    private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
 
     @Value("${datamate.data-management.base-path:/dataset}")
     private String datasetBasePath;
@@ -119,57 +121,61 @@ public class DatasetFileApplicationService {
      * @param status status filter
      * @param name fuzzy match on the file name
      * @param hasAnnotation whether the file has annotations
-     * @param excludeSourceDocuments whether to exclude source documents that have been converted to TXT (PDF/DOC/DOCX)
+     * @param excludeSourceDocuments whether to exclude source documents (PDF/DOC/DOCX/XLS/XLSX)
      * @param pagingQuery paging parameters
      * @return paged list of files
      */
     @Transactional(readOnly = true)
-    public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
-                                                      Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
-        IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
-        IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
-
-        if (excludeSourceDocuments) {
-            // Look up the IDs of all document files that are sources of derived TXT files
-            List<String> sourceFileIds = datasetFileRepository.findSourceFileIdsWithDerivedFiles(datasetId);
-            if (!sourceFileIds.isEmpty()) {
-                // Filter out the source files
-                List<DatasetFile> filteredRecords = files.getRecords().stream()
-                        .filter(file -> !sourceFileIds.contains(file.getId()))
-                        .collect(Collectors.toList());
-
-                // Rebuild the paged result
-                Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
-                filteredPage.setRecords(filteredRecords);
-                return PagedResponse.of(filteredPage);
-            }
-        }
-
-        return PagedResponse.of(files);
-    }
+    public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
+                                                      Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
+        IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
+        IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
+
+        if (excludeSourceDocuments) {
+            // Filter out source document files (PDF/DOC/DOCX/XLS/XLSX) so annotation scenarios only see derived files
+            List<DatasetFile> filteredRecords = files.getRecords().stream()
+                    .filter(file -> !isSourceDocument(file))
+                    .collect(Collectors.toList());
+
+            // Rebuild the paged result
+            Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
+            filteredPage.setRecords(filteredRecords);
+            return PagedResponse.of(filteredPage);
+        }
+
+        return PagedResponse.of(files);
+    }
 
     /**
      * Get the dataset file list
      */
     @Transactional(readOnly = true)
-    public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, PagingQuery pagingQuery) {
-        Dataset dataset = datasetRepository.getById(datasetId);
-        int page = Math.max(pagingQuery.getPage(), 1);
-        int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
-        if (dataset == null) {
-            return PagedResponse.of(new Page<>(page, size));
-        }
-        String datasetPath = dataset.getPath();
-        Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
-        Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
-                .stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
-        try (Stream<Path> pathStream = Files.list(queryPath)) {
-            List<Path> allFiles = pathStream
-                    .filter(path -> path.toString().startsWith(datasetPath))
-                    .sorted(Comparator
-                            .comparing((Path path) -> !Files.isDirectory(path))
-                            .thenComparing(path -> path.getFileName().toString()))
-                    .collect(Collectors.toList());
+    public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) {
+        Dataset dataset = datasetRepository.getById(datasetId);
+        int page = Math.max(pagingQuery.getPage(), 1);
+        int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
+        if (dataset == null) {
+            return PagedResponse.of(new Page<>(page, size));
+        }
+        String datasetPath = dataset.getPath();
+        Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
+        Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
+                .stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
+        Set<String> derivedFilePaths = excludeDerivedFiles
+                ? datasetFilesMap.values().stream()
+                        .filter(this::isDerivedFile)
+                        .map(DatasetFile::getFilePath)
+                        .filter(Objects::nonNull)
+                        .collect(Collectors.toSet())
+                : Collections.emptySet();
+        try (Stream<Path> pathStream = Files.list(queryPath)) {
+            List<Path> allFiles = pathStream
+                    .filter(path -> path.toString().startsWith(datasetPath))
+                    .filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString()))
+                    .sorted(Comparator
+                            .comparing((Path path) -> !Files.isDirectory(path))
+                            .thenComparing(path -> path.getFileName().toString()))
+                    .collect(Collectors.toList());
 
             // Compute pagination
             int total = allFiles.size();
@@ -187,15 +193,15 @@ public class DatasetFileApplicationService {
             List<DatasetFile> datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList();
 
             return new PagedResponse<>(page, size, total, totalPages, datasetFiles);
-        } catch (IOException e) {
-            log.error("list dataset path error", e);
-            return PagedResponse.of(new Page<>(page, size));
-        }
-    }
+        } catch (IOException e) {
+            log.error("list dataset path error", e);
+            return PagedResponse.of(new Page<>(page, size));
+        }
+    }
 
-    private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
-        DatasetFile datasetFile = new DatasetFile();
-        LocalDateTime localDateTime = LocalDateTime.now();
+    private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
+        DatasetFile datasetFile = new DatasetFile();
+        LocalDateTime localDateTime = LocalDateTime.now();
         try {
             localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime();
         } catch (IOException e) {
@@ -246,8 +252,37 @@ public class DatasetFileApplicationService {
                 datasetFile = exist;
             }
         }
-        return datasetFile;
-    }
+        return datasetFile;
+    }
+
+    private boolean isSourceDocument(DatasetFile datasetFile) {
+        if (datasetFile == null) {
+            return false;
+        }
+        String fileType = datasetFile.getFileType();
+        if (fileType == null || fileType.isBlank()) {
+            return false;
+        }
+        return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT));
+    }
+
+    private boolean isDerivedFile(DatasetFile datasetFile) {
+        if (datasetFile == null) {
+            return false;
+        }
+        String metadata = datasetFile.getMetadata();
+        if (metadata == null || metadata.isBlank()) {
+            return false;
+        }
+        try {
+            ObjectMapper mapper = new ObjectMapper();
+            Map<String, Object> metadataMap = mapper.readValue(metadata, new TypeReference<Map<String, Object>>() {});
+            return metadataMap.get(DERIVED_METADATA_KEY) != null;
+        } catch (Exception e) {
+            log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e);
+            return false;
+        }
+    }
 
     /**
      * Get file details
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
index 12a5e63..0d5a337 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java
@@ -44,24 +44,30 @@ public class DatasetFileController {
     }
 
     @GetMapping
-    public Response<PagedResponse<DatasetFile>> getDatasetFiles(
-            @PathVariable("datasetId") String datasetId,
-            @RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory,
-            @RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size, - @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix, - @RequestParam(value = "status", required = false) String status, - @RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation, - @RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments) { - PagingQuery pagingQuery = new PagingQuery(page, size); - PagedResponse filesPage; - if (isWithDirectory) { - filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery); - } else { - filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation, - Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery); - } - return Response.ok(filesPage); + public Response> getDatasetFiles( + @PathVariable("datasetId") String datasetId, + @RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory, + @RequestParam(value = "page", required = false, defaultValue = "0") Integer page, + @RequestParam(value = "size", required = false, defaultValue = "20") Integer size, + @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix, + @RequestParam(value = "status", required = false) String status, + @RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation, + @RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments, + @RequestParam(value = "excludeDerivedFiles", required = false, defaultValue = "false") Boolean excludeDerivedFiles) { + PagingQuery pagingQuery = new PagingQuery(page, size); + PagedResponse filesPage; + if (isWithDirectory) { + filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory( + datasetId, + prefix, + Boolean.TRUE.equals(excludeDerivedFiles), + pagingQuery + ); + } else { + filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation, + Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery); + } + return Response.ok(filesPage); } @GetMapping("/{fileId}") diff --git a/frontend/src/components/business/DatasetFileTransfer.tsx b/frontend/src/components/business/DatasetFileTransfer.tsx index d5eed58..ba9249b 100644 --- a/frontend/src/components/business/DatasetFileTransfer.tsx +++ b/frontend/src/components/business/DatasetFileTransfer.tsx @@ -22,11 +22,10 @@ interface DatasetFileTransferProps onDatasetSelect?: (dataset: Dataset | null) => void; datasetTypeFilter?: DatasetType; hasAnnotationFilter?: boolean; - /** - * 是否排除已被转换为TXT的源文档文件(PDF/DOC/DOCX) - * 默认为 true,当 datasetTypeFilter 为 TEXT 时自动启用 - */ - excludeSourceDocuments?: boolean; + /** + * 是否排除源文档文件(PDF/DOC/DOCX/XLS/XLSX),文本标注默认启用 + */ + excludeSourceDocuments?: boolean; } const fileCols = [ diff --git a/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx b/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx index daf28a1..0a1146d 100644 --- a/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx +++ b/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx @@ -282,7 +282,7 @@ export default function CreateAnnotationTask({ } setDatasetPreviewLoading(true); try { - // 对于文本数据集,排除已被转换为TXT的源文档文件(PDF/DOC/DOCX) + // 对于文本数据集,排除源文档文件(PDF/DOC/DOCX/XLS/XLSX) const params: { page: number; size: number; 
       if (isTextDataset) {
         params.excludeSourceDocuments = true;
diff --git a/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts b/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts
index 66702f8..e4c880a 100644
--- a/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts
+++ b/frontend/src/pages/DataManagement/Detail/useFilesOperation.ts
@@ -1,7 +1,8 @@
-import type {
-  Dataset,
-  DatasetFile,
-} from "@/pages/DataManagement/dataset.model";
+import type {
+  Dataset,
+  DatasetFile,
+} from "@/pages/DataManagement/dataset.model";
+import { DatasetType } from "@/pages/DataManagement/dataset.model";
 import { App } from "antd";
 import { useState } from "react";
 import {
@@ -51,12 +52,14 @@ export function useFilesOperation(dataset: Dataset) {
   ) => {
     // If prefix is passed explicitly (including an empty string), use it; otherwise fall back to the current pagination.prefix
     const targetPrefix = prefix !== undefined ? prefix : (pagination.prefix || '');
+    const shouldExcludeDerivedFiles = dataset?.datasetType === DatasetType.TEXT;
 
     const params: DatasetFilesQueryParams = {
       page: current !== undefined ? current : pagination.current,
       size: pageSize !== undefined ? pageSize : pagination.pageSize,
       isWithDirectory: true,
       prefix: targetPrefix,
+      ...(shouldExcludeDerivedFiles ? { excludeDerivedFiles: true } : {}),
     };
 
     const { data } = await queryDatasetFilesUsingGet(id!, params);
@@ -245,4 +248,5 @@ interface DatasetFilesQueryParams {
   size: number;
   isWithDirectory: boolean;
   prefix: string;
+  excludeDerivedFiles?: boolean;
 }
diff --git a/runtime/datamate-python/app/module/annotation/interface/project.py b/runtime/datamate-python/app/module/annotation/interface/project.py
index 1ce14ef..12d77c1 100644
--- a/runtime/datamate-python/app/module/annotation/interface/project.py
+++ b/runtime/datamate-python/app/module/annotation/interface/project.py
@@ -27,6 +27,7 @@ router = APIRouter(
 )
 logger = get_logger(__name__)
 TEXT_DATASET_TYPE = "TEXT"
+SOURCE_DOCUMENT_FILE_TYPES = {"pdf", "doc", "docx", "xls", "xlsx"}
 
 @router.get("/{mapping_id}/login")
 async def login_label_studio(
@@ -123,18 +124,14 @@ async def create_mapping(
     file_records = file_result.scalars().all()
     snapshot_file_ids: list[str] = []
     if dataset_type == TEXT_DATASET_TYPE:
-        derived_source_ids = set()
+        snapshot_file_ids = []
         for file_record in file_records:
-            metadata = getattr(file_record, "dataset_filemetadata", None)
-            if isinstance(metadata, dict):
-                source_id = metadata.get("derived_from_file_id")
-                if source_id:
-                    derived_source_ids.add(str(source_id))
-        snapshot_file_ids = [
-            str(file_record.id)
-            for file_record in file_records
-            if file_record.id and str(file_record.id) not in derived_source_ids
-        ]
+            if not file_record.id:
+                continue
+            file_type = str(getattr(file_record, "file_type", "") or "").lower()
+            if file_type in SOURCE_DOCUMENT_FILE_TYPES:
+                continue
+            snapshot_file_ids.append(str(file_record.id))
     else:
         snapshot_file_ids = [
             str(file_record.id)
diff --git a/runtime/datamate-python/poetry.lock b/runtime/datamate-python/poetry.lock
index 1b9a9a6..b348a52 100644
--- a/runtime/datamate-python/poetry.lock
+++ b/runtime/datamate-python/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.3.1 and should not be changed by hand.
 
 [[package]]
 name = "aiofiles"
@@ -704,6 +704,18 @@ files = [
 [package.extras]
 dev = ["coverage", "pytest (>=7.4.4)"]
 
+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
+    {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
+]
+
 [[package]]
 name = "fastapi"
 version = "0.124.0"
@@ -1353,7 +1365,7 @@ files = [
 
 [package.dependencies]
 attrs = ">=22.2.0"
-jsonschema-specifications = ">=2023.03.6"
+jsonschema-specifications = ">=2023.3.6"
 referencing = ">=0.28.4"
 rpds-py = ">=0.7.1"
 
@@ -2155,6 +2167,21 @@ datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
 realtime = ["websockets (>=13,<16)"]
 voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
 
+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
+    {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
+]
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "orjson"
 version = "3.11.4"
@@ -3329,12 +3356,14 @@ optional = false
 python-versions = ">=3.7"
 groups = ["main"]
 files = [
+    {file = "sqlalchemy-2.0.45-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c64772786d9eee72d4d3784c28f0a636af5b0a29f3fe26ff11f55efe90c0bd85"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7ae64ebf7657395824a19bca98ab10eb9a3ecb026bf09524014f1bb81cb598d4"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f02325709d1b1a1489f23a39b318e175a171497374149eae74d612634b234c0"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2c3684fca8a05f0ac1d9a21c1f4a266983a7ea9180efb80ffeb03861ecd01a0"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040f6f0545b3b7da6b9317fc3e922c9a98fc7243b2a1b39f78390fc0942f7826"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-win32.whl", hash = "sha256:830d434d609fe7bfa47c425c445a8b37929f140a7a44cdaf77f6d34df3a7296a"},
     {file = "sqlalchemy-2.0.45-cp310-cp310-win_amd64.whl", hash = "sha256:0209d9753671b0da74da2cfbb9ecf9c02f72a759e4b018b3ab35f244c91842c7"},
+    {file = "sqlalchemy-2.0.45-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e90a344c644a4fa871eb01809c32096487928bd2038bf10f3e4515cb688cc56"},
     {file = "sqlalchemy-2.0.45-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b8c8b41b97fba5f62349aa285654230296829672fc9939cd7f35aab246d1c08b"},
     {file = "sqlalchemy-2.0.45-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12c694ed6468333a090d2f60950e4250b928f457e4962389553d6ba5fe9951ac"},
     {file = "sqlalchemy-2.0.45-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f7d27a1d977a1cfef38a0e2e1ca86f09c4212666ce34e6ae542f3ed0a33bc606"},
@@ -3363,12 +3392,14 @@ files = [
     {file = "sqlalchemy-2.0.45-cp314-cp314-win_amd64.whl", hash = "sha256:4748601c8ea959e37e03d13dcda4a44837afcd1b21338e637f7c935b8da06177"},
hash = "sha256:4748601c8ea959e37e03d13dcda4a44837afcd1b21338e637f7c935b8da06177"}, {file = "sqlalchemy-2.0.45-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd337d3526ec5298f67d6a30bbbe4ed7e5e68862f0bf6dd21d289f8d37b7d60b"}, {file = "sqlalchemy-2.0.45-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9a62b446b7d86a3909abbcd1cd3cc550a832f99c2bc37c5b22e1925438b9367b"}, + {file = "sqlalchemy-2.0.45-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5964f832431b7cdfaaa22a660b4c7eb1dfcd6ed41375f67fd3e3440fd95cb3cc"}, {file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee580ab50e748208754ae8980cec79ec205983d8cf8b3f7c39067f3d9f2c8e22"}, {file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13e27397a7810163440c6bfed6b3fe46f1bfb2486eb540315a819abd2c004128"}, {file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ed3635353e55d28e7f4a95c8eda98a5cdc0a0b40b528433fbd41a9ae88f55b3d"}, {file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:db6834900338fb13a9123307f0c2cbb1f890a8656fcd5e5448ae3ad5bbe8d312"}, {file = "sqlalchemy-2.0.45-cp38-cp38-win32.whl", hash = "sha256:1d8b4a7a8c9b537509d56d5cd10ecdcfbb95912d72480c8861524efecc6a3fff"}, {file = "sqlalchemy-2.0.45-cp38-cp38-win_amd64.whl", hash = "sha256:ebd300afd2b62679203435f596b2601adafe546cb7282d5a0cd3ed99e423720f"}, + {file = "sqlalchemy-2.0.45-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d29b2b99d527dbc66dd87c3c3248a5dd789d974a507f4653c969999fc7c1191b"}, {file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:59a8b8bd9c6bedf81ad07c8bd5543eedca55fe9b8780b2b628d495ba55f8db1e"}, {file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd93c6f5d65f254ceabe97548c709e073d6da9883343adaa51bf1a913ce93f8e"}, {file = "sqlalchemy-2.0.45-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6d0beadc2535157070c9c17ecf25ecec31e13c229a8f69196d7590bde8082bf1"}, @@ -4133,6 +4164,23 @@ files = [ [package.extras] dev = ["pytest", "setuptools"] +[[package]] +name = "xlrd" +version = "2.0.2" +description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main"] +files = [ + {file = "xlrd-2.0.2-py2.py3-none-any.whl", hash = "sha256:ea762c3d29f4cca48d82df517b6d89fbce4db3107f9d78713e48cd321d5c9aa9"}, + {file = "xlrd-2.0.2.tar.gz", hash = "sha256:08b5e25de58f21ce71dc7db3b3b8106c1fa776f3024c54e45b45b374e89234c9"}, +] + +[package.extras] +build = ["twine", "wheel"] +docs = ["sphinx"] +test = ["pytest", "pytest-cov"] + [[package]] name = "xxhash" version = "3.6.0" @@ -4538,9 +4586,9 @@ files = [ ] [package.extras] -cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and python_version < \"3.14\"", "cffi (>=2.0.0b) ; platform_python_implementation != \"PyPy\" and python_version >= \"3.14\""] +cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and python_version < \"3.14\"", "cffi (>=2.0.0b0) ; platform_python_implementation != \"PyPy\" and python_version >= \"3.14\""] [metadata] lock-version = "2.1" python-versions = ">=3.12,<4.0.0" -content-hash = "996ab9a6b957607afb6d493b0a5dd1fec8f65f600f41bb5e99ee1e16fcb1f7b8" 
+content-hash = "906ee4a17768bc92cf160032c185fd9a9d530ca56082081c1d85b2311b409df3" diff --git a/runtime/datamate-python/pyproject.toml b/runtime/datamate-python/pyproject.toml index c6a514f..c7378b8 100644 --- a/runtime/datamate-python/pyproject.toml +++ b/runtime/datamate-python/pyproject.toml @@ -25,6 +25,8 @@ dependencies = [ "jsonschema (>=4.25.1,<5.0.0)", "greenlet (>=3.3.0,<4.0.0)", "docx2txt (>=0.9,<0.10)", + "openpyxl (>=3.1.5,<4.0.0)", + "xlrd (>=2.0.1,<3.0.0)", "jq (>=1.10.0,<2.0.0)", "openai (>=2.9.0,<3.0.0)", "langchain-openai (>=1.1.1,<2.0.0)",