You've already forked DataMate
feat(data-management): 扩展源文档排除功能支持Excel文件类型
- 在后端服务中扩展源文档类型检查,新增对XLS和XLSX文件的支持 - 修改DatasetFileApplicationService中的过滤逻辑,统一处理所有源文档类型 - 新增isSourceDocument和isDerivedFile辅助方法进行文件类型判断 - 更新前端DatasetFileTransfer组件中的注释说明 - 在Python运行时依赖中添加openpyxl和xlrd库以支持Excel文件处理 - 修改标注项目接口中源文档类型的集合定义 - 更新文件操作钩子中的派生文件排除逻辑
This commit is contained in:
@@ -27,6 +27,7 @@ router = APIRouter(
|
||||
)
|
||||
logger = get_logger(__name__)
|
||||
TEXT_DATASET_TYPE = "TEXT"
|
||||
SOURCE_DOCUMENT_FILE_TYPES = {"pdf", "doc", "docx", "xls", "xlsx"}
|
||||
|
||||
@router.get("/{mapping_id}/login")
|
||||
async def login_label_studio(
|
||||
@@ -123,18 +124,14 @@ async def create_mapping(
|
||||
file_records = file_result.scalars().all()
|
||||
snapshot_file_ids: list[str] = []
|
||||
if dataset_type == TEXT_DATASET_TYPE:
|
||||
derived_source_ids = set()
|
||||
snapshot_file_ids = []
|
||||
for file_record in file_records:
|
||||
metadata = getattr(file_record, "dataset_filemetadata", None)
|
||||
if isinstance(metadata, dict):
|
||||
source_id = metadata.get("derived_from_file_id")
|
||||
if source_id:
|
||||
derived_source_ids.add(str(source_id))
|
||||
snapshot_file_ids = [
|
||||
str(file_record.id)
|
||||
for file_record in file_records
|
||||
if file_record.id and str(file_record.id) not in derived_source_ids
|
||||
]
|
||||
if not file_record.id:
|
||||
continue
|
||||
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
||||
if file_type in SOURCE_DOCUMENT_FILE_TYPES:
|
||||
continue
|
||||
snapshot_file_ids.append(str(file_record.id))
|
||||
else:
|
||||
snapshot_file_ids = [
|
||||
str(file_record.id)
|
||||
|
||||
56
runtime/datamate-python/poetry.lock
generated
56
runtime/datamate-python/poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 2.3.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiofiles"
|
||||
@@ -704,6 +704,18 @@ files = [
|
||||
[package.extras]
|
||||
dev = ["coverage", "pytest (>=7.4.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "2.0.0"
|
||||
description = "An implementation of lxml.xmlfile for the standard library"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
|
||||
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastapi"
|
||||
version = "0.124.0"
|
||||
@@ -1353,7 +1365,7 @@ files = [
|
||||
|
||||
[package.dependencies]
|
||||
attrs = ">=22.2.0"
|
||||
jsonschema-specifications = ">=2023.03.6"
|
||||
jsonschema-specifications = ">=2023.3.6"
|
||||
referencing = ">=0.28.4"
|
||||
rpds-py = ">=0.7.1"
|
||||
|
||||
@@ -2155,6 +2167,21 @@ datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
|
||||
realtime = ["websockets (>=13,<16)"]
|
||||
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.5"
|
||||
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
|
||||
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
et-xmlfile = "*"
|
||||
|
||||
[[package]]
|
||||
name = "orjson"
|
||||
version = "3.11.4"
|
||||
@@ -3329,12 +3356,14 @@ optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "sqlalchemy-2.0.45-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c64772786d9eee72d4d3784c28f0a636af5b0a29f3fe26ff11f55efe90c0bd85"},
|
||||
{file = "sqlalchemy-2.0.45-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7ae64ebf7657395824a19bca98ab10eb9a3ecb026bf09524014f1bb81cb598d4"},
|
||||
{file = "sqlalchemy-2.0.45-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f02325709d1b1a1489f23a39b318e175a171497374149eae74d612634b234c0"},
|
||||
{file = "sqlalchemy-2.0.45-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2c3684fca8a05f0ac1d9a21c1f4a266983a7ea9180efb80ffeb03861ecd01a0"},
|
||||
{file = "sqlalchemy-2.0.45-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040f6f0545b3b7da6b9317fc3e922c9a98fc7243b2a1b39f78390fc0942f7826"},
|
||||
{file = "sqlalchemy-2.0.45-cp310-cp310-win32.whl", hash = "sha256:830d434d609fe7bfa47c425c445a8b37929f140a7a44cdaf77f6d34df3a7296a"},
|
||||
{file = "sqlalchemy-2.0.45-cp310-cp310-win_amd64.whl", hash = "sha256:0209d9753671b0da74da2cfbb9ecf9c02f72a759e4b018b3ab35f244c91842c7"},
|
||||
{file = "sqlalchemy-2.0.45-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e90a344c644a4fa871eb01809c32096487928bd2038bf10f3e4515cb688cc56"},
|
||||
{file = "sqlalchemy-2.0.45-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b8c8b41b97fba5f62349aa285654230296829672fc9939cd7f35aab246d1c08b"},
|
||||
{file = "sqlalchemy-2.0.45-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12c694ed6468333a090d2f60950e4250b928f457e4962389553d6ba5fe9951ac"},
|
||||
{file = "sqlalchemy-2.0.45-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f7d27a1d977a1cfef38a0e2e1ca86f09c4212666ce34e6ae542f3ed0a33bc606"},
|
||||
@@ -3363,12 +3392,14 @@ files = [
|
||||
{file = "sqlalchemy-2.0.45-cp314-cp314-win_amd64.whl", hash = "sha256:4748601c8ea959e37e03d13dcda4a44837afcd1b21338e637f7c935b8da06177"},
|
||||
{file = "sqlalchemy-2.0.45-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd337d3526ec5298f67d6a30bbbe4ed7e5e68862f0bf6dd21d289f8d37b7d60b"},
|
||||
{file = "sqlalchemy-2.0.45-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9a62b446b7d86a3909abbcd1cd3cc550a832f99c2bc37c5b22e1925438b9367b"},
|
||||
{file = "sqlalchemy-2.0.45-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5964f832431b7cdfaaa22a660b4c7eb1dfcd6ed41375f67fd3e3440fd95cb3cc"},
|
||||
{file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee580ab50e748208754ae8980cec79ec205983d8cf8b3f7c39067f3d9f2c8e22"},
|
||||
{file = "sqlalchemy-2.0.45-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13e27397a7810163440c6bfed6b3fe46f1bfb2486eb540315a819abd2c004128"},
|
||||
{file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ed3635353e55d28e7f4a95c8eda98a5cdc0a0b40b528433fbd41a9ae88f55b3d"},
|
||||
{file = "sqlalchemy-2.0.45-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:db6834900338fb13a9123307f0c2cbb1f890a8656fcd5e5448ae3ad5bbe8d312"},
|
||||
{file = "sqlalchemy-2.0.45-cp38-cp38-win32.whl", hash = "sha256:1d8b4a7a8c9b537509d56d5cd10ecdcfbb95912d72480c8861524efecc6a3fff"},
|
||||
{file = "sqlalchemy-2.0.45-cp38-cp38-win_amd64.whl", hash = "sha256:ebd300afd2b62679203435f596b2601adafe546cb7282d5a0cd3ed99e423720f"},
|
||||
{file = "sqlalchemy-2.0.45-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d29b2b99d527dbc66dd87c3c3248a5dd789d974a507f4653c969999fc7c1191b"},
|
||||
{file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:59a8b8bd9c6bedf81ad07c8bd5543eedca55fe9b8780b2b628d495ba55f8db1e"},
|
||||
{file = "sqlalchemy-2.0.45-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd93c6f5d65f254ceabe97548c709e073d6da9883343adaa51bf1a913ce93f8e"},
|
||||
{file = "sqlalchemy-2.0.45-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6d0beadc2535157070c9c17ecf25ecec31e13c229a8f69196d7590bde8082bf1"},
|
||||
@@ -4133,6 +4164,23 @@ files = [
|
||||
[package.extras]
|
||||
dev = ["pytest", "setuptools"]
|
||||
|
||||
[[package]]
|
||||
name = "xlrd"
|
||||
version = "2.0.2"
|
||||
description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "xlrd-2.0.2-py2.py3-none-any.whl", hash = "sha256:ea762c3d29f4cca48d82df517b6d89fbce4db3107f9d78713e48cd321d5c9aa9"},
|
||||
{file = "xlrd-2.0.2.tar.gz", hash = "sha256:08b5e25de58f21ce71dc7db3b3b8106c1fa776f3024c54e45b45b374e89234c9"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
build = ["twine", "wheel"]
|
||||
docs = ["sphinx"]
|
||||
test = ["pytest", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "xxhash"
|
||||
version = "3.6.0"
|
||||
@@ -4538,9 +4586,9 @@ files = [
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and python_version < \"3.14\"", "cffi (>=2.0.0b) ; platform_python_implementation != \"PyPy\" and python_version >= \"3.14\""]
|
||||
cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and python_version < \"3.14\"", "cffi (>=2.0.0b0) ; platform_python_implementation != \"PyPy\" and python_version >= \"3.14\""]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.12,<4.0.0"
|
||||
content-hash = "996ab9a6b957607afb6d493b0a5dd1fec8f65f600f41bb5e99ee1e16fcb1f7b8"
|
||||
content-hash = "906ee4a17768bc92cf160032c185fd9a9d530ca56082081c1d85b2311b409df3"
|
||||
|
||||
@@ -25,6 +25,8 @@ dependencies = [
|
||||
"jsonschema (>=4.25.1,<5.0.0)",
|
||||
"greenlet (>=3.3.0,<4.0.0)",
|
||||
"docx2txt (>=0.9,<0.10)",
|
||||
"openpyxl (>=3.1.5,<4.0.0)",
|
||||
"xlrd (>=2.0.1,<3.0.0)",
|
||||
"jq (>=1.10.0,<2.0.0)",
|
||||
"openai (>=2.9.0,<3.0.0)",
|
||||
"langchain-openai (>=1.1.1,<2.0.0)",
|
||||
|
||||
Reference in New Issue
Block a user