From 06b05a65a9a1b52c6f75329bb368755ec8ba1b52 Mon Sep 17 00:00:00 2001 From: Startalker <103120663+Startalker@users.noreply.github.com> Date: Thu, 30 Oct 2025 20:21:12 +0800 Subject: [PATCH] feature: add unstructured xlsx/xls/csv/pptx/ppt (#41) * feature: add UnstructuredFormatter * feature: add UnstructuredFormatter in db * feature: add unstructured[docx]==0.18.15 * feature: support doc * feature: add mineru * feature: add external pdf extract operator by using mineru * feature: mineru docker install bugfix * feature: add unstructured xlsx/xls/csv/pptx/ppt --------- Co-authored-by: Startalker <438747480@qq.com> --- runtime/ops/formatter/file_exporter/process.py | 1 + runtime/ops/requirements.txt | 2 +- scripts/images/runtime/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/runtime/ops/formatter/file_exporter/process.py b/runtime/ops/formatter/file_exporter/process.py index 5b5b48b..c4934c4 100644 --- a/runtime/ops/formatter/file_exporter/process.py +++ b/runtime/ops/formatter/file_exporter/process.py @@ -23,6 +23,7 @@ class FileExporter(Mapper): super(FileExporter, self).__init__(*args, **kwargs) self.last_ops = True self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown', + 'xlsx', 'xls', 'csv', 'pptx', 'ppt', 'xml', 'json', 'doc', 'docx', 'pdf']) self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp']) self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff']) diff --git a/runtime/ops/requirements.txt b/runtime/ops/requirements.txt index 8c842b0..b214fb7 100644 --- a/runtime/ops/requirements.txt +++ b/runtime/ops/requirements.txt @@ -19,4 +19,4 @@ xmltodict==1.0.2 zhconv==1.4.3 sqlalchemy==2.0.40 pymysql==1.1.1 -unstructured[docx]==0.18.15 \ No newline at end of file +unstructured[docx,csv,xlsx,pptx]==0.18.15 \ No newline at end of file diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile index 92900df..86d08da 100644 --- a/scripts/images/runtime/Dockerfile +++ b/scripts/images/runtime/Dockerfile @@ -8,7 +8,7 @@ COPY scripts/images/runtime/start.sh /opt/runtime/start.sh ENV PYTHONPATH=/opt/runtime/datamate/ RUN apt update \ - && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice\ + && apt install -y libgl1 libglib2.0-0 vim libmagic1t64 libreoffice\ && apt clean \ && rm -rf /var/lib/apt/lists/*