feature: add unstructured xlsx/xls/csv/pptx/ppt (#41)

* feature: add UnstructuredFormatter

* feature: add UnstructuredFormatter in db

* feature: add unstructured[docx]==0.18.15

* feature: support doc

* feature: add mineru

* feature: add external PDF extraction operator using MinerU

* feature: fix MinerU Docker install bug

* feature: add unstructured xlsx/xls/csv/pptx/ppt

---------

Co-authored-by: Startalker <438747480@qq.com>
This commit is contained in:
Startalker
2025-10-30 20:21:12 +08:00
committed by GitHub
parent b9b97c1ac2
commit 06b05a65a9
3 changed files with 3 additions and 2 deletions

View File

@@ -23,6 +23,7 @@ class FileExporter(Mapper):
super(FileExporter, self).__init__(*args, **kwargs) super(FileExporter, self).__init__(*args, **kwargs)
self.last_ops = True self.last_ops = True
self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown', self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
'xlsx', 'xls', 'csv', 'pptx', 'ppt',
'xml', 'json', 'doc', 'docx', 'pdf']) 'xml', 'json', 'doc', 'docx', 'pdf'])
self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp']) self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff']) self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])

View File

@@ -19,4 +19,4 @@ xmltodict==1.0.2
zhconv==1.4.3 zhconv==1.4.3
sqlalchemy==2.0.40 sqlalchemy==2.0.40
pymysql==1.1.1 pymysql==1.1.1
unstructured[docx]==0.18.15 unstructured[docx,csv,xlsx,pptx]==0.18.15

View File

@@ -8,7 +8,7 @@ COPY scripts/images/runtime/start.sh /opt/runtime/start.sh
ENV PYTHONPATH=/opt/runtime/datamate/ ENV PYTHONPATH=/opt/runtime/datamate/
RUN apt update \ RUN apt update \
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice\ && apt install -y libgl1 libglib2.0-0 vim libmagic1t64 libreoffice\
&& apt clean \ && apt clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*