feature: add external pdf extract operator by using mineru (#36)

* feature: add UnstructuredFormatter

* feature: add UnstructuredFormatter in db

* feature: add unstructured[docx]==0.18.15

* feature: support doc

* feature: add mineru

* feature: add external pdf extract operator by using mineru

* feature: mineru docker install bugfix

---------

Co-authored-by: Startalker <438747480@qq.com>
This commit is contained in:
Startalker
2025-10-30 15:55:10 +08:00
committed by GitHub
parent 2f7341dc1f
commit 155603b1ca
12 changed files with 370 additions and 3 deletions

View File

@@ -60,6 +60,7 @@ services:
MYSQL_USER: "root"
MYSQL_PASSWORD: "password"
MYSQL_DATABASE: "datamate"
PDF_FORMATTER_BASE_URL: "http://datamate-mineru:9001"
command:
- python
- /opt/runtime/datamate/operator_runtime.py
@@ -72,6 +73,27 @@ services:
- flow_volume:/flow
networks: [ datamate ]
# 4) mineru: MinerU-backed PDF extraction API.
# Opt-in service: it is only started when the `mineru` Compose profile is
# enabled (see `profiles` below).
datamate-mineru:
  container_name: datamate-mineru
  # NOTE(review): no tag is given, so this resolves to `datamate-mineru:latest`;
  # presumably the image is built locally beforehand — confirm.
  image: datamate-mineru
  # Restart only when the container exits with an error.
  restart: on-failure
  environment:
    # Values are quoted to match the other services' environment blocks.
    MINERU_MODEL_SOURCE: "local"
    MINERU_DEVICE_MODE: "cpu"  # cpu|cuda|npu|mps
    MINERU_BACKEND_MODE: "pipeline"
  # NOTE(review): privileged mode grants the container broad host access.
  # Presumably needed for accelerator (cuda/npu) device access; it looks
  # unnecessary for the default cpu mode — confirm before tightening.
  privileged: true
  command:
    - python
    - /opt/runtime/datamate/mineru/mineru_api.py
    - --port
    # Keep in sync with the PDF_FORMATTER_BASE_URL value
    # (http://datamate-mineru:9001) set elsewhere in this file.
    - "9001"
  volumes:
    - dataset_volume:/dataset
    # NOTE(review): mineru_log_volume must also be declared under the
    # top-level `volumes:` key — not visible from here, verify.
    - mineru_log_volume:/var/log/datamate/mineru
  networks: [ datamate ]
  profiles: [ mineru ]
volumes:
dataset_volume:
name: datamate-dataset-volume