feature: add external PDF extraction operator using MinerU (#36)

* feature: add UnstructuredFormatter

* feature: add UnstructuredFormatter in db

* feature: add unstructured[docx]==0.18.15

* feature: support doc

* feature: add mineru

* feature: add external PDF extraction operator using MinerU

* feature: mineru docker install bugfix

---------

Co-authored-by: Startalker <438747480@qq.com>
This commit is contained in:
Startalker
2025-10-30 15:55:10 +08:00
committed by GitHub
parent 2f7341dc1f
commit 155603b1ca
12 changed files with 370 additions and 3 deletions

View File

@@ -60,6 +60,7 @@ services:
MYSQL_USER: "root"
MYSQL_PASSWORD: "password"
MYSQL_DATABASE: "datamate"
PDF_FORMATTER_BASE_URL: "http://datamate-mineru:9001"
command:
- python
- /opt/runtime/datamate/operator_runtime.py
@@ -72,6 +73,27 @@ services:
- flow_volume:/flow
networks: [ datamate ]
# 4) mineru
# MinerU API service. It listens on port 9001 and joins the `datamate`
# network; it is only started when the `mineru` Compose profile is enabled.
datamate-mineru:
  image: datamate-mineru
  container_name: datamate-mineru
  restart: on-failure
  # NOTE(review): privileged mode grants broad host access; confirm it is
  # actually required for the selected MINERU_DEVICE_MODE.
  privileged: true
  profiles:
    - mineru
  networks:
    - datamate
  environment:
    MINERU_MODEL_SOURCE: "local"
    MINERU_DEVICE_MODE: "cpu"  # cpu|cuda|npu|mps
    MINERU_BACKEND_MODE: "pipeline"
  # Port is passed as a quoted string so it stays a string argument.
  command:
    - python
    - /opt/runtime/datamate/mineru/mineru_api.py
    - --port
    - "9001"
  volumes:
    # Named volumes: dataset storage and the MinerU log directory.
    - dataset_volume:/dataset
    - mineru_log_volume:/var/log/datamate/mineru
volumes:
dataset_volume:
name: datamate-dataset-volume

View File

@@ -77,6 +77,8 @@ head:
value: "password"
- name: MYSQL_DATABASE
value: "datamate"
- name: PDF_FORMATTER_BASE_URL
value: "http://datamate-mineru:9001"
# - name: EXAMPLE_ENV
# value: "1"
envFrom: []
@@ -154,6 +156,8 @@ head:
value: "password"
- name: MYSQL_DATABASE
value: "datamate"
- name: PDF_FORMATTER_BASE_URL
value: "http://datamate-mineru:9001"
ports:
- containerPort: 8081
volumeMounts:
@@ -221,6 +225,8 @@ worker:
value: "password"
- name: MYSQL_DATABASE
value: "datamate"
- name: PDF_FORMATTER_BASE_URL
value: "http://datamate-mineru:9001"
# - name: EXAMPLE_ENV
# value: "1"
envFrom: []

View File

@@ -0,0 +1,70 @@
# MinerU API, run as a single-replica Deployment.
# The pod labels (app=datamate, tier=mineru) are what the `datamate-mineru`
# Service selects on; the API is started on port 9001.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: datamate-mineru
  labels:
    app: datamate
    tier: mineru
spec:
  replicas: 1
  selector:
    matchLabels:
      app: datamate
      tier: mineru
  template:
    metadata:
      labels:
        app: datamate
        tier: mineru
    spec:
      containers:
        - name: mineru
          image: datamate-mineru
          imagePullPolicy: IfNotPresent
          command:
            - python
            - /opt/runtime/datamate/mineru/mineru_api.py
            - --port
            - "9001"
          env:
            - name: MINERU_MODEL_SOURCE
              value: local
            - name: MINERU_DEVICE_MODE
              value: cpu  # cpu|cuda|npu|mps
            - name: MINERU_BACKEND_MODE
              value: pipeline
          ports:
            - containerPort: 9001
          # Keep the pod out of the Service endpoints until the API port
          # accepts connections, so clients are not routed to a process
          # that is still starting up.
          readinessProbe:
            tcpSocket:
              port: 9001
            initialDelaySeconds: 5
            periodSeconds: 10
          volumeMounts:
            - name: dataset-volume
              mountPath: /dataset
            # Logs land in the `mineru` sub-directory of the log volume.
            - name: log-volume
              mountPath: /var/log/datamate/mineru
              subPath: mineru
      volumes:
        # hostPath volumes are node-local; the directories are created on
        # the node if they do not exist yet.
        - name: dataset-volume
          hostPath:
            path: /opt/datamate/data/dataset
            type: DirectoryOrCreate
        - name: log-volume
          hostPath:
            path: /opt/datamate/data/log
            type: DirectoryOrCreate
---
# Cluster-internal Service for the MinerU pods: forwards TCP 9001 to
# container port 9001 on pods labelled app=datamate, tier=mineru.
apiVersion: v1
kind: Service
metadata:
  name: datamate-mineru
  labels: {app: datamate, tier: mineru}
spec:
  type: ClusterIP
  selector:
    app: datamate
    tier: mineru
  ports:
    - protocol: TCP
      port: 9001
      targetPort: 9001