You've already forked DataMate
feature: add external pdf extract operator by using mineru (#36)
* feature: add UnstructuredFormatter * feature: add UnstructuredFormatter in db * feature: add unstructured[docx]==0.18.15 * feature: support doc * feature: add mineru * feature: add external pdf extract operator by using mineru * feature: mineru docker install bugfix --------- Co-authored-by: Startalker <438747480@qq.com>
This commit is contained in:
22
Makefile
22
Makefile
@@ -1,5 +1,6 @@
|
|||||||
MAKEFLAGS += --no-print-directory
|
MAKEFLAGS += --no-print-directory
|
||||||
|
|
||||||
|
WITH_MINERU ?= false # 默认不构建mineru
|
||||||
VERSION ?= latest
|
VERSION ?= latest
|
||||||
NAMESPACE ?= datamate
|
NAMESPACE ?= datamate
|
||||||
|
|
||||||
@@ -8,7 +9,7 @@ build-%:
|
|||||||
$(MAKE) $*-docker-build
|
$(MAKE) $*-docker-build
|
||||||
|
|
||||||
.PHONY: build
|
.PHONY: build
|
||||||
build: backend-docker-build frontend-docker-build runtime-docker-build
|
build: backend-docker-build frontend-docker-build runtime-docker-build $(if $(WITH_MINERU),mineru-docker-build)
|
||||||
|
|
||||||
.PHONY: create-namespace
|
.PHONY: create-namespace
|
||||||
create-namespace:
|
create-namespace:
|
||||||
@@ -85,6 +86,9 @@ deer-flow-docker-build:
|
|||||||
cp deployment/docker/deer-flow/conf.yaml.example ../deer-flow/conf.yaml
|
cp deployment/docker/deer-flow/conf.yaml.example ../deer-flow/conf.yaml
|
||||||
cd ../deer-flow && docker compose build
|
cd ../deer-flow && docker compose build
|
||||||
|
|
||||||
|
.PHONY: mineru-docker-build
|
||||||
|
mineru-docker-build:
|
||||||
|
docker build -t datamate-mineru:$(VERSION) . -f scripts/images/mineru/Dockerfile
|
||||||
.PHONY: backend-docker-install
|
.PHONY: backend-docker-install
|
||||||
backend-docker-install:
|
backend-docker-install:
|
||||||
cd deployment/docker/datamate && docker compose up -d backend
|
cd deployment/docker/datamate && docker compose up -d backend
|
||||||
@@ -109,6 +113,22 @@ runtime-docker-install:
|
|||||||
runtime-docker-uninstall:
|
runtime-docker-uninstall:
|
||||||
cd deployment/docker/datamate && docker compose down runtime
|
cd deployment/docker/datamate && docker compose down runtime
|
||||||
|
|
||||||
|
.PHONY: mineru-docker-install
|
||||||
|
mineru-docker-install:
|
||||||
|
cd deployment/docker/datamate && cp .env.example .env && docker compose up -d datamate-mineru
|
||||||
|
|
||||||
|
.PHONY: mineru-docker-uninstall
|
||||||
|
mineru-docker-uninstall:
|
||||||
|
cd deployment/docker/datamate && docker compose down datamate-mineru
|
||||||
|
|
||||||
|
.PHONY: mineru-k8s-install
|
||||||
|
mineru-k8s-install: create-namespace
|
||||||
|
kubectl apply -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE)
|
||||||
|
|
||||||
|
.PHONY: mineru-k8s-uninstall
|
||||||
|
mineru-k8s-uninstall:
|
||||||
|
kubectl delete -f deployment/kubernetes/mineru/deploy.yaml -n $(NAMESPACE)
|
||||||
|
|
||||||
.PHONY: datamate-docker-install
|
.PHONY: datamate-docker-install
|
||||||
datamate-docker-install:
|
datamate-docker-install:
|
||||||
cd deployment/docker/datamate && cp .env.example .env && docker compose -f docker-compose.yml up -d
|
cd deployment/docker/datamate && cp .env.example .env && docker compose -f docker-compose.yml up -d
|
||||||
|
|||||||
@@ -60,6 +60,7 @@ services:
|
|||||||
MYSQL_USER: "root"
|
MYSQL_USER: "root"
|
||||||
MYSQL_PASSWORD: "password"
|
MYSQL_PASSWORD: "password"
|
||||||
MYSQL_DATABASE: "datamate"
|
MYSQL_DATABASE: "datamate"
|
||||||
|
PDF_FORMATTER_BASE_URL: "http://datamate-mineru:9001"
|
||||||
command:
|
command:
|
||||||
- python
|
- python
|
||||||
- /opt/runtime/datamate/operator_runtime.py
|
- /opt/runtime/datamate/operator_runtime.py
|
||||||
@@ -72,6 +73,27 @@ services:
|
|||||||
- flow_volume:/flow
|
- flow_volume:/flow
|
||||||
networks: [ datamate ]
|
networks: [ datamate ]
|
||||||
|
|
||||||
|
# 4) mineru
|
||||||
|
datamate-mineru:
|
||||||
|
container_name: datamate-mineru
|
||||||
|
image: datamate-mineru
|
||||||
|
restart: on-failure
|
||||||
|
environment:
|
||||||
|
MINERU_MODEL_SOURCE: local
|
||||||
|
MINERU_DEVICE_MODE: cpu # cpu|cuda|npu|mps
|
||||||
|
MINERU_BACKEND_MODE: pipeline
|
||||||
|
privileged: true
|
||||||
|
command:
|
||||||
|
- python
|
||||||
|
- /opt/runtime/datamate/mineru/mineru_api.py
|
||||||
|
- --port
|
||||||
|
- "9001"
|
||||||
|
volumes:
|
||||||
|
- dataset_volume:/dataset
|
||||||
|
- mineru_log_volume:/var/log/datamate/mineru
|
||||||
|
networks: [ datamate ]
|
||||||
|
profiles: [ mineru ]
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
dataset_volume:
|
dataset_volume:
|
||||||
name: datamate-dataset-volume
|
name: datamate-dataset-volume
|
||||||
|
|||||||
@@ -77,6 +77,8 @@ head:
|
|||||||
value: "password"
|
value: "password"
|
||||||
- name: MYSQL_DATABASE
|
- name: MYSQL_DATABASE
|
||||||
value: "datamate"
|
value: "datamate"
|
||||||
|
- name: PDF_FORMATTER_BASE_URL
|
||||||
|
value: "http://datamate-mineru:9001"
|
||||||
# - name: EXAMPLE_ENV
|
# - name: EXAMPLE_ENV
|
||||||
# value: "1"
|
# value: "1"
|
||||||
envFrom: []
|
envFrom: []
|
||||||
@@ -154,6 +156,8 @@ head:
|
|||||||
value: "password"
|
value: "password"
|
||||||
- name: MYSQL_DATABASE
|
- name: MYSQL_DATABASE
|
||||||
value: "datamate"
|
value: "datamate"
|
||||||
|
- name: PDF_FORMATTER_BASE_URL
|
||||||
|
value: "http://datamate-mineru:9001"
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8081
|
- containerPort: 8081
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
@@ -221,6 +225,8 @@ worker:
|
|||||||
value: "password"
|
value: "password"
|
||||||
- name: MYSQL_DATABASE
|
- name: MYSQL_DATABASE
|
||||||
value: "datamate"
|
value: "datamate"
|
||||||
|
- name: PDF_FORMATTER_BASE_URL
|
||||||
|
value: "http://datamate-mineru:9001"
|
||||||
# - name: EXAMPLE_ENV
|
# - name: EXAMPLE_ENV
|
||||||
# value: "1"
|
# value: "1"
|
||||||
envFrom: []
|
envFrom: []
|
||||||
|
|||||||
70
deployment/kubernetes/mineru/deploy.yaml
Normal file
70
deployment/kubernetes/mineru/deploy.yaml
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: datamate-mineru
|
||||||
|
labels:
|
||||||
|
app: datamate
|
||||||
|
tier: mineru
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: datamate
|
||||||
|
tier: mineru
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: datamate
|
||||||
|
tier: mineru
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: mineru
|
||||||
|
image: datamate-mineru
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
command:
|
||||||
|
- python
|
||||||
|
- /opt/runtime/datamate/mineru/mineru_api.py
|
||||||
|
- --port
|
||||||
|
- "9001"
|
||||||
|
env:
|
||||||
|
- name: MINERU_MODEL_SOURCE
|
||||||
|
value: local
|
||||||
|
- name: MINERU_DEVICE_MODE
|
||||||
|
value: cpu
|
||||||
|
- name: MINERU_BACKEND_MODE
|
||||||
|
value: pipeline
|
||||||
|
ports:
|
||||||
|
- containerPort: 9001
|
||||||
|
volumeMounts:
|
||||||
|
- name: dataset-volume
|
||||||
|
mountPath: /dataset
|
||||||
|
- name: log-volume
|
||||||
|
mountPath: /var/log/datamate/mineru
|
||||||
|
subPath: mineru
|
||||||
|
volumes:
|
||||||
|
- name: dataset-volume
|
||||||
|
hostPath:
|
||||||
|
path: /opt/datamate/data/dataset
|
||||||
|
type: DirectoryOrCreate
|
||||||
|
- name: log-volume
|
||||||
|
hostPath:
|
||||||
|
path: /opt/datamate/data/log
|
||||||
|
type: DirectoryOrCreate
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: datamate-mineru
|
||||||
|
labels:
|
||||||
|
app: datamate
|
||||||
|
tier: mineru
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
ports:
|
||||||
|
- port: 9001
|
||||||
|
targetPort: 9001
|
||||||
|
protocol: TCP
|
||||||
|
selector:
|
||||||
|
app: datamate
|
||||||
|
tier: mineru
|
||||||
112
runtime/mineru/mineru_api.py
Normal file
112
runtime/mineru/mineru_api.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
import shutil
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
import os
|
||||||
|
|
||||||
|
import click
|
||||||
|
import uvicorn
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from pathlib import Path
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from loguru import logger
|
||||||
|
from mineru.cli.common import aio_do_parse, read_fn
|
||||||
|
from mineru.cli.fast_api import get_infer_result
|
||||||
|
|
||||||
|
# 日志配置
|
||||||
|
LOG_DIR = "/var/log/datamate/mineru"
|
||||||
|
os.makedirs(LOG_DIR, exist_ok=True)
|
||||||
|
logger.add(
|
||||||
|
f"{LOG_DIR}/mineru.log",
|
||||||
|
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}",
|
||||||
|
level="DEBUG",
|
||||||
|
enqueue=True
|
||||||
|
)
|
||||||
|
|
||||||
|
app = FastAPI()
|
||||||
|
class PDFParseRequest(BaseModel):
|
||||||
|
source_path: str
|
||||||
|
export_path: str
|
||||||
|
|
||||||
|
@app.post(path="/api/pdf-extract")
|
||||||
|
async def parse_pdf(request: PDFParseRequest):
|
||||||
|
try:
|
||||||
|
start = time.time()
|
||||||
|
# 创建唯一的输出目录
|
||||||
|
unique_id = str(uuid.uuid4())
|
||||||
|
unique_dir = os.path.join(request.export_path, unique_id)
|
||||||
|
os.makedirs(unique_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# 如果是PDF,使用read_fn处理
|
||||||
|
file_path = Path(request.source_path)
|
||||||
|
file_suffix = file_path.suffix.lower()
|
||||||
|
if file_suffix == ".pdf":
|
||||||
|
try:
|
||||||
|
pdf_bytes = read_fn(file_path)
|
||||||
|
pdf_name = file_path.stem
|
||||||
|
pdf_bytes_list = [pdf_bytes]
|
||||||
|
pdf_file_names = [pdf_name]
|
||||||
|
except Exception as e:
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=400,
|
||||||
|
content={"error": f"Failed to load file: {str(e)}"}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=400,
|
||||||
|
content={"error": f"Unsupported file type: {file_suffix}"}
|
||||||
|
)
|
||||||
|
|
||||||
|
# 调用异步处理函数
|
||||||
|
await aio_do_parse(
|
||||||
|
output_dir=unique_dir,
|
||||||
|
pdf_file_names=pdf_file_names,
|
||||||
|
pdf_bytes_list=pdf_bytes_list,
|
||||||
|
p_lang_list=["ch"],
|
||||||
|
f_draw_layout_bbox=False,
|
||||||
|
f_draw_span_bbox=False,
|
||||||
|
f_dump_orig_pdf=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
if os.getenv("MINERU_BACKEND_MODE").startswith("pipeline"):
|
||||||
|
parse_dir = os.path.join(unique_dir, pdf_name, "auto")
|
||||||
|
else:
|
||||||
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
||||||
|
|
||||||
|
content = ""
|
||||||
|
if os.path.exists(parse_dir):
|
||||||
|
content = get_infer_result(".md", pdf_name, parse_dir)
|
||||||
|
|
||||||
|
if os.path.exists(unique_dir):
|
||||||
|
try:
|
||||||
|
shutil.rmtree(unique_dir)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to remove unique dir for {unique_id}: {str(e)}")
|
||||||
|
|
||||||
|
logger.info(f"fileName: {file_path.name} costs {time.time() - start:.6f} s")
|
||||||
|
|
||||||
|
return JSONResponse(status_code=200, content={"result": content})
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception(e)
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=500,
|
||||||
|
content={"error": f"Failed to process file: {str(e)}"}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
|
||||||
|
@click.option('--ip', default='0.0.0.0', help='Service ip for this API, default to use 0.0.0.0.')
|
||||||
|
@click.option('--port', default=9001, type=int, help='Service port for this API, default to use 8082.')
|
||||||
|
def main(ip, port):
|
||||||
|
"""Create API for Submitting Job to MinerU"""
|
||||||
|
logger.info(f"Start MinerU FastAPI Service: http://{ip}:{port}")
|
||||||
|
uvicorn.run(
|
||||||
|
app,
|
||||||
|
host=ip,
|
||||||
|
port=port
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
||||||
@@ -21,6 +21,7 @@ def _import_operators():
|
|||||||
from . import file_exporter
|
from . import file_exporter
|
||||||
from . import slide_formatter
|
from . import slide_formatter
|
||||||
from . import unstructured_formatter
|
from . import unstructured_formatter
|
||||||
|
from . import external_pdf_formatter
|
||||||
|
|
||||||
|
|
||||||
_import_operators()
|
_import_operators()
|
||||||
|
|||||||
6
runtime/ops/formatter/external_pdf_formatter/__init__.py
Normal file
6
runtime/ops/formatter/external_pdf_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from datamate.core.base_op import OPERATORS
|
||||||
|
|
||||||
|
OPERATORS.register_module(module_name='ExternalPDFFormatter',
|
||||||
|
module_path="ops.formatter.external_pdf_formatter.process")
|
||||||
16
runtime/ops/formatter/external_pdf_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/external_pdf_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
name: '外部PDF文本抽取'
|
||||||
|
name_en: 'External PDF Text Extraction'
|
||||||
|
description: '基于外部API,抽取PDF中的文本。'
|
||||||
|
description_en: 'Extracts text from PDF files based on external APIs.'
|
||||||
|
language: 'python'
|
||||||
|
vendor: 'huawei'
|
||||||
|
raw_id: 'ExternalPDFFormatter'
|
||||||
|
version: '1.0.0'
|
||||||
|
types:
|
||||||
|
- 'collect'
|
||||||
|
modal: 'text'
|
||||||
|
effect:
|
||||||
|
before: ''
|
||||||
|
after: ''
|
||||||
|
inputs: 'text'
|
||||||
|
outputs: 'text'
|
||||||
38
runtime/ops/formatter/external_pdf_formatter/process.py
Normal file
38
runtime/ops/formatter/external_pdf_formatter/process.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
#!/user/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Description: 外部PDF文本抽取
|
||||||
|
Create: 2025/10/29 17:24
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from loguru import logger
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
from datamate.core.base_op import Mapper
|
||||||
|
from datamate.common.utils.rest_client import http_request
|
||||||
|
|
||||||
|
|
||||||
|
class ExternalPDFFormatter(Mapper):
|
||||||
|
"""基于外部API,抽取PDF中的文本"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super(ExternalPDFFormatter, self).__init__(*args, **kwargs)
|
||||||
|
self.base_url = os.getenv("EXTERNAL_PDF_BASE_URL", "http://datamate-mineru:9001")
|
||||||
|
self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"
|
||||||
|
|
||||||
|
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
start = time.time()
|
||||||
|
filename = sample[self.filename_key]
|
||||||
|
try:
|
||||||
|
data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]}
|
||||||
|
response = http_request(method="POST", url=self.pdf_extract_url, data=data)
|
||||||
|
sample[self.text_key] = json.loads(response.text).get("result")
|
||||||
|
logger.info(
|
||||||
|
f"fileName: {filename}, method: ExternalPDFFormatter costs {(time.time() - start):6f} s")
|
||||||
|
except UnicodeDecodeError as err:
|
||||||
|
logger.exception(f"fileName: {filename}, method: ExternalPDFFormatter causes decode error: {err}")
|
||||||
|
raise
|
||||||
|
return sample
|
||||||
53
runtime/python-executor/datamate/common/utils/rest_client.py
Normal file
53
runtime/python-executor/datamate/common/utils/rest_client.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import requests
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
|
||||||
|
from starlette.responses import JSONResponse
|
||||||
|
|
||||||
|
|
||||||
|
def http_request(method: str, url: str, data: Optional[Dict[str, Any]] = None):
|
||||||
|
"""
|
||||||
|
通用HTTP请求方法
|
||||||
|
|
||||||
|
Args:
|
||||||
|
method: 请求方法 ('GET', 'POST', 'PUT', 'DELETE', 'PATCH')
|
||||||
|
url: 请求URL
|
||||||
|
data: 请求数据(JSON格式)
|
||||||
|
|
||||||
|
Returns: 格式化响应体
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
requests.exceptions.RequestException: 请求异常
|
||||||
|
ValueError: 响应状态码错误
|
||||||
|
"""
|
||||||
|
success_codes = [200, 201, 202, 204]
|
||||||
|
|
||||||
|
# 设置默认请求头
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
|
||||||
|
method = method.upper()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 准备请求参数
|
||||||
|
request_kwargs = {"url": url, "headers": headers}
|
||||||
|
|
||||||
|
# 根据请求方法添加数据
|
||||||
|
if data is not None:
|
||||||
|
if method in ["POST", "PUT", "DELETE"]:
|
||||||
|
request_kwargs["json"] = data
|
||||||
|
elif method == "GET":
|
||||||
|
request_kwargs["params"] = data
|
||||||
|
|
||||||
|
# 发送请求
|
||||||
|
response = requests.request(method, **request_kwargs)
|
||||||
|
|
||||||
|
# 检查响应状态码
|
||||||
|
if response.status_code not in success_codes:
|
||||||
|
raise ValueError(
|
||||||
|
f"Request failed! Status code: {response.status_code}, "
|
||||||
|
f"content: {response.text}, "
|
||||||
|
f"URL: {url}"
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
raise requests.exceptions.RequestException(f"Request exception: {str(e)}")
|
||||||
@@ -69,6 +69,7 @@ INSERT IGNORE INTO t_operator
|
|||||||
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
|
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
|
||||||
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
|
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
|
||||||
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
|
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
|
||||||
|
('ExternalPDFFormatter', '外部PDF文本抽取', '基于外部API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
|
||||||
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
|
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
|
||||||
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
|
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
|
||||||
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
|
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
|
||||||
@@ -122,7 +123,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
|
|||||||
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
|
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
|
||||||
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
|
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
|
||||||
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
|
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
|
||||||
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');
|
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
|
||||||
|
|
||||||
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
||||||
SELECT c.id, o.id
|
SELECT c.id, o.id
|
||||||
@@ -138,4 +139,4 @@ SELECT c.id, o.id
|
|||||||
FROM t_operator_category c
|
FROM t_operator_category c
|
||||||
CROSS JOIN t_operator o
|
CROSS JOIN t_operator o
|
||||||
WHERE c.id IN (7, 8, 11)
|
WHERE c.id IN (7, 8, 11)
|
||||||
AND o.id IN ('FileExporter', 'UnstructuredFormatter');
|
AND o.id IN ('FileExporter', 'UnstructuredFormatter', 'ExternalPDFFormatter');
|
||||||
|
|||||||
22
scripts/images/mineru/Dockerfile
Normal file
22
scripts/images/mineru/Dockerfile
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
COPY runtime/mineru /opt/runtime/datamate/mineru
|
||||||
|
|
||||||
|
RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
|
||||||
|
&& apt-get update \
|
||||||
|
&& apt-get install -y curl vim libgl1 libglx0 libopengl0 libglib2.0-0 procps \
|
||||||
|
&& apt-get clean \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
RUN pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple && \
|
||||||
|
pip config --user set global.trusted-host mirrors.huaweicloud.com && \
|
||||||
|
pip install --upgrade setuptools && \
|
||||||
|
pip install -U 'mineru[core]==2.5.4' --break-system-packages && \
|
||||||
|
pip cache purge
|
||||||
|
|
||||||
|
ENV CURL_CA_BUNDLE=""
|
||||||
|
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
|
||||||
|
|
||||||
|
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
|
||||||
|
|
||||||
|
ENV MINERU_MODEL_SOURCE=local
|
||||||
Reference in New Issue
Block a user