From 924d977d6f2fa79f8cf3a94b5dd6f55a41c5c60a Mon Sep 17 00:00:00 2001 From: hhhhsc701 <56435672+hhhhsc701@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:31:06 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81mineru=20npu=E5=A4=84?= =?UTF-8?q?=E7=90=86=20(#174)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feature: unstructured支持简单pdf处理 * feature: update values.yaml to enhance ray-cluster configuration with security context, environment variables, and resource limits * feature: update deploy.yaml and process.py for mineru server configuration and PDF processing enhancements * feature: update deploy.yaml and process.py for mineru server configuration and PDF processing enhancements * feature: improve PDF processing logic and update dependencies in process.py and pyproject.toml * feature: improve PDF processing logic and update dependencies in process.py and pyproject.toml * feature: update Dockerfile for improved package source mirrors and add mineru-npu to build targets --- Makefile | 2 +- deployment/docker/datamate/docker-compose.yml | 21 ++++++--- deployment/helm/datamate/values.yaml | 5 +-- deployment/kubernetes/mineru/deploy.yaml | 26 ++++++----- .../ops/formatter/mineru_formatter/process.py | 45 ++++++++++++++----- runtime/ops/pyproject.toml | 6 ++- scripts/images/backend/Dockerfile | 8 +++- scripts/images/mineru-npu/Dockerfile | 32 +++++++++++++ 8 files changed, 110 insertions(+), 35 deletions(-) create mode 100644 scripts/images/mineru-npu/Dockerfile diff --git a/Makefile b/Makefile index ff44fa7..1837593 100644 --- a/Makefile +++ b/Makefile @@ -155,7 +155,7 @@ endef # ========== Build Targets ========== # Valid build targets -VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru +VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru mineru-npu # Generic docker build target with service name as parameter # Automatically prefixes image 
names with "datamate-" unless it's deer-flow diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml index 9c6ca1c..eec59a1 100644 --- a/deployment/docker/datamate/docker-compose.yml +++ b/deployment/docker/datamate/docker-compose.yml @@ -99,19 +99,28 @@ services: restart: on-failure environment: MINERU_MODEL_SOURCE: local - MINERU_DEVICE_MODE: cpu # cpu|cuda|npu|mps - MINERU_BACKEND_MODE: pipeline + MINERU_DEVICE_MODE: npu # cpu|cuda|npu|mps + VLLM_WORKER_MULTIPROC_METHOD: spawn privileged: true command: - - python - - /opt/runtime/datamate/mineru/mineru_api.py - - --port - - "9001" + - mineru-openai-server + - --engine vllm + - --host 0.0.0.0 + - --port "8000" volumes: - dataset_volume:/dataset - mineru_log_volume:/var/log/datamate/mineru + - /var/log/npu/:/usr/slog + - /usr/local/dcmi:/usr/local/dcmi + - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi + - /usr/local/Ascend/driver:/usr/local/Ascend/driver networks: [ datamate ] profiles: [ mineru ] + devices: + - /dev/davinci0 + - /dev/davinci_manager + - /dev/devmm_svm + - /dev/hisi_hdc # 5) redis datamate-redis: diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml index 7f913aa..3fa3840 100644 --- a/deployment/helm/datamate/values.yaml +++ b/deployment/helm/datamate/values.yaml @@ -175,7 +175,6 @@ ray-cluster: enabled: true head: rayStartParams: - object-store-memory: '78643200' num-cpus: '0' containerEnv: - name: RAY_DEDUP_LOGS @@ -194,8 +193,8 @@ ray-cluster: value: "datamate" resources: limits: - cpu: "2" - memory: "8G" + cpu: "4" + memory: "16G" requests: cpu: "1" memory: "2G" diff --git a/deployment/kubernetes/mineru/deploy.yaml b/deployment/kubernetes/mineru/deploy.yaml index 8c61f43..c4b1491 100644 --- a/deployment/kubernetes/mineru/deploy.yaml +++ b/deployment/kubernetes/mineru/deploy.yaml @@ -22,26 +22,32 @@ spec: image: datamate-mineru imagePullPolicy: IfNotPresent command: - - python - - 
/opt/runtime/datamate/mineru/mineru_api.py + - mineru-openai-server + args: + - --engine + - vllm + - --host + - 0.0.0.0 - --port - - "9001" + - "8000" env: - name: MINERU_MODEL_SOURCE value: local - name: MINERU_DEVICE_MODE - value: cpu - - name: MINERU_BACKEND_MODE - value: pipeline + value: npu + - name: VLLM_WORKER_MULTIPROC_METHOD + value: spawn ports: - - containerPort: 9001 + - containerPort: 8000 resources: limits: - cpu: 16 + cpu: 8 memory: 32Gi + huawei.com/Ascend910: 1 requests: cpu: 100m memory: 100Mi + huawei.com/Ascend910: 1 volumeMounts: - name: dataset-volume mountPath: /dataset @@ -67,8 +73,8 @@ metadata: spec: type: ClusterIP ports: - - port: 9001 - targetPort: 9001 + - port: 8000 + targetPort: 8000 protocol: TCP selector: app: datamate diff --git a/runtime/ops/formatter/mineru_formatter/process.py b/runtime/ops/formatter/mineru_formatter/process.py index 181b870..df30c2a 100644 --- a/runtime/ops/formatter/mineru_formatter/process.py +++ b/runtime/ops/formatter/mineru_formatter/process.py @@ -5,13 +5,17 @@ Description: MinerU PDF文本抽取 Create: 2025/10/29 17:24 """ -import json +import os +import shutil import time -from loguru import logger from typing import Dict, Any -from datamate.core.base_op import Mapper from datamate.common.utils.rest_client import http_request +from datamate.core.base_op import Mapper +from loguru import logger +from mineru.cli.common import do_parse, read_fn +from mineru.cli.fast_api import get_infer_result +from pypdf import PdfReader class MineruFormatter(Mapper): @@ -19,21 +23,40 @@ class MineruFormatter(Mapper): def __init__(self, *args, **kwargs): super(MineruFormatter, self).__init__(*args, **kwargs) - self.base_url = "http://datamate-mineru:9001" - self.pdf_extract_url = f"{self.base_url}/api/pdf-extract" + self.server_url = "http://datamate-mineru:8000" + self.backend = "vlm-http-client" + self.output_dir = "/dataset/outputs" def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]: start = time.time() filename = 
sample[self.filename_key] - if not filename.lower().endswith(".pdf"): + filename_without_ext = os.path.splitext(filename)[0] + if not filename.lower().endswith((".png", ".jpeg", ".jpg", ".webp", ".gif", ".pdf")): return sample try: - data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]} - response = http_request(method="POST", url=self.pdf_extract_url, data=data) - sample[self.text_key] = json.loads(response.text).get("result") + filepath = sample[self.filepath_key] + parse_dir = os.path.join(self.output_dir, filename_without_ext, "vlm") + pdf_bytes = read_fn(filepath) + total_page = len(PdfReader(filepath).pages) + content = "" + for page in range(0, total_page, 10): + do_parse( + output_dir=self.output_dir, + pdf_file_names=[filename_without_ext], + pdf_bytes_list=[pdf_bytes], + p_lang_list=["ch"], + backend=self.backend, + server_url=self.server_url, + start_page_id=page, + end_page_id=min(page + 9, total_page - 1), + ) + if os.path.exists(parse_dir): + content += get_infer_result(".md", filename_without_ext, parse_dir) + shutil.rmtree(parse_dir) + sample[self.text_key] = content logger.info( f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s") - except UnicodeDecodeError as err: - logger.exception(f"fileName: {filename}, method: MineruFormatter causes decode error: {err}") + except Exception as e: + logger.exception(f"fileName: {filename}, method: MineruFormatter causes error: {e}") raise return sample diff --git a/runtime/ops/pyproject.toml b/runtime/ops/pyproject.toml index 07fc092..ce1afc4 100644 --- a/runtime/ops/pyproject.toml +++ b/runtime/ops/pyproject.toml @@ -11,7 +11,9 @@ dependencies = [ "emoji>=2.15.0", "jieba>=0.42.1", "loguru>=0.7.3", - "numpy==1.23.3", + "mineru>=2.6.5", + "numpy==1.24.3", + "python-multipart>=0.0.20", "opencv-contrib-python-headless==4.7.0.72", "opencv-python-headless==4.7.0.72", "openslide-python>=1.4.3", @@ -29,4 +31,4 @@ dependencies = [ 
"sqlalchemy>=2.0.44", "xmltodict>=1.0.2", "zhconv>=1.4.3", -] \ No newline at end of file +] diff --git a/scripts/images/backend/Dockerfile b/scripts/images/backend/Dockerfile index 968efab..5df63ad 100644 --- a/scripts/images/backend/Dockerfile +++ b/scripts/images/backend/Dockerfile @@ -1,6 +1,8 @@ FROM maven:3-eclipse-temurin-8 AS datax-builder -RUN apt-get update && \ +RUN sed -i "s@http://.*archive.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \ + sed -i "s@http://.*security.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \ + apt-get update && \ apt-get install -y git && \ git clone https://github.com/alibaba/DataX.git @@ -21,7 +23,9 @@ RUN cd /opt/backend && \ FROM eclipse-temurin:21-jdk -RUN apt-get update && \ +RUN sed -i "s@http://.*archive.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \ + sed -i "s@http://.*security.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \ + apt-get update && \ apt-get install -y vim wget curl nfs-common rsync python3 python3-pip python-is-python3 dos2unix && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/scripts/images/mineru-npu/Dockerfile b/scripts/images/mineru-npu/Dockerfile new file mode 100644 index 0000000..f80721e --- /dev/null +++ b/scripts/images/mineru-npu/Dockerfile @@ -0,0 +1,32 @@ +# 基础镜像配置 vLLM 或 LMDeploy ,请根据实际需要选择其中一个,要求 ARM(AArch64) CPU + Ascend NPU。 +# Base image containing the vLLM inference environment, requiring ARM(AArch64) CPU + Ascend NPU. +FROM quay.io/ascend/vllm-ascend:v0.11.0rc2 +# Base image containing the LMDeploy inference environment, requiring ARM(AArch64) CPU + Ascend NPU. 
+# FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:mineru-a2 + + +# Install libgl for opencv support & Noto fonts for Chinese characters +RUN apt-get update && \ + apt-get install -y \ + fonts-noto-core \ + fonts-noto-cjk \ + fontconfig \ + libgl1 \ + libglib2.0-0 && \ + fc-cache -fv && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install the latest mineru (>=2.6.5) together with pinned numpy and opencv-python versions +RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \ + python3 -m pip install 'mineru[core]>=2.6.5' \ + numpy==1.26.4 \ + opencv-python==4.11.0.86 \ + -i https://mirrors.aliyun.com/pypi/simple && \ + python3 -m pip cache purge + +# Download models and update the configuration file +RUN TORCH_DEVICE_BACKEND_AUTOLOAD=0 /bin/bash -c "mineru-models-download -s modelscope -m all" + +# Set the entry point to export MINERU_MODEL_SOURCE=local and then exec the command passed to the container +ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"] \ No newline at end of file