支持mineru npu处理 (#174)

* feature: unstructured支持简单pdf处理 * feature: update values.yaml to enhance ray-cluster configuration with security context, environment variables, and resource limits * feature: update deploy.yaml and process.py for mineru server configuration and PDF processing enhancements * feature: update deploy.yaml and process.py for mineru server configuration and PDF processing enhancements * feature: improve PDF processing logic and update dependencies in process.py and pyproject.toml * feature: improve PDF processing logic and update dependencies in process.py and pyproject.toml * feature: update Dockerfile for improved package source mirrors and add mineru-npu to build targets
2025-12-17 16:31:06 +08:00
parent 3b4f8488e8
commit 924d977d6f
8 changed files with 110 additions and 35 deletions
@@ -155,7 +155,7 @@ endef
 # ========== Build Targets ==========
 # Valid build targets
-VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru
+VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru mineru-npu
 # Generic docker build target with service name as parameter
 # Automatically prefixes image names with "datamate-" unless it's deer-flow
@@ -99,19 +99,28 @@ services:
    restart: on-failure
    environment:
      MINERU_MODEL_SOURCE: local
-      MINERU_DEVICE_MODE: cpu  # cpu|cuda|npu|mps
+      MINERU_DEVICE_MODE: npu  # cpu|cuda|npu|mps
-      MINERU_BACKEND_MODE: pipeline
+      VLLM_WORKER_MULTIPROC_METHOD: spawn
    privileged: true
    command:
+      # List (exec) form is not shell-split: every argv token must be its own
+      # item, matching the args list of the Kubernetes deployment.
-      - python
+      - mineru-openai-server
-      - /opt/runtime/datamate/mineru/mineru_api.py
+      - --engine
+      - vllm
-      - --port
+      - --host
+      - 0.0.0.0
-      - "9001"
+      - --port
+      - "8000"
    volumes:
      - dataset_volume:/dataset
      - mineru_log_volume:/var/log/datamate/mineru
      - /var/log/npu/:/usr/slog
      - /usr/local/dcmi:/usr/local/dcmi
      - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
      - /usr/local/Ascend/driver:/usr/local/Ascend/driver
    networks: [ datamate ]
    profiles: [ mineru ]
    devices:
      - /dev/davinci0
      - /dev/davinci_manager
      - /dev/devmm_svm
      - /dev/hisi_hdc
  # 5) redis
  datamate-redis:
@@ -175,7 +175,6 @@ ray-cluster:
  enabled: true
  head:
    rayStartParams:
      object-store-memory: '78643200'
      num-cpus: '0'
    containerEnv:
      - name: RAY_DEDUP_LOGS
@@ -194,8 +193,8 @@ ray-cluster:
        value: "datamate"
    resources:
      limits:
-        cpu: "2"
+        cpu: "4"
-        memory: "8G"
+        memory: "16G"
      requests:
        cpu: "1"
        memory: "2G"
@@ -22,26 +22,32 @@ spec:
          image: datamate-mineru
          imagePullPolicy: IfNotPresent
          command:
-            - python
+            - mineru-openai-server
-            - /opt/runtime/datamate/mineru/mineru_api.py
+          args:
            - --engine
            - vllm
            - --host
            - 0.0.0.0
            - --port
-            - "9001"
+            - "8000"
          env:
            - name: MINERU_MODEL_SOURCE
              value: local
            - name: MINERU_DEVICE_MODE
-              value: cpu
+              value: npu
-            - name: MINERU_BACKEND_MODE
+            - name: VLLM_WORKER_MULTIPROC_METHOD
-              value: pipeline
+              value: spawn
          ports:
-            - containerPort: 9001
+            - containerPort: 8000
          resources:
            limits:
-              cpu: 16
+              cpu: 8
              memory: 32Gi
              huawei.com/Ascend910: 1
            requests:
              cpu: 100m
              memory: 100Mi
              huawei.com/Ascend910: 1
          volumeMounts:
            - name: dataset-volume
              mountPath: /dataset
@@ -67,8 +73,8 @@ metadata:
 spec:
  type: ClusterIP
  ports:
-    - port: 9001
+    - port: 8000
-      targetPort: 9001
+      targetPort: 8000
      protocol: TCP
  selector:
    app: datamate
@@ -5,13 +5,17 @@
 Description: MinerU PDF文本抽取
 Create: 2025/10/29 17:24
 """
-import json
+import os
 import shutil
 import time
 from loguru import logger
 from typing import Dict, Any
 from datamate.core.base_op import Mapper
 from datamate.common.utils.rest_client import http_request
 from datamate.core.base_op import Mapper
 from loguru import logger
 from mineru.cli.common import do_parse, read_fn
 from mineru.cli.fast_api import get_infer_result
 from pypdf import PdfReader
 class MineruFormatter(Mapper):
@@ -19,21 +23,40 @@ class MineruFormatter(Mapper):
    def __init__(self, *args, **kwargs):
+        # Forward all constructor arguments unchanged to the Mapper base class.
        super(MineruFormatter, self).__init__(*args, **kwargs)
-        self.base_url = "http://datamate-mineru:9001"
+        # Passed to do_parse() as server_url in execute(); port 8000 matches the
+        # --port value given to mineru-openai-server in the deployment manifests.
+        self.server_url = "http://datamate-mineru:8000"
-        self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"
+        # Passed to do_parse() as backend in execute().
+        self.backend = "vlm-http-client"
+        # Passed to do_parse() as output_dir; execute() reads results from
+        # <output_dir>/<file name without extension>/vlm.
        self.output_dir = "/dataset/outputs"
    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract Markdown text from a PDF or image sample through MinerU.
+
+        Samples whose file name does not end with a supported extension are
+        returned untouched. Otherwise the document is parsed in batches of 10
+        pages and the concatenated Markdown is stored under ``self.text_key``.
+
+        Args:
+            sample: Record holding the file name (``self.filename_key``) and
+                the file path (``self.filepath_key``).
+
+        Returns:
+            The same ``sample`` dict, with ``self.text_key`` set when the file
+            was processed.
+
+        Raises:
+            Exception: Any error raised while reading or parsing is logged and
+                re-raised unchanged.
+        """
+        # Function-scope import so this method does not depend on the module's
+        # import block providing io.
+        from io import BytesIO
        start = time.time()
        filename = sample[self.filename_key]
-        if not filename.lower().endswith(".pdf"):
+        filename_without_ext = os.path.splitext(filename)[0]
        if not filename.lower().endswith((".png", ".jpeg", ".jpg", ".webp", ".gif", ".pdf")):
            return sample
        try:
-            data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]}
+            filepath = sample[self.filepath_key]
-            response = http_request(method="POST", url=self.pdf_extract_url, data=data)
+            parse_dir = os.path.join(self.output_dir, filename_without_ext, "vlm")
-            sample[self.text_key] = json.loads(response.text).get("result")
+            pdf_bytes = read_fn(filepath)
+            # Count pages from the bytes handed to do_parse rather than from the
+            # file on disk: image inputs pass the extension filter above, and
+            # PdfReader cannot open a raw image file.
+            # NOTE(review): relies on read_fn returning PDF bytes for image
+            # inputs as well - confirm against mineru.cli.common.read_fn.
+            total_page = len(PdfReader(BytesIO(pdf_bytes)).pages)
+            parts = []
            for page in range(0, total_page, 10):
                do_parse(
                    output_dir=self.output_dir,
                    pdf_file_names=[filename_without_ext],
                    pdf_bytes_list=[pdf_bytes],
                    p_lang_list=["ch"],
                    backend=self.backend,
                    server_url=self.server_url,
                    start_page_id=page,
                    end_page_id=min(page + 9, total_page - 1),
                )
                if os.path.exists(parse_dir):
+                    # NOTE(review): get_infer_result presumably returns None when
+                    # the Markdown file is missing; skip empty results instead
+                    # of concatenating None onto a str.
+                    chunk = get_infer_result(".md", filename_without_ext, parse_dir)
+                    if chunk:
+                        parts.append(chunk)
                    # Remove the batch output so the next batch starts clean.
                    shutil.rmtree(parse_dir)
+            sample[self.text_key] = "".join(parts)
            logger.info(
                f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s")
-        except UnicodeDecodeError as err:
+        except Exception as e:
-            logger.exception(f"fileName: {filename}, method: MineruFormatter causes decode error: {err}")
+            logger.exception(f"fileName: {filename}, method: MineruFormatter causes error: {e}")
            raise
        return sample
@@ -11,7 +11,9 @@ dependencies = [
    "emoji>=2.15.0",
    "jieba>=0.42.1",
    "loguru>=0.7.3",
-    "numpy==1.23.3",
+    "mineru>=2.6.5",
    "numpy==1.24.3",
    "python-multipart>=0.0.20",
    "opencv-contrib-python-headless==4.7.0.72",
    "opencv-python-headless==4.7.0.72",
    "openslide-python>=1.4.3",
@@ -1,6 +1,8 @@
 FROM maven:3-eclipse-temurin-8 AS datax-builder
-RUN apt-get update && \
+RUN sed -i "s@http://.*archive.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
    sed -i "s@http://.*security.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y git && \
    git clone https://github.com/alibaba/DataX.git
@@ -21,7 +23,9 @@ RUN cd /opt/backend && \
 FROM eclipse-temurin:21-jdk
-RUN apt-get update && \
+RUN sed -i "s@http://.*archive.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
    sed -i "s@http://.*security.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y vim wget curl nfs-common rsync python3 python3-pip python-is-python3 dos2unix && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@@ -0,0 +1,32 @@
 # The base image provides either vLLM or LMDeploy; choose one as needed. Requires ARM(AArch64) CPU + Ascend NPU.
 # Base image containing the vLLM inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
 FROM quay.io/ascend/vllm-ascend:v0.11.0rc2
 # Base image containing the LMDeploy inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
 # FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:mineru-a2
 # Install libgl for opencv support & Noto fonts for Chinese characters
 RUN apt-get update && \
    apt-get install -y \
        fonts-noto-core \
        fonts-noto-cjk \
        fontconfig \
        libgl1 \
        libglib2.0-0 && \
    fc-cache -fv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Upgrade pip, then install mineru[core] (>=2.6.5) with pinned numpy and
 # opencv-python versions from the Aliyun PyPI mirror
 RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \
    python3 -m pip install 'mineru[core]>=2.6.5' \
                            numpy==1.26.4 \
                            opencv-python==4.11.0.86 \
                            -i https://mirrors.aliyun.com/pypi/simple && \
    python3 -m pip cache purge
 # Download models and update the configuration file
 RUN TORCH_DEVICE_BACKEND_AUTOLOAD=0 /bin/bash -c "mineru-models-download -s modelscope -m all"
 # Entry point: export MINERU_MODEL_SOURCE=local, then exec the command and
 # arguments passed to the container
 ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]