feature: add external pdf extract operator by using mineru (#36)

* feature: add UnstructuredFormatter

* feature: add UnstructuredFormatter in db

* feature: add unstructured[docx]==0.18.15

* feature: support doc

* feature: add mineru

* feature: add external pdf extract operator by using mineru

* feature: mineru docker install bugfix

---------

Co-authored-by: Startalker <438747480@qq.com>
This commit is contained in:
Startalker
2025-10-30 15:55:10 +08:00
committed by GitHub
parent 2f7341dc1f
commit 155603b1ca
12 changed files with 370 additions and 3 deletions

View File

@@ -0,0 +1,112 @@
import shutil
import time
import uuid
import os
import click
import uvicorn
from pydantic import BaseModel
from pathlib import Path
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from loguru import logger
from mineru.cli.common import aio_do_parse, read_fn
from mineru.cli.fast_api import get_infer_result
# 日志配置
LOG_DIR = "/var/log/datamate/mineru"
os.makedirs(LOG_DIR, exist_ok=True)
logger.add(
f"{LOG_DIR}/mineru.log",
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}",
level="DEBUG",
enqueue=True
)
app = FastAPI()
class PDFParseRequest(BaseModel):
source_path: str
export_path: str
@app.post(path="/api/pdf-extract")
async def parse_pdf(request: PDFParseRequest):
try:
start = time.time()
# 创建唯一的输出目录
unique_id = str(uuid.uuid4())
unique_dir = os.path.join(request.export_path, unique_id)
os.makedirs(unique_dir, exist_ok=True)
# 如果是PDF,使用read_fn处理
file_path = Path(request.source_path)
file_suffix = file_path.suffix.lower()
if file_suffix == ".pdf":
try:
pdf_bytes = read_fn(file_path)
pdf_name = file_path.stem
pdf_bytes_list = [pdf_bytes]
pdf_file_names = [pdf_name]
except Exception as e:
return JSONResponse(
status_code=400,
content={"error": f"Failed to load file: {str(e)}"}
)
else:
return JSONResponse(
status_code=400,
content={"error": f"Unsupported file type: {file_suffix}"}
)
# 调用异步处理函数
await aio_do_parse(
output_dir=unique_dir,
pdf_file_names=pdf_file_names,
pdf_bytes_list=pdf_bytes_list,
p_lang_list=["ch"],
f_draw_layout_bbox=False,
f_draw_span_bbox=False,
f_dump_orig_pdf=False,
)
if os.getenv("MINERU_BACKEND_MODE").startswith("pipeline"):
parse_dir = os.path.join(unique_dir, pdf_name, "auto")
else:
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
content = ""
if os.path.exists(parse_dir):
content = get_infer_result(".md", pdf_name, parse_dir)
if os.path.exists(unique_dir):
try:
shutil.rmtree(unique_dir)
except Exception as e:
logger.error(f"Failed to remove unique dir for {unique_id}: {str(e)}")
logger.info(f"fileName: {file_path.name} costs {time.time() - start:.6f} s")
return JSONResponse(status_code=200, content={"result": content})
except Exception as e:
logger.exception(e)
return JSONResponse(
status_code=500,
content={"error": f"Failed to process file: {str(e)}"}
)
@click.command()
@click.option('--ip', default='0.0.0.0', help='Service ip for this API, default to use 0.0.0.0.')
@click.option('--port', default=9001, type=int, help='Service port for this API, default to use 8082.')
def main(ip, port):
"""Create API for Submitting Job to MinerU"""
logger.info(f"Start MinerU FastAPI Service: http://{ip}:{port}")
uvicorn.run(
app,
host=ip,
port=port
)
if __name__ == "__main__":
main()

View File

@@ -21,6 +21,7 @@ def _import_operators():
from . import file_exporter
from . import slide_formatter
from . import unstructured_formatter
from . import external_pdf_formatter
_import_operators()

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ExternalPDFFormatter',
module_path="ops.formatter.external_pdf_formatter.process")

View File

@@ -0,0 +1,16 @@
name: '外部PDF文本抽取'
name_en: 'External PDF Text Extraction'
description: '基于外部API,抽取PDF中的文本。'
description_en: 'Extracts text from PDF files based on external APIs.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExternalPDFFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,38 @@
#!/user/bin/python
# -*- coding: utf-8 -*-
"""
Description: 外部PDF文本抽取
Create: 2025/10/29 17:24
"""
import json
import os
import time
from loguru import logger
from typing import Dict, Any
from datamate.core.base_op import Mapper
from datamate.common.utils.rest_client import http_request
class ExternalPDFFormatter(Mapper):
"""基于外部API,抽取PDF中的文本"""
def __init__(self, *args, **kwargs):
super(ExternalPDFFormatter, self).__init__(*args, **kwargs)
self.base_url = os.getenv("EXTERNAL_PDF_BASE_URL", "http://datamate-mineru:9001")
self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
filename = sample[self.filename_key]
try:
data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]}
response = http_request(method="POST", url=self.pdf_extract_url, data=data)
sample[self.text_key] = json.loads(response.text).get("result")
logger.info(
f"fileName: {filename}, method: ExternalPDFFormatter costs {(time.time() - start):6f} s")
except UnicodeDecodeError as err:
logger.exception(f"fileName: {filename}, method: ExternalPDFFormatter causes decode error: {err}")
raise
return sample

View File

@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
import requests
from typing import Optional, Dict, Any
from starlette.responses import JSONResponse
def http_request(method: str, url: str, data: Optional[Dict[str, Any]] = None):
"""
通用HTTP请求方法
Args:
method: 请求方法 ('GET', 'POST', 'PUT', 'DELETE', 'PATCH')
url: 请求URL
data: 请求数据(JSON格式)
Returns: 格式化响应体
Raises:
requests.exceptions.RequestException: 请求异常
ValueError: 响应状态码错误
"""
success_codes = [200, 201, 202, 204]
# 设置默认请求头
headers = {"Content-Type": "application/json"}
method = method.upper()
try:
# 准备请求参数
request_kwargs = {"url": url, "headers": headers}
# 根据请求方法添加数据
if data is not None:
if method in ["POST", "PUT", "DELETE"]:
request_kwargs["json"] = data
elif method == "GET":
request_kwargs["params"] = data
# 发送请求
response = requests.request(method, **request_kwargs)
# 检查响应状态码
if response.status_code not in success_codes:
raise ValueError(
f"Request failed! Status code: {response.status_code}, "
f"content: {response.text}, "
f"URL: {url}"
)
return response
except requests.exceptions.RequestException as e:
raise requests.exceptions.RequestException(f"Request exception: {str(e)}")