You've already forked DataMate
feature: add external pdf extract operator by using mineru (#36)
* feature: add UnstructuredFormatter * feature: add UnstructuredFormatter in db * feature: add unstructured[docx]==0.18.15 * feature: support doc * feature: add mineru * feature: add external pdf extract operator by using mineru * feature: mineru docker install bugfix --------- Co-authored-by: Startalker <438747480@qq.com>
This commit is contained in:
112
runtime/mineru/mineru_api.py
Normal file
112
runtime/mineru/mineru_api.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import shutil
|
||||
import time
|
||||
import uuid
|
||||
import os
|
||||
|
||||
import click
|
||||
import uvicorn
|
||||
from pydantic import BaseModel
|
||||
from pathlib import Path
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import JSONResponse
|
||||
from loguru import logger
|
||||
from mineru.cli.common import aio_do_parse, read_fn
|
||||
from mineru.cli.fast_api import get_infer_result
|
||||
|
||||
# 日志配置
|
||||
LOG_DIR = "/var/log/datamate/mineru"
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
logger.add(
|
||||
f"{LOG_DIR}/mineru.log",
|
||||
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}",
|
||||
level="DEBUG",
|
||||
enqueue=True
|
||||
)
|
||||
|
||||
app = FastAPI()
|
||||
class PDFParseRequest(BaseModel):
|
||||
source_path: str
|
||||
export_path: str
|
||||
|
||||
@app.post(path="/api/pdf-extract")
|
||||
async def parse_pdf(request: PDFParseRequest):
|
||||
try:
|
||||
start = time.time()
|
||||
# 创建唯一的输出目录
|
||||
unique_id = str(uuid.uuid4())
|
||||
unique_dir = os.path.join(request.export_path, unique_id)
|
||||
os.makedirs(unique_dir, exist_ok=True)
|
||||
|
||||
# 如果是PDF,使用read_fn处理
|
||||
file_path = Path(request.source_path)
|
||||
file_suffix = file_path.suffix.lower()
|
||||
if file_suffix == ".pdf":
|
||||
try:
|
||||
pdf_bytes = read_fn(file_path)
|
||||
pdf_name = file_path.stem
|
||||
pdf_bytes_list = [pdf_bytes]
|
||||
pdf_file_names = [pdf_name]
|
||||
except Exception as e:
|
||||
return JSONResponse(
|
||||
status_code=400,
|
||||
content={"error": f"Failed to load file: {str(e)}"}
|
||||
)
|
||||
else:
|
||||
return JSONResponse(
|
||||
status_code=400,
|
||||
content={"error": f"Unsupported file type: {file_suffix}"}
|
||||
)
|
||||
|
||||
# 调用异步处理函数
|
||||
await aio_do_parse(
|
||||
output_dir=unique_dir,
|
||||
pdf_file_names=pdf_file_names,
|
||||
pdf_bytes_list=pdf_bytes_list,
|
||||
p_lang_list=["ch"],
|
||||
f_draw_layout_bbox=False,
|
||||
f_draw_span_bbox=False,
|
||||
f_dump_orig_pdf=False,
|
||||
)
|
||||
|
||||
if os.getenv("MINERU_BACKEND_MODE").startswith("pipeline"):
|
||||
parse_dir = os.path.join(unique_dir, pdf_name, "auto")
|
||||
else:
|
||||
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
||||
|
||||
content = ""
|
||||
if os.path.exists(parse_dir):
|
||||
content = get_infer_result(".md", pdf_name, parse_dir)
|
||||
|
||||
if os.path.exists(unique_dir):
|
||||
try:
|
||||
shutil.rmtree(unique_dir)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to remove unique dir for {unique_id}: {str(e)}")
|
||||
|
||||
logger.info(f"fileName: {file_path.name} costs {time.time() - start:.6f} s")
|
||||
|
||||
return JSONResponse(status_code=200, content={"result": content})
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
return JSONResponse(
|
||||
status_code=500,
|
||||
content={"error": f"Failed to process file: {str(e)}"}
|
||||
)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--ip', default='0.0.0.0', help='Service ip for this API, default to use 0.0.0.0.')
|
||||
@click.option('--port', default=9001, type=int, help='Service port for this API, default to use 8082.')
|
||||
def main(ip, port):
|
||||
"""Create API for Submitting Job to MinerU"""
|
||||
logger.info(f"Start MinerU FastAPI Service: http://{ip}:{port}")
|
||||
uvicorn.run(
|
||||
app,
|
||||
host=ip,
|
||||
port=port
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -21,6 +21,7 @@ def _import_operators():
|
||||
from . import file_exporter
|
||||
from . import slide_formatter
|
||||
from . import unstructured_formatter
|
||||
from . import external_pdf_formatter
|
||||
|
||||
|
||||
_import_operators()
|
||||
|
||||
6
runtime/ops/formatter/external_pdf_formatter/__init__.py
Normal file
6
runtime/ops/formatter/external_pdf_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='ExternalPDFFormatter',
|
||||
module_path="ops.formatter.external_pdf_formatter.process")
|
||||
16
runtime/ops/formatter/external_pdf_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/external_pdf_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: '外部PDF文本抽取'
|
||||
name_en: 'External PDF Text Extraction'
|
||||
description: '基于外部API,抽取PDF中的文本。'
|
||||
description_en: 'Extracts text from PDF files based on external APIs.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'ExternalPDFFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
38
runtime/ops/formatter/external_pdf_formatter/process.py
Normal file
38
runtime/ops/formatter/external_pdf_formatter/process.py
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/user/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 外部PDF文本抽取
|
||||
Create: 2025/10/29 17:24
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
from datamate.common.utils.rest_client import http_request
|
||||
|
||||
|
||||
class ExternalPDFFormatter(Mapper):
|
||||
"""基于外部API,抽取PDF中的文本"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(ExternalPDFFormatter, self).__init__(*args, **kwargs)
|
||||
self.base_url = os.getenv("EXTERNAL_PDF_BASE_URL", "http://datamate-mineru:9001")
|
||||
self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
filename = sample[self.filename_key]
|
||||
try:
|
||||
data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]}
|
||||
response = http_request(method="POST", url=self.pdf_extract_url, data=data)
|
||||
sample[self.text_key] = json.loads(response.text).get("result")
|
||||
logger.info(
|
||||
f"fileName: {filename}, method: ExternalPDFFormatter costs {(time.time() - start):6f} s")
|
||||
except UnicodeDecodeError as err:
|
||||
logger.exception(f"fileName: {filename}, method: ExternalPDFFormatter causes decode error: {err}")
|
||||
raise
|
||||
return sample
|
||||
53
runtime/python-executor/datamate/common/utils/rest_client.py
Normal file
53
runtime/python-executor/datamate/common/utils/rest_client.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import requests
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from starlette.responses import JSONResponse
|
||||
|
||||
|
||||
def http_request(method: str, url: str, data: Optional[Dict[str, Any]] = None):
|
||||
"""
|
||||
通用HTTP请求方法
|
||||
|
||||
Args:
|
||||
method: 请求方法 ('GET', 'POST', 'PUT', 'DELETE', 'PATCH')
|
||||
url: 请求URL
|
||||
data: 请求数据(JSON格式)
|
||||
|
||||
Returns: 格式化响应体
|
||||
|
||||
Raises:
|
||||
requests.exceptions.RequestException: 请求异常
|
||||
ValueError: 响应状态码错误
|
||||
"""
|
||||
success_codes = [200, 201, 202, 204]
|
||||
|
||||
# 设置默认请求头
|
||||
headers = {"Content-Type": "application/json"}
|
||||
|
||||
method = method.upper()
|
||||
|
||||
try:
|
||||
# 准备请求参数
|
||||
request_kwargs = {"url": url, "headers": headers}
|
||||
|
||||
# 根据请求方法添加数据
|
||||
if data is not None:
|
||||
if method in ["POST", "PUT", "DELETE"]:
|
||||
request_kwargs["json"] = data
|
||||
elif method == "GET":
|
||||
request_kwargs["params"] = data
|
||||
|
||||
# 发送请求
|
||||
response = requests.request(method, **request_kwargs)
|
||||
|
||||
# 检查响应状态码
|
||||
if response.status_code not in success_codes:
|
||||
raise ValueError(
|
||||
f"Request failed! Status code: {response.status_code}, "
|
||||
f"content: {response.text}, "
|
||||
f"URL: {url}"
|
||||
)
|
||||
return response
|
||||
except requests.exceptions.RequestException as e:
|
||||
raise requests.exceptions.RequestException(f"Request exception: {str(e)}")
|
||||
Reference in New Issue
Block a user