You've already forked DataMate
* feature: add UnstructuredFormatter * feature: add UnstructuredFormatter in db * feature: add unstructured[docx]==0.18.15 * feature: support doc * feature: add mineru * feature: add external pdf extract operator by using mineru * feature: mineru docker install bugfix --------- Co-authored-by: Startalker <438747480@qq.com>
113 lines
3.4 KiB
Python
113 lines
3.4 KiB
Python
import shutil
|
|
import time
|
|
import uuid
|
|
import os
|
|
|
|
import click
|
|
import uvicorn
|
|
from pydantic import BaseModel
|
|
from pathlib import Path
|
|
from fastapi import FastAPI
|
|
from fastapi.responses import JSONResponse
|
|
from loguru import logger
|
|
from mineru.cli.common import aio_do_parse, read_fn
|
|
from mineru.cli.fast_api import get_infer_result
|
|
|
|
# Logging configuration: in addition to loguru's default handlers, write
# every record at DEBUG level or above to a file under LOG_DIR.
LOG_DIR = "/var/log/datamate/mineru"

# The log directory must exist before the file sink is registered.
os.makedirs(LOG_DIR, exist_ok=True)

logger.add(
    f"{LOG_DIR}/mineru.log",
    level="DEBUG",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} - {message}",
    # Records are handed to the sink through loguru's internal queue.
    enqueue=True,
)

# ASGI application object; the route handlers below register themselves on it.
app = FastAPI()
|
|
# Request body accepted by the PDF extraction endpoint.
class PDFParseRequest(BaseModel):
    # Path of the document to parse, as seen from this service's filesystem.
    source_path: str

    # Directory under which a per-request working directory is created.
    export_path: str
|
|
|
|
def _remove_work_dir(work_dir, job_id):
    """Best-effort removal of a per-request working directory.

    Failures are logged rather than raised so that cleanup problems never
    mask the response (or the original error) of the request itself.
    """
    if not os.path.exists(work_dir):
        return
    try:
        shutil.rmtree(work_dir)
    except Exception as e:
        logger.error(f"Failed to remove unique dir for {job_id}: {str(e)}")


@app.post(path="/api/pdf-extract")
async def parse_pdf(request: PDFParseRequest):
    """Parse a PDF with MinerU and return the extracted Markdown.

    Args:
        request: Carries ``source_path`` (the PDF to read) and
            ``export_path`` (parent of the temporary working directory).

    Returns:
        JSONResponse:
            * 200 ``{"result": <markdown>}`` on success (empty string when
              no parse output directory was produced),
            * 400 ``{"error": ...}`` when the file is not a ``.pdf`` or
              cannot be loaded,
            * 500 ``{"error": ...}`` for any other failure.

    The working directory is always removed before returning, including on
    failures.
    """
    start = time.time()
    file_path = Path(request.source_path)
    file_suffix = file_path.suffix.lower()

    # Validate the input before touching the filesystem so that rejected
    # requests do not leave an empty working directory behind.
    if file_suffix != ".pdf":
        return JSONResponse(
            status_code=400,
            content={"error": f"Unsupported file type: {file_suffix}"}
        )

    # PDFs are loaded through MinerU's read_fn helper.
    try:
        pdf_bytes = read_fn(file_path)
    except Exception as e:
        return JSONResponse(
            status_code=400,
            content={"error": f"Failed to load file: {str(e)}"}
        )

    pdf_name = file_path.stem

    # Unique output directory so concurrent requests never share files.
    unique_id = str(uuid.uuid4())
    unique_dir = os.path.join(request.export_path, unique_id)

    try:
        os.makedirs(unique_dir, exist_ok=True)

        # Run the asynchronous MinerU parsing routine.
        await aio_do_parse(
            output_dir=unique_dir,
            pdf_file_names=[pdf_name],
            pdf_bytes_list=[pdf_bytes],
            p_lang_list=["ch"],
            f_draw_layout_bbox=False,
            f_draw_span_bbox=False,
            f_dump_orig_pdf=False,
        )

        # An unset variable used to crash with AttributeError on None.
        # NOTE(review): "pipeline" is chosen as the fallback because no
        # explicit backend is passed to aio_do_parse above - confirm that
        # this matches the deployed MinerU default.
        backend_mode = os.getenv("MINERU_BACKEND_MODE", "pipeline")
        result_subdir = "auto" if backend_mode.startswith("pipeline") else "vlm"
        parse_dir = os.path.join(unique_dir, pdf_name, result_subdir)

        content = ""
        if os.path.exists(parse_dir):
            content = get_infer_result(".md", pdf_name, parse_dir)

        logger.info(f"fileName: {file_path.name} costs {time.time() - start:.6f} s")

        return JSONResponse(status_code=200, content={"result": content})
    except Exception as e:
        logger.exception(e)
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to process file: {str(e)}"}
        )
    finally:
        # Runs on success and on failure alike, so the temporary output
        # never accumulates under export_path.
        _remove_work_dir(unique_dir, unique_id)
|
|
|
|
|
|
# The --port help text previously advertised 8082 although the actual default
# is 9001; the help string now matches the real default.
@click.command()
@click.option('--ip', default='0.0.0.0', help='Service ip for this API, default to use 0.0.0.0.')
@click.option('--port', default=9001, type=int, help='Service port for this API, default to use 9001.')
def main(ip, port):
    """Create API for Submitting Job to MinerU"""
    # The docstring above doubles as the command's --help text, so it is
    # kept short and unchanged.
    logger.info(f"Start MinerU FastAPI Service: http://{ip}:{port}")
    uvicorn.run(
        app,
        host=ip,
        port=port,
    )


# Start the service only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|