[Feature] Refactor project to use 'datamate' naming convention for services and configurations (#14)

* Enhance CleaningTaskService to track cleaning process progress and update ExecutorType to DATAMATE

* Refactor project to use 'datamate' naming convention for services and configurations
This commit is contained in:
hhhhsc701
2025-10-22 17:53:16 +08:00
committed by GitHub
parent 175d9ded93
commit 31ef8bc265
39 changed files with 312 additions and 737 deletions

View File

@@ -1,33 +0,0 @@
# Stage 1: build the DataX distribution from source with Maven on JDK 8.
FROM maven:3-openjdk-8-slim AS builder

# Switch apt to the Aliyun mirror, install git and clone upstream DataX.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y git && \
    git clone https://github.com/alibaba/DataX.git

# Overlay the repository's own DataX files on top of the upstream checkout.
COPY runtime/datax/ DataX/

# Replace the `com.mysql.jdbc.Driver` class name with `com.mysql.cj.jdbc.Driver`
# in DataBaseType.java, then package the distribution (tests skipped).
RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true

# Stage 2: runtime image carrying the JDK, Python 3 and the HTTP wrapper.
FROM openjdk:8-jdk-slim

# Install runtime tooling, then drop the apt caches to keep the layer small.
# The cleanup path previously read /var/lib/apy/lists/*, which never matched
# anything and left the package lists in the image.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y python3 python3-pip python-is-python3 vim wget curl nfs-common rsync && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Point pip at the Aliyun index and install the web stack used by app.py.
# The extras spec is quoted so the shell cannot treat `[standard]` as a glob.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
    pip config --user set global.trusted-host mirrors.aliyun.com && \
    pip install fastapi 'uvicorn[standard]' && \
    pip cache purge

# Bring in the built DataX tree and the FastAPI wrapper script.
COPY --from=builder /DataX/target/datax/datax /opt/datax
COPY scripts/images/datax/app.py /opt/datax/bin/app.py

View File

@@ -1,52 +0,0 @@
import subprocess
import tempfile
from fastapi import FastAPI
from pydantic import BaseModel
# FastAPI application instance; the route handler in this file registers on it.
app = FastAPI(title="datax")
# Request body accepted by the POST /process endpoint.
class CreateJobParam(BaseModel):
    # Text of the job definition; the handler writes it to a temporary
    # ``.json`` file and passes that file's path to datax.py.
    content: str
@app.post("/process", tags=["run datax.py"])
async def process(job: CreateJobParam):
    """Run ``datax.py`` on the submitted job definition and report the outcome.

    The job content is written to a temporary ``.json`` file and the path of
    that file is passed to ``/opt/datax/bin/datax.py``.

    Args:
        job: Request body; ``job.content`` is the text of the job file.

    Returns:
        A dict whose ``status`` is the script's exit code (``0``) on success.
        On failure ``status`` is ``500`` (``408`` for a timeout) and
        ``stderr`` carries a description of what went wrong.
    """
    output = {
        "status": "failed",
    }
    try:
        # Temporary file holding the job definition; it is removed
        # automatically when the ``with`` block exits.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=True) as f:
            f.write(job.content)
            # Flush buffered text so the child process, which is only given
            # the file's path, sees the complete content.
            f.flush()
            cmd_args = ["python3", "/opt/datax/bin/datax.py", f.name]
            # NOTE(review): this call blocks until datax.py exits, and it runs
            # inside an ``async def`` handler - confirm that is acceptable.
            result = subprocess.run(
                cmd_args,
                capture_output=True,
                text=True,
                check=True,
            )
            # check=True raises CalledProcessError on a non-zero exit code, so
            # reaching this line means the script succeeded.
            output["status"] = result.returncode
    except subprocess.TimeoutExpired as e:
        # No ``timeout`` is passed to subprocess.run above, so this branch only
        # becomes reachable if one is added.
        output["status"] = 408
        output["stderr"] = f"The script execution timed out: {e.stderr}"
    except subprocess.CalledProcessError as e:
        # Non-zero exit code from datax.py; the captured stdout is reported.
        output["status"] = 500
        output["stderr"] = f"Script execution failed: {e.stdout}"
    except Exception as e:
        # Anything else (e.g. failure to create the temp file or start python3).
        output["status"] = 500
        output["stderr"] = f"Server error: {str(e)}"
    return output
if __name__ == "__main__":
    # Executed directly as a script: serve the app with uvicorn on all
    # interfaces, port 8000.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )

View File

@@ -10,7 +10,7 @@ RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi && \
FROM nginx:1.29 AS runner
COPY --from=builder /app/dist /opt/frontend
COPY scripts/images/frontend/edm.conf /etc/nginx/conf.d/default.conf
COPY scripts/images/frontend/backend.conf /etc/nginx/conf.d/default.conf
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

View File

@@ -2,8 +2,13 @@ server {
listen 80;
server_name 0.0.0.0;
access_log /var/log/datamate/frontend/access.log main;
error_log /var/log/datamate/frontend/error.log notice;
client_max_body_size 1024M;
location /api/ {
proxy_pass http://backend:8080/api/;
proxy_pass http://datamate-backend:8080/api/;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

View File

@@ -1,22 +0,0 @@
# Image with MinerU 2.1.0, CPU builds of torch/torchvision and torch_npu,
# with the MinerU models downloaded at build time.
FROM python:3.10-slim

# System packages needed at runtime. `libgl1-mesa-glx` is deliberately not
# listed: it is only a transitional alias for `libgl1` and is no longer
# packaged in current Debian releases, where requesting it makes apt-get fail.
RUN apt-get update && \
    apt-get install -y curl vim libgl1 libglib2.0-0 procps && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Use the Aliyun PyPI mirror, then install MinerU and the pinned ML stack.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
    pip config --user set global.trusted-host mirrors.aliyun.com && \
    pip install --upgrade setuptools && \
    pip install -U 'mineru[core]==2.1.0' --break-system-packages && \
    pip install torch==2.7.1+cpu -f https://download.pytorch.org/whl/torch/ && \
    pip install torchvision==0.22.1+cpu -f https://download.pytorch.org/whl/torchvision && \
    pip install requests==2.27.1 torch_npu==2.7.1rc1 numpy==1.26.0 decorator==5.2.1 einops==0.8.1 attrs==25.3.0 && \
    pip cache purge

# NOTE(review): an empty CURL_CA_BUNDLE presumably turns off TLS certificate
# verification for the model download below - confirm this is intentional.
ENV CURL_CA_BUNDLE=""
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0

# Download the models while building (source: modelscope, selection: all),
# then switch the model source to `local` for the resulting image.
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
ENV MINERU_MODEL_SOURCE=local

View File

@@ -17,8 +17,6 @@ ENV HF_HUB_DISABLE_XET=1
RUN pip install -e . -i https://mirrors.huaweicloud.com/repository/pypi/simple \
&& pip install -r /opt/runtime/datamate/ops/requirements.txt -i https://mirrors.huaweicloud.com/repository/pypi/simple \
&& pip cache purge \
&& python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" \
&& python -c "from unstructured_inference.models.base import get_model; get_model()"
&& pip cache purge
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

View File

@@ -1,9 +0,0 @@
# Extends the upstream unstructured image with a small FastAPI wrapper.
FROM downloads.unstructured.io/unstructured-io/unstructured

# Configure pip for the Huawei Cloud mirror, install the web stack and purge
# the pip cache in the same layer.
RUN pip config --user set global.index https://mirrors.huaweicloud.com/repository/pypi && \
    pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple && \
    pip config --user set global.trusted-host mirrors.huaweicloud.com && \
    pip install fastapi uvicorn && \
    pip cache purge

# Ship the HTTP wrapper script.
COPY scripts/images/unstructured/app.py /app/app.py

View File

@@ -1,61 +0,0 @@
import asyncio
import os
from typing import Optional
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from unstructured.partition.auto import partition
# FastAPI application instance; the route handler in this file registers on it.
app = FastAPI(title="unstructured")
class FileProcessingRequest(BaseModel):
    """Request model for file processing."""

    # Path of the file to process. Optional in the schema; the endpoint
    # rejects a missing or empty value itself.
    file_path: Optional[str] = None
    # Other optional fields can be added here.
@app.post("/process", tags=["文件处理"])
async def process_file(request_data: FileProcessingRequest):
    """Process a file and return the extracted text content.

    Responds with the file path, the extracted text and a ``"success"``
    status. Raises ``HTTPException`` with 400 when no path is supplied, 404
    when the path does not exist, and 500 when processing fails.
    """
    try:
        file_path = request_data.file_path

        # NOTE(review): the message below names ``filePath`` while the request
        # model field is ``file_path`` - confirm which spelling clients use.
        if not file_path:
            raise HTTPException(status_code=400, detail="缺少必要参数: filePath")

        if not os.path.exists(file_path):
            raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")

        # The potentially slow extraction is awaited via the async helper.
        extracted = await process_file_async(file_path)

        response = {
            "filePath": file_path,
            "text": extracted,
            "status": "success",
        }
        return response
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        # Any other failure is reported to the client as a 500.
        raise HTTPException(status_code=500, detail=f"process failed: {str(exc)}")
async def process_file_async(file_path: str) -> str:
    """Extract the text of ``file_path`` without blocking the event loop.

    The synchronous ``partition_file_sync`` is run in the loop's default
    executor and its result is awaited.

    Args:
        file_path: Path of the file to process.

    Returns:
        The text produced by ``partition_file_sync``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # preferred accessor there and never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partition_file_sync, file_path)
def partition_file_sync(file_path: str) -> str:
    """Synchronously partition a file into text (called from the async helper).

    Each element returned by ``partition`` is converted with ``str`` and the
    pieces are joined with a blank line between them.
    """
    parts = partition(filename=file_path)
    separator = "\n\n"
    return separator.join(str(part) for part in parts)
if __name__ == "__main__":
    # Executed directly as a script: serve the app with uvicorn on all
    # interfaces, port 8000.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )