You've already forked DataMate
init datamate
This commit is contained in:
45
scripts/images/backend/Dockerfile
Normal file
45
scripts/images/backend/Dockerfile
Normal file
@@ -0,0 +1,45 @@
|
||||
# Build stage: compile the DataX distribution with Maven on JDK 8.
FROM maven:3-openjdk-8-slim AS datax-builder

# Point APT at the Aliyun mirror, install git and fetch the upstream DataX sources.
RUN sed -i \
        -e 's/deb.debian.org/mirrors.aliyun.com/g' \
        -e 's/security.debian.org/mirrors.aliyun.com/g' \
        /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y git \
    && git clone https://github.com/alibaba/DataX.git

# Overlay the project's own DataX files on top of the upstream checkout.
COPY runtime/datax/ DataX/

# Rewrite the MySQL driver class name in DataBaseType.java, then build the
# DataX assembly (tests skipped).
RUN cd DataX \
    && sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java \
    && mvn -U clean package assembly:assembly -Dmaven.test.skip=true
|
||||
|
||||
# Build stage: package the backend with Maven on JDK 21.
FROM maven:3-amazoncorretto-21-debian AS builder

# All following relative paths resolve inside the backend source tree.
WORKDIR /opt/backend

# Backend sources plus the Maven settings file used by the build below.
COPY backend/ ./
COPY scripts/images/backend/settings.xml ./

# Build with the copied settings.xml; tests are skipped.
RUN mvn -U clean package -s settings.xml -Dmaven.test.skip=true
|
||||
|
||||
|
||||
# Runtime stage: JDK 21 image carrying the backend jar and the DataX distribution.
FROM openjdk:21-jdk-slim

# Switch APT to the Aliyun mirror and install the runtime tooling.
# The package index is removed in the same layer so it does not end up in the
# image (the path must be /var/lib/apt/lists, not "apy").
# NOTE(review): nfs-common is presumably what provides the rpcbind binary that
# start.sh launches - confirm.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \
    apt-get update && \
    apt-get install -y vim wget curl nfs-common rsync python3 python3-pip python-is-python3 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Artifacts produced by the two build stages.
COPY --from=builder /opt/backend/services/main-application/target/data-mate.jar /opt/backend/data-mate.jar
COPY --from=datax-builder /DataX/target/datax/datax /opt/datax

# Community-edition configuration and the entrypoint wrapper.
COPY editions/community/config/application.yml /opt/backend/application.yml
COPY editions/community/config/log4j2.xml /opt/backend/log4j2.xml
COPY scripts/images/backend/start.sh /opt/backend/start.sh

# Make the entrypoint executable and set the container's local time zone.
RUN chmod +x /opt/backend/start.sh \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

# start.sh receives the CMD below as its arguments.
ENTRYPOINT ["/opt/backend/start.sh"]

CMD ["java", "-Duser.timezone=Asia/Shanghai", "-jar", "/opt/backend/data-mate.jar"]
|
||||
68
scripts/images/backend/settings.xml
Normal file
68
scripts/images/backend/settings.xml
Normal file
@@ -0,0 +1,68 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 https://maven.apache.org/xsd/settings-1.0.0.xsd">

  <!-- Local repository path (optional; defaults to ~/.m2/repository). -->
  <localRepository>${user.home}/.m2/repository</localRepository>

  <!-- Aliyun mirror configuration. -->
  <mirrors>
    <mirror>
      <id>aliyun-maven</id>
      <name>Aliyun Maven Repository</name>
      <url>https://maven.aliyun.com/repository/public</url>
      <mirrorOf>central,jcenter,google,spring,spring-plugin,gradle-plugin</mirrorOf>
    </mirror>
  </mirrors>

  <profiles>
    <!-- Java 21 compiler settings (optional, but recommended). -->
    <profile>
      <id>java21</id>
      <activation>
        <activeByDefault>true</activeByDefault>
        <jdk>21</jdk>
      </activation>
      <properties>
        <maven.compiler.source>21</maven.compiler.source>
        <maven.compiler.target>21</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
      </properties>
    </profile>

    <!-- Aliyun repositories (optional; improves dependency resolution). -->
    <profile>
      <id>aliyun-repos</id>
      <repositories>
        <repository>
          <id>aliyun-public</id>
          <name>Aliyun Public Repository</name>
          <url>https://maven.aliyun.com/repository/public</url>
          <releases>
            <enabled>true</enabled>
          </releases>
          <snapshots>
            <!-- Snapshot versions are disabled by default. -->
            <enabled>false</enabled>
          </snapshots>
        </repository>
      </repositories>
      <pluginRepositories>
        <pluginRepository>
          <id>aliyun-plugin</id>
          <name>Aliyun Plugin Repository</name>
          <url>https://maven.aliyun.com/repository/public</url>
          <releases>
            <enabled>true</enabled>
          </releases>
          <snapshots>
            <enabled>false</enabled>
          </snapshots>
        </pluginRepository>
      </pluginRepositories>
    </profile>
  </profiles>

  <activeProfiles>
    <!-- Activate the Aliyun repositories. -->
    <activeProfile>aliyun-repos</activeProfile>
    <!-- Activate the Java 21 settings. -->
    <activeProfile>java21</activeProfile>
  </activeProfiles>
</settings>
|
||||
8
scripts/images/backend/start.sh
Normal file
8
scripts/images/backend/start.sh
Normal file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
# Container entrypoint: start rpcbind, then hand control to the command that
# was passed as arguments.

# Stop immediately if any command below fails.
set -o errexit

rpcbind

printf '%s\n' "Starting main application..."

# Replace this shell with the requested command.
exec "$@"
|
||||
33
scripts/images/datax/Dockerfile
Normal file
33
scripts/images/datax/Dockerfile
Normal file
@@ -0,0 +1,33 @@
|
||||
# Build stage: compile the DataX distribution with Maven on JDK 8.
FROM maven:3-openjdk-8-slim AS builder

# Point APT at the Aliyun mirror, install git and fetch the upstream DataX sources.
RUN sed -i \
        -e 's/deb.debian.org/mirrors.aliyun.com/g' \
        -e 's/security.debian.org/mirrors.aliyun.com/g' \
        /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y git \
    && git clone https://github.com/alibaba/DataX.git

# Overlay the project's own DataX files on top of the upstream checkout.
COPY runtime/datax/ DataX/

# Rewrite the MySQL driver class name in DataBaseType.java, then build the
# DataX assembly (tests skipped).
RUN cd DataX \
    && sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java \
    && mvn -U clean package assembly:assembly -Dmaven.test.skip=true
|
||||
|
||||
|
||||
# Runtime stage: JDK 8 image with Python 3, the built DataX distribution and
# the FastAPI wrapper (app.py).
FROM openjdk:8-jdk-slim

# Switch APT to the Aliyun mirror and install Python plus utility packages.
# The package index is removed in the same layer so it does not end up in the
# image (the path must be /var/lib/apt/lists, not "apy").
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y python3 python3-pip python-is-python3 vim wget curl nfs-common rsync && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the web framework from the Aliyun PyPI mirror.
# The extras specifier is quoted so the shell cannot treat [standard] as a glob.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
    pip config --user set global.trusted-host mirrors.aliyun.com && \
    pip install fastapi 'uvicorn[standard]' && \
    pip cache purge

# DataX distribution produced by the builder stage.
COPY --from=builder /DataX/target/datax/datax /opt/datax

# HTTP wrapper placed next to datax.py.
COPY scripts/images/datax/app.py /opt/datax/bin/app.py
|
||||
52
scripts/images/datax/app.py
Normal file
52
scripts/images/datax/app.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from fastapi import FastAPI
|
||||
from pydantic import BaseModel
|
||||
|
||||
# ASGI application object; the route handlers in this module are registered on it.
app = FastAPI(title="datax")


class CreateJobParam(BaseModel):
    # Request body for POST /process.
    # `content` is the text written to the temporary .json file that is handed
    # to datax.py.
    content: str
|
||||
|
||||
|
||||
def _run_datax_job(content: str) -> subprocess.CompletedProcess:
    """Write the job text to a temporary .json file and run datax.py on it (blocking).

    Raises:
        subprocess.CalledProcessError: datax.py exited with a non-zero status.
    """
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=True) as f:
        f.write(content)
        # Flush buffered text so the child process reads the complete file.
        f.flush()

        return subprocess.run(
            ["python3", "/opt/datax/bin/datax.py", f.name],
            capture_output=True,
            text=True,
            check=True,
        )


@app.post("/process", tags=["run datax.py"])
async def process(job: CreateJobParam):
    """Run a DataX job described by ``job.content`` and report the outcome.

    Returns:
        A dict with a ``status`` key: the process return code (0) on success,
        500 on a failed run or server error, 408 on timeout. Error responses
        also carry a ``stderr`` message; a failed run additionally carries the
        raw ``stdout`` of datax.py.
    """
    import asyncio

    output = {
        "status": "failed",
    }
    try:
        # subprocess.run blocks, so it runs in the default executor to keep the
        # event loop free to serve other requests while the job is running.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, _run_datax_job, job.content)

        # check=True guarantees a zero return code when we get here.
        output["status"] = result.returncode
    except subprocess.TimeoutExpired as e:
        # Only reachable if a timeout is ever passed to subprocess.run.
        output["status"] = 408
        output["stderr"] = f"The script execution timed out: {e.stderr}"
    except subprocess.CalledProcessError as e:
        output["status"] = 500
        output["stdout"] = e.stdout
        # Keep the captured stdout in the message (as before) and append the
        # captured stderr, which used to be dropped.
        detail = e.stdout or ""
        if e.stderr:
            detail = f"{detail}\n{e.stderr}" if detail else e.stderr
        output["stderr"] = f"Script execution failed: {detail}"
    except Exception as e:
        output["status"] = 500
        output["stderr"] = f"Server error: {str(e)}"
    return output
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Serve the API on all interfaces, port 8000, when run as a script.
    import uvicorn

    uvicorn.run(app, port=8000, host="0.0.0.0")
|
||||
17
scripts/images/frontend/Dockerfile
Normal file
17
scripts/images/frontend/Dockerfile
Normal file
@@ -0,0 +1,17 @@
|
||||
# Build stage: install dependencies and build the frontend bundle.
FROM node:18-alpine AS builder

WORKDIR /app

COPY frontend ./

# Use the lockfile-based install when package-lock.json is present,
# otherwise fall back to a plain install; then build.
RUN if [ -f package-lock.json ]; then \
        npm ci; \
    else \
        npm install; \
    fi \
    && npm run build
|
||||
|
||||
# Runtime stage: nginx serving the built bundle.
FROM nginx:1.29 AS runner

# Set the container's local time zone.
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

# Site configuration replaces the image's default server block.
COPY scripts/images/frontend/edm.conf /etc/nginx/conf.d/default.conf

# Static files produced by the builder stage.
COPY --from=builder /app/dist /opt/frontend

# Keep nginx in the foreground so it stays the container's main process.
CMD ["nginx", "-g", "daemon off;"]
|
||||
16
scripts/images/frontend/edm.conf
Normal file
16
scripts/images/frontend/edm.conf
Normal file
@@ -0,0 +1,16 @@
|
||||
# Single virtual host: static frontend plus a reverse proxy for /api/.
server {
    listen      80;
    server_name 0.0.0.0;

    # Static files; paths that match no file or directory fall back to /index.html.
    location / {
        root      /opt/frontend;
        try_files $uri $uri/ /index.html;
    }

    # Requests under /api/ are forwarded to the "backend" host on port 8080,
    # passing the original Host header and client address along.
    location /api/ {
        proxy_set_header Host            $host;
        proxy_set_header X-Real-IP       $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_pass       http://backend:8080/api/;
    }
}
|
||||
22
scripts/images/mineru/Dockerfile
Normal file
22
scripts/images/mineru/Dockerfile
Normal file
@@ -0,0 +1,22 @@
|
||||
# MinerU image: Python 3.10 with MinerU, CPU builds of torch/torchvision and
# the model files downloaded at build time.
FROM python:3.10-slim

# System libraries and utility packages.
RUN apt-get update \
    && apt-get install -y \
        curl \
        vim \
        libgl1 \
        libgl1-mesa-glx \
        libglib2.0-0 \
        procps \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Python packages, installed through the Aliyun PyPI mirror; the torch wheels
# are looked up on download.pytorch.org via -f.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple \
    && pip config --user set global.trusted-host mirrors.aliyun.com \
    && pip install --upgrade setuptools \
    && pip install -U 'mineru[core]==2.1.0' --break-system-packages \
    && pip install torch==2.7.1+cpu -f https://download.pytorch.org/whl/torch/ \
    && pip install torchvision==0.22.1+cpu -f https://download.pytorch.org/whl/torchvision \
    && pip install \
        requests==2.27.1 \
        torch_npu==2.7.1rc1 \
        numpy==1.26.0 \
        decorator==5.2.1 \
        einops==0.8.1 \
        attrs==25.3.0 \
    && pip cache purge

# Both variables are in effect for the model download below and at runtime.
ENV CURL_CA_BUNDLE="" \
    TORCH_DEVICE_BACKEND_AUTOLOAD=0

# Download all models from ModelScope into the image.
RUN ["/bin/bash", "-c", "mineru-models-download -s modelscope -m all"]

# Set only after the download step above.
ENV MINERU_MODEL_SOURCE=local
|
||||
24
scripts/images/runtime/Dockerfile
Normal file
24
scripts/images/runtime/Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
||||
# Runtime image for the Python executor and its operators.
FROM python:3.11

# OS packages are installed before any project files are copied, so editing
# the sources does not invalidate this layer. apt-get is used instead of apt
# because apt's command-line interface is not meant for scripts.
RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
    && apt-get update \
    && apt-get install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Executor package, with the operators placed under its datamate/ directory.
COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops

ENV PYTHONPATH=/opt/runtime/datamate/

WORKDIR /opt/runtime

ENV HF_HUB_DISABLE_XET=1

# Install the executor (editable) and the operators' requirements from the
# Huawei Cloud PyPI mirror, then run the two download helpers at build time.
RUN pip install -e . -i https://mirrors.huaweicloud.com/repository/pypi/simple \
    && pip install -r /opt/runtime/datamate/ops/requirements.txt -i https://mirrors.huaweicloud.com/repository/pypi/simple \
    && pip cache purge \
    && python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" \
    && python -c "from unstructured_inference.models.base import get_model; get_model()"

# Set the container's local time zone.
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
|
||||
9
scripts/images/unstructured/Dockerfile
Normal file
9
scripts/images/unstructured/Dockerfile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Unstructured base image extended with a FastAPI wrapper (app.py).
FROM downloads.unstructured.io/unstructured-io/unstructured

# Point pip at the Huawei Cloud mirror and install the web framework.
RUN pip config --user set global.index https://mirrors.huaweicloud.com/repository/pypi \
    && pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple \
    && pip config --user set global.trusted-host mirrors.huaweicloud.com \
    && pip install fastapi uvicorn \
    && pip cache purge

COPY scripts/images/unstructured/app.py /app/app.py
|
||||
61
scripts/images/unstructured/app.py
Normal file
61
scripts/images/unstructured/app.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
# ASGI application object; the route handlers in this module are registered on it.
app = FastAPI(title="unstructured")


class FileProcessingRequest(BaseModel):
    """Request model for file processing."""

    # Path of the file to process; optional at the model level.
    file_path: Optional[str] = None
    # Further optional fields can be added here.
|
||||
|
||||
|
||||
@app.post("/process", tags=["文件处理"])
async def process_file(request_data: FileProcessingRequest):
    """Process a file and return the text extracted from it.

    Raises:
        HTTPException: 400 when no path is given, 404 when the path does not
            exist, 500 when processing fails for any other reason.
    """
    try:
        file_path = request_data.file_path

        # Guard clauses: reject missing or non-existent paths up front.
        if not file_path:
            raise HTTPException(status_code=400, detail="缺少必要参数: filePath")
        if not os.path.exists(file_path):
            raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")

        # The potentially slow extraction is awaited via the async helper.
        text_content = await process_file_async(file_path)
    except HTTPException:
        # Pass our own HTTP errors through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"process failed: {str(e)}")

    return {
        "filePath": file_path,
        "text": text_content,
        "status": "success",
    }
|
||||
|
||||
|
||||
async def process_file_async(file_path: str) -> str:
    """Run the blocking partition step in the default executor and await its result."""
    running_loop = asyncio.get_running_loop()
    pending = running_loop.run_in_executor(None, partition_file_sync, file_path)
    return await pending
|
||||
|
||||
|
||||
def partition_file_sync(file_path: str) -> str:
    """Partition the file synchronously (called from the async helper).

    Returns the string form of every element, joined by blank lines.
    """
    parts = partition(filename=file_path)
    return "\n\n".join(str(part) for part in parts)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Serve the API on all interfaces, port 8000, when run as a script.
    import uvicorn

    uvicorn.run(app, port=8000, host="0.0.0.0")
|
||||
Reference in New Issue
Block a user