You've already forked DataMate
[Feature] Refactor project to use 'datamate' naming convention for services and configurations (#14)
* Enhance CleaningTaskService to track cleaning process progress and update ExecutorType to DATAMATE * Refactor project to use 'datamate' naming convention for services and configurations
This commit is contained in:
@@ -22,6 +22,7 @@ CREATE TABLE IF NOT EXISTS t_clean_task
|
||||
dest_dataset_name varchar(64),
|
||||
before_size bigint,
|
||||
after_size bigint,
|
||||
file_count int,
|
||||
created_at timestamp default current_timestamp,
|
||||
started_at timestamp,
|
||||
finished_at timestamp,
|
||||
|
||||
@@ -111,113 +111,30 @@ VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0',
|
||||
|
||||
|
||||
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
||||
VALUES (3, 'TextFormatter'),
|
||||
(7, 'FileExporter'),
|
||||
(8, 'TextFormatter'),
|
||||
(8, 'FileExporter'),
|
||||
(3, 'FileWithShortOrLongLengthFilter'),
|
||||
(3, 'FileWithHighRepeatPhraseRateFilter'),
|
||||
(3, 'FileWithHighRepeatWordRateFilter'),
|
||||
(3, 'FileWithHighSpecialCharRateFilter'),
|
||||
(3, 'FileWithManySensitiveWordsFilter'),
|
||||
(3, 'DuplicateFilesFilter'),
|
||||
(3, 'DuplicateSentencesFilter'),
|
||||
(3, 'AnonymizedCreditCardNumber'),
|
||||
(3, 'AnonymizedIdNumber'),
|
||||
(3, 'AnonymizedIpAddress'),
|
||||
(3, 'AnonymizedPhoneNumber'),
|
||||
(3, 'AnonymizedUrlCleaner'),
|
||||
(3, 'HtmlTagCleaner'),
|
||||
(3, 'XMLTagCleaner'),
|
||||
(3, 'ContentCleaner'),
|
||||
(3, 'EmailNumberCleaner'),
|
||||
(3, 'EmojiCleaner'),
|
||||
(3, 'ExtraSpaceCleaner'),
|
||||
(3, 'FullWidthCharacterCleaner'),
|
||||
(3, 'GrableCharactersCleaner'),
|
||||
(3, 'InvisibleCharactersCleaner'),
|
||||
(3, 'LegendCleaner'),
|
||||
(3, 'PoliticalWordCleaner'),
|
||||
(3, 'SexualAndViolentWordCleaner'),
|
||||
(3, 'TraditionalChineseCleaner'),
|
||||
(3, 'UnicodeSpaceCleaner'),
|
||||
(4, 'ImgFormatter'),
|
||||
(4, 'ImgBlurredImagesCleaner'),
|
||||
(4, 'ImgBrightness'),
|
||||
(4, 'ImgContrast'),
|
||||
(4, 'ImgDenoise'),
|
||||
(4, 'ImgDuplicatedImagesCleaner'),
|
||||
(4, 'ImgPerspectiveTransformation'),
|
||||
(4, 'ImgResize'),
|
||||
(4, 'ImgSaturation'),
|
||||
(4, 'ImgShadowRemove'),
|
||||
(4, 'ImgSharpness'),
|
||||
(4, 'ImgSimilarImagesCleaner'),
|
||||
(4, 'ImgTypeUnify'),
|
||||
(8, 'FileWithShortOrLongLengthFilter'),
|
||||
(8, 'FileWithHighRepeatPhraseRateFilter'),
|
||||
(8, 'FileWithHighRepeatWordRateFilter'),
|
||||
(8, 'FileWithHighSpecialCharRateFilter'),
|
||||
(8, 'FileWithManySensitiveWordsFilter'),
|
||||
(8, 'DuplicateFilesFilter'),
|
||||
(8, 'DuplicateSentencesFilter'),
|
||||
(8, 'AnonymizedCreditCardNumber'),
|
||||
(8, 'AnonymizedIdNumber'),
|
||||
(8, 'AnonymizedIpAddress'),
|
||||
(8, 'AnonymizedPhoneNumber'),
|
||||
(8, 'AnonymizedUrlCleaner'),
|
||||
(8, 'HtmlTagCleaner'),
|
||||
(8, 'XMLTagCleaner'),
|
||||
(8, 'ContentCleaner'),
|
||||
(8, 'EmailNumberCleaner'),
|
||||
(8, 'EmojiCleaner'),
|
||||
(8, 'ExtraSpaceCleaner'),
|
||||
(8, 'FullWidthCharacterCleaner'),
|
||||
(8, 'GrableCharactersCleaner'),
|
||||
(8, 'InvisibleCharactersCleaner'),
|
||||
(8, 'LegendCleaner'),
|
||||
(8, 'PoliticalWordCleaner'),
|
||||
(8, 'SexualAndViolentWordCleaner'),
|
||||
(8, 'TraditionalChineseCleaner'),
|
||||
(8, 'UnicodeSpaceCleaner'),
|
||||
(11, 'TextFormatter'),
|
||||
(11, 'FileExporter'),
|
||||
(11, 'FileWithShortOrLongLengthFilter'),
|
||||
(11, 'FileWithHighRepeatPhraseRateFilter'),
|
||||
(11, 'FileWithHighRepeatWordRateFilter'),
|
||||
(11, 'FileWithHighSpecialCharRateFilter'),
|
||||
(11, 'FileWithManySensitiveWordsFilter'),
|
||||
(11, 'DuplicateFilesFilter'),
|
||||
(11, 'DuplicateSentencesFilter'),
|
||||
(11, 'AnonymizedCreditCardNumber'),
|
||||
(11, 'AnonymizedIdNumber'),
|
||||
(11, 'AnonymizedIpAddress'),
|
||||
(11, 'AnonymizedPhoneNumber'),
|
||||
(11, 'AnonymizedUrlCleaner'),
|
||||
(11, 'HtmlTagCleaner'),
|
||||
(11, 'XMLTagCleaner'),
|
||||
(11, 'ContentCleaner'),
|
||||
(11, 'EmailNumberCleaner'),
|
||||
(11, 'EmojiCleaner'),
|
||||
(11, 'ExtraSpaceCleaner'),
|
||||
(11, 'FullWidthCharacterCleaner'),
|
||||
(11, 'GrableCharactersCleaner'),
|
||||
(11, 'InvisibleCharactersCleaner'),
|
||||
(11, 'LegendCleaner'),
|
||||
(11, 'PoliticalWordCleaner'),
|
||||
(11, 'SexualAndViolentWordCleaner'),
|
||||
(11, 'TraditionalChineseCleaner'),
|
||||
(11, 'UnicodeSpaceCleaner'),
|
||||
(11, 'ImgFormatter'),
|
||||
(11, 'ImgBlurredImagesCleaner'),
|
||||
(11, 'ImgBrightness'),
|
||||
(11, 'ImgContrast'),
|
||||
(11, 'ImgDenoise'),
|
||||
(11, 'ImgDuplicatedImagesCleaner'),
|
||||
(11, 'ImgPerspectiveTransformation'),
|
||||
(11, 'ImgResize'),
|
||||
(11, 'ImgSaturation'),
|
||||
(11, 'ImgShadowRemove'),
|
||||
(11, 'ImgSharpness'),
|
||||
(11, 'ImgSimilarImagesCleaner'),
|
||||
(11, 'ImgTypeUnify');
|
||||
SELECT c.id, o.id
|
||||
FROM t_operator_category c
|
||||
CROSS JOIN t_operator o
|
||||
WHERE c.id IN (3, 8, 11)
|
||||
AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
|
||||
'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
|
||||
'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
|
||||
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
|
||||
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
|
||||
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
|
||||
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
|
||||
|
||||
-- Pair each of the categories 4, 8 and 11 with every listed image operator.
-- The cross join is intentional: the two IN filters are independent, so the
-- result is the full category x operator grid for the selected ids.
-- INSERT IGNORE (MySQL) lets the statement be re-run: rows that would collide
-- with an existing key are skipped instead of aborting the insert.
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    c.id,
    o.id
FROM t_operator_category AS c
CROSS JOIN t_operator AS o
WHERE c.id IN (4, 8, 11)
  AND o.id IN (
      'ImgFormatter',
      'ImgBlurredImagesCleaner',
      'ImgBrightness',
      'ImgContrast',
      'ImgDenoise',
      'ImgDuplicatedImagesCleaner',
      'ImgPerspectiveTransformation',
      'ImgResize',
      'ImgSaturation',
      'ImgShadowRemove',
      'ImgSharpness',
      'ImgSimilarImagesCleaner',
      'ImgTypeUnify'
  );
|
||||
|
||||
-- Attach the FileExporter operator to categories 7, 8 and 11.
-- Written as category x operator so it follows the same shape as the other
-- relation inserts; INSERT IGNORE (MySQL) skips rows that already exist.
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    c.id,
    o.id
FROM t_operator_category AS c
CROSS JOIN t_operator AS o
WHERE c.id IN (7, 8, 11)
  AND o.id IN ('FileExporter');
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
# ---- Stage 1: build the DataX distribution from source ----
FROM maven:3-openjdk-8-slim AS builder

# Switch apt to the Aliyun mirror, install git and clone the upstream DataX sources.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y git && \
    git clone https://github.com/alibaba/DataX.git

# Overlay the repository's runtime/datax/ files onto the cloned tree.
COPY runtime/datax/ DataX/

# Replace the com.mysql.jdbc.Driver class name with com.mysql.cj.jdbc.Driver
# in DataBaseType.java, then build the packaged distribution (tests skipped).
RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
    plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true


# ---- Stage 2: runtime image ----
FROM openjdk:8-jdk-slim

# Install Python and utilities, then drop the apt caches in the same layer.
# The list directory is /var/lib/apt/lists; the previous "apy" spelling matched
# nothing, so the package lists were left in the image.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y python3 python3-pip python-is-python3 vim wget curl nfs-common rsync && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the HTTP wrapper's dependencies from the Aliyun PyPI mirror.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
    pip config --user set global.trusted-host mirrors.aliyun.com && \
    pip install fastapi uvicorn[standard] && \
    pip cache purge

# Bring over the built DataX distribution from the builder stage.
COPY --from=builder /DataX/target/datax/datax /opt/datax

# Add the FastAPI wrapper next to datax.py.
COPY scripts/images/datax/app.py /opt/datax/bin/app.py
|
||||
@@ -1,52 +0,0 @@
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from fastapi import FastAPI
|
||||
from pydantic import BaseModel
|
||||
|
||||
app = FastAPI(title="datax")
|
||||
|
||||
|
||||
# Request body accepted by POST /process.
class CreateJobParam(BaseModel):
    # Job text; the endpoint writes it verbatim to a temporary .json file
    # and passes that file to datax.py.
    content: str
|
||||
|
||||
|
||||
@app.post("/process", tags=["run datax.py"])
async def process(job: CreateJobParam):
    """Run DataX on the submitted job text and report the outcome.

    The request's ``content`` is written to a temporary ``.json`` file whose
    path is passed to ``/opt/datax/bin/datax.py``.

    Returns:
        A dict. On success it is ``{"status": 0}``. On failure ``"status"`` is
        500 (or 408 for a timeout) and ``"stderr"`` carries a message.
    """
    output = {
        "status": "failed",
    }
    try:
        # Store the job text in a temporary file; it is deleted when the
        # with-block exits.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=True) as f:
            f.write(job.content)
            # The child process opens the file by name, so the buffered text
            # must reach disk before it starts.
            f.flush()

            cmd_args = ["python3", "/opt/datax/bin/datax.py", f.name]
            # NOTE(review): subprocess.run blocks the event loop until the job
            # ends, so requests are handled one at a time. Moving it to an
            # executor would change that; confirm before doing so.
            result = subprocess.run(
                cmd_args,
                capture_output=True,
                text=True,
                check=True
            )

            # check=True raises CalledProcessError on any non-zero exit, so
            # the return code recorded here is always 0.
            output["status"] = result.returncode
    except subprocess.TimeoutExpired as e:
        # Only reachable if a timeout= argument is added to subprocess.run.
        output["status"] = 408
        output["stderr"] = f"The script execution timed out: {e.stderr}"
    except subprocess.CalledProcessError as e:
        output["status"] = 500
        # Keep the existing message (child stdout) and append the child's real
        # stderr when it produced any, so neither stream is lost.
        output["stderr"] = f"Script execution failed: {e.stdout}"
        if e.stderr:
            output["stderr"] += f"\n{e.stderr}"
    except Exception as e:
        output["status"] = 500
        output["stderr"] = f"Server error: {str(e)}"
    return output
|
||||
|
||||
|
||||
# Direct execution starts the service on all interfaces, port 8000.
if __name__ == "__main__":
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)
|
||||
@@ -10,7 +10,7 @@ RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi && \
|
||||
FROM nginx:1.29 AS runner
|
||||
|
||||
COPY --from=builder /app/dist /opt/frontend
|
||||
COPY scripts/images/frontend/edm.conf /etc/nginx/conf.d/default.conf
|
||||
COPY scripts/images/frontend/backend.conf /etc/nginx/conf.d/default.conf
|
||||
|
||||
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
|
||||
|
||||
|
||||
@@ -2,8 +2,13 @@ server {
|
||||
listen 80;
|
||||
server_name 0.0.0.0;
|
||||
|
||||
access_log /var/log/datamate/frontend/access.log main;
|
||||
error_log /var/log/datamate/frontend/error.log notice;
|
||||
|
||||
client_max_body_size 1024M;
|
||||
|
||||
location /api/ {
|
||||
proxy_pass http://backend:8080/api/;
|
||||
proxy_pass http://datamate-backend:8080/api/;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
@@ -1,22 +0,0 @@
|
||||
FROM python:3.10-slim

# System packages from apt; caches and package lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y curl vim libgl1 libgl1-mesa-glx libglib2.0-0 procps \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Python packages: PyPI traffic goes through the Aliyun mirror, while the
# CPU builds of torch and torchvision are resolved from download.pytorch.org.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple \
    && pip config --user set global.trusted-host mirrors.aliyun.com \
    && pip install --upgrade setuptools \
    && pip install -U 'mineru[core]==2.1.0' --break-system-packages \
    && pip install torch==2.7.1+cpu -f https://download.pytorch.org/whl/torch/ \
    && pip install torchvision==0.22.1+cpu -f https://download.pytorch.org/whl/torchvision \
    && pip install requests==2.27.1 torch_npu==2.7.1rc1 numpy==1.26.0 decorator==5.2.1 einops==0.8.1 attrs==25.3.0 \
    && pip cache purge

# NOTE(review): an empty CURL_CA_BUNDLE is commonly used to switch off TLS
# certificate verification in Python HTTP clients - confirm this is intended.
ENV CURL_CA_BUNDLE="" \
    TORCH_DEVICE_BACKEND_AUTOLOAD=0

# Fetch all MinerU models from ModelScope at build time.
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"

# Set after the download step so the model source is "local" from here on.
ENV MINERU_MODEL_SOURCE=local
|
||||
@@ -17,8 +17,6 @@ ENV HF_HUB_DISABLE_XET=1
|
||||
|
||||
RUN pip install -e . -i https://mirrors.huaweicloud.com/repository/pypi/simple \
|
||||
&& pip install -r /opt/runtime/datamate/ops/requirements.txt -i https://mirrors.huaweicloud.com/repository/pypi/simple \
|
||||
&& pip cache purge \
|
||||
&& python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" \
|
||||
&& python -c "from unstructured_inference.models.base import get_model; get_model()"
|
||||
&& pip cache purge
|
||||
|
||||
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
|
||||
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
FROM downloads.unstructured.io/unstructured-io/unstructured

# Point pip at the Huawei Cloud mirror, install the web stack for the wrapper
# service, then clear pip's cache.
RUN pip config --user set global.index https://mirrors.huaweicloud.com/repository/pypi \
    && pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple \
    && pip config --user set global.trusted-host mirrors.huaweicloud.com \
    && pip install fastapi uvicorn \
    && pip cache purge

# FastAPI wrapper around unstructured's partition().
COPY scripts/images/unstructured/app.py /app/app.py
|
||||
@@ -1,61 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
app = FastAPI(title="unstructured")
|
||||
|
||||
|
||||
class FileProcessingRequest(BaseModel):
    """Request model for file processing."""
    # Path of the file to process; defaults to None when the client omits it.
    file_path: Optional[str] = None
    # Further optional fields can be added here.
|
||||
|
||||
|
||||
@app.post("/process", tags=["文件处理"])
async def process_file(request_data: FileProcessingRequest):
    """Process a file and return the extracted text content."""
    try:
        target = request_data.file_path

        # Validate the request: a path must be given and must exist on disk.
        # NOTE(review): the 400 message names "filePath" while the model field
        # is file_path - confirm which spelling clients actually send.
        if not target:
            raise HTTPException(status_code=400, detail="缺少必要参数: filePath")
        if not os.path.exists(target):
            raise HTTPException(status_code=404, detail=f"文件不存在: {target}")

        # process_file_async hands the potentially slow work to an executor.
        extracted = await process_file_async(target)

        return {"filePath": target, "text": extracted, "status": "success"}
    except HTTPException:
        # Deliberate HTTP errors raised above pass through unchanged.
        raise
    except Exception as err:
        # Anything else is reported to the client as a 500.
        raise HTTPException(status_code=500, detail=f"process failed: {str(err)}")
|
||||
|
||||
|
||||
async def process_file_async(file_path: str) -> str:
    """Process the file content without blocking the event loop.

    Runs ``partition_file_sync(file_path)`` in the loop's default executor and
    returns its result.
    """
    # Inside a coroutine the running loop is the right one to use;
    # get_running_loop() returns it directly and never creates a new loop.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partition_file_sync, file_path)
|
||||
|
||||
|
||||
def partition_file_sync(file_path: str) -> str:
    """Synchronously process the file content (called from the async wrapper).

    Partitions the file and joins the string form of each element with a
    blank line between them.
    """
    pieces = map(str, partition(filename=file_path))
    return "\n\n".join(pieces)
|
||||
|
||||
|
||||
# Direct execution starts the service on all interfaces, port 8000.
if __name__ == "__main__":
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)
|
||||
@@ -1,103 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# ==========================================================
# Step 1: define the help function
# ==========================================================

# Name of this script, shown in the usage text
SCRIPT_NAME="$(basename "$0")"

# Print the usage text to stdout. $TARGET_DIR_DEFAULT is expanded when the
# function runs, so it only needs to be set before the first call.
function help_message {
cat <<EOF
Usage: $SCRIPT_NAME [-d TARGET_DIR] [-h|--help]

描述:
将预定义的 Docker 镜像列表保存为 .tar 文件。

选项:
-d TARGET_DIR 指定保存镜像的目标目录。
(绝对路径或相对路径)
如果未指定,将使用默认路径: $TARGET_DIR_DEFAULT
-h, --help 显示此帮助信息并退出。

示例:
# 使用默认目录 (./dist)
$SCRIPT_NAME

# 指定保存到 /tmp/my-archive 目录
$SCRIPT_NAME -d /tmp/my-archive
EOF
}
|
||||
|
||||
# ==========================================================
# Step 2: define defaults and parse arguments
# ==========================================================

# Default target directory
TARGET_DIR_DEFAULT="./dist"
TARGET_DIR="$TARGET_DIR_DEFAULT"

# Parse command-line options with getopts.
#   leading ':'  silent mode: getopts sets OPTARG to the offending option
#                character, so the error messages below can name it
#   d:           -d takes an argument (the target directory)
#   -:           lets the documented long option --help through: getopts
#                reports option "-" with the rest of the word in OPTARG
while getopts ":d:h-:" opt; do
    case "$opt" in
        d)
            # -d given: use its argument as the target directory
            TARGET_DIR="$OPTARG"
            ;;
        h)
            # -h: show help and exit
            help_message
            exit 0
            ;;
        -)
            # Long option; only --help is supported
            if [ "$OPTARG" = "help" ]; then
                help_message
                exit 0
            fi
            echo "错误:无效选项 --$OPTARG" >&2
            help_message
            exit 1
            ;;
        :)
            # An option that needs an argument was given without one
            echo "错误:选项 -$OPTARG 需要一个参数" >&2
            help_message
            exit 1
            ;;
        \?)
            # Unknown option
            echo "错误:无效选项 -$OPTARG" >&2
            help_message
            exit 1
            ;;
    esac
done

# Drop the parsed options (this script takes no positional arguments, but
# shifting keeps "$@" correct if any are added later)
shift $((OPTIND - 1))
|
||||
|
||||
|
||||
# ==========================================================
# Step 3: core logic
# ==========================================================

# Ensure the target directory exists
if ! mkdir -p "$TARGET_DIR"; then
    echo "❌ 致命错误:无法创建目标目录: $TARGET_DIR" >&2
    exit 1
fi
echo "目标目录已确认/创建: $TARGET_DIR"
echo "----------------------------------------"

# Image list
images=("frontend:latest" "backend:latest" "runtime:latest" "mysql:8")

# Number of images that could not be saved; decides the exit status below
failed=0

for image in "${images[@]}"; do

    # Build a safe file name by replacing ':' with '_'.
    safe_name="${image//:/_}"

    # Full path of the output archive.
    output_path="$TARGET_DIR/$safe_name.tar"

    echo "正在保存镜像 $image"
    echo " -> 到文件 $output_path"

    # Run docker save and branch on its exit status directly
    if docker save -o "$output_path" "$image"; then
        echo "✅ 保存成功。"
    else
        echo "❌ 保存失败!"
        failed=$((failed + 1))
    fi
    echo ""

done

# Keep going after a failure so every image is attempted, but report the
# failure through the exit status so callers and CI can detect it.
if [ "$failed" -gt 0 ]; then
    echo "❌ $failed 个镜像保存失败。" >&2
    exit 1
fi
|
||||
Reference in New Issue
Block a user