[Feature] Refactor project to use 'datamate' naming convention for services and configurations (#14)

* Enhance CleaningTaskService to track cleaning process progress and update ExecutorType to DATAMATE

* Refactor project to use 'datamate' naming convention for services and configurations
This commit is contained in:
hhhhsc701
2025-10-22 17:53:16 +08:00
committed by GitHub
parent 175d9ded93
commit 31ef8bc265
39 changed files with 312 additions and 737 deletions

View File

@@ -22,6 +22,7 @@ CREATE TABLE IF NOT EXISTS t_clean_task
dest_dataset_name varchar(64),
before_size bigint,
after_size bigint,
file_count int,
created_at timestamp default current_timestamp,
started_at timestamp,
finished_at timestamp,

View File

@@ -111,113 +111,30 @@ VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0',
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
VALUES (3, 'TextFormatter'),
(7, 'FileExporter'),
(8, 'TextFormatter'),
(8, 'FileExporter'),
(3, 'FileWithShortOrLongLengthFilter'),
(3, 'FileWithHighRepeatPhraseRateFilter'),
(3, 'FileWithHighRepeatWordRateFilter'),
(3, 'FileWithHighSpecialCharRateFilter'),
(3, 'FileWithManySensitiveWordsFilter'),
(3, 'DuplicateFilesFilter'),
(3, 'DuplicateSentencesFilter'),
(3, 'AnonymizedCreditCardNumber'),
(3, 'AnonymizedIdNumber'),
(3, 'AnonymizedIpAddress'),
(3, 'AnonymizedPhoneNumber'),
(3, 'AnonymizedUrlCleaner'),
(3, 'HtmlTagCleaner'),
(3, 'XMLTagCleaner'),
(3, 'ContentCleaner'),
(3, 'EmailNumberCleaner'),
(3, 'EmojiCleaner'),
(3, 'ExtraSpaceCleaner'),
(3, 'FullWidthCharacterCleaner'),
(3, 'GrableCharactersCleaner'),
(3, 'InvisibleCharactersCleaner'),
(3, 'LegendCleaner'),
(3, 'PoliticalWordCleaner'),
(3, 'SexualAndViolentWordCleaner'),
(3, 'TraditionalChineseCleaner'),
(3, 'UnicodeSpaceCleaner'),
(4, 'ImgFormatter'),
(4, 'ImgBlurredImagesCleaner'),
(4, 'ImgBrightness'),
(4, 'ImgContrast'),
(4, 'ImgDenoise'),
(4, 'ImgDuplicatedImagesCleaner'),
(4, 'ImgPerspectiveTransformation'),
(4, 'ImgResize'),
(4, 'ImgSaturation'),
(4, 'ImgShadowRemove'),
(4, 'ImgSharpness'),
(4, 'ImgSimilarImagesCleaner'),
(4, 'ImgTypeUnify'),
(8, 'FileWithShortOrLongLengthFilter'),
(8, 'FileWithHighRepeatPhraseRateFilter'),
(8, 'FileWithHighRepeatWordRateFilter'),
(8, 'FileWithHighSpecialCharRateFilter'),
(8, 'FileWithManySensitiveWordsFilter'),
(8, 'DuplicateFilesFilter'),
(8, 'DuplicateSentencesFilter'),
(8, 'AnonymizedCreditCardNumber'),
(8, 'AnonymizedIdNumber'),
(8, 'AnonymizedIpAddress'),
(8, 'AnonymizedPhoneNumber'),
(8, 'AnonymizedUrlCleaner'),
(8, 'HtmlTagCleaner'),
(8, 'XMLTagCleaner'),
(8, 'ContentCleaner'),
(8, 'EmailNumberCleaner'),
(8, 'EmojiCleaner'),
(8, 'ExtraSpaceCleaner'),
(8, 'FullWidthCharacterCleaner'),
(8, 'GrableCharactersCleaner'),
(8, 'InvisibleCharactersCleaner'),
(8, 'LegendCleaner'),
(8, 'PoliticalWordCleaner'),
(8, 'SexualAndViolentWordCleaner'),
(8, 'TraditionalChineseCleaner'),
(8, 'UnicodeSpaceCleaner'),
(11, 'TextFormatter'),
(11, 'FileExporter'),
(11, 'FileWithShortOrLongLengthFilter'),
(11, 'FileWithHighRepeatPhraseRateFilter'),
(11, 'FileWithHighRepeatWordRateFilter'),
(11, 'FileWithHighSpecialCharRateFilter'),
(11, 'FileWithManySensitiveWordsFilter'),
(11, 'DuplicateFilesFilter'),
(11, 'DuplicateSentencesFilter'),
(11, 'AnonymizedCreditCardNumber'),
(11, 'AnonymizedIdNumber'),
(11, 'AnonymizedIpAddress'),
(11, 'AnonymizedPhoneNumber'),
(11, 'AnonymizedUrlCleaner'),
(11, 'HtmlTagCleaner'),
(11, 'XMLTagCleaner'),
(11, 'ContentCleaner'),
(11, 'EmailNumberCleaner'),
(11, 'EmojiCleaner'),
(11, 'ExtraSpaceCleaner'),
(11, 'FullWidthCharacterCleaner'),
(11, 'GrableCharactersCleaner'),
(11, 'InvisibleCharactersCleaner'),
(11, 'LegendCleaner'),
(11, 'PoliticalWordCleaner'),
(11, 'SexualAndViolentWordCleaner'),
(11, 'TraditionalChineseCleaner'),
(11, 'UnicodeSpaceCleaner'),
(11, 'ImgFormatter'),
(11, 'ImgBlurredImagesCleaner'),
(11, 'ImgBrightness'),
(11, 'ImgContrast'),
(11, 'ImgDenoise'),
(11, 'ImgDuplicatedImagesCleaner'),
(11, 'ImgPerspectiveTransformation'),
(11, 'ImgResize'),
(11, 'ImgSaturation'),
(11, 'ImgShadowRemove'),
(11, 'ImgSharpness'),
(11, 'ImgSimilarImagesCleaner'),
(11, 'ImgTypeUnify');
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN (3, 8, 11)
AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
-- Pair each Img* operator found in t_operator with categories 4, 8 and 11.
-- Only operators and categories that exist in their tables produce rows.
-- INSERT IGNORE downgrades duplicate-key errors to warnings
-- (assumes a unique key on (category_id, operator_id) -- confirm against the DDL).
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    cat.id,
    op.id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (4, 8, 11)
  AND op.id IN (
      'ImgFormatter',
      'ImgBlurredImagesCleaner',
      'ImgBrightness',
      'ImgContrast',
      'ImgDenoise',
      'ImgDuplicatedImagesCleaner',
      'ImgPerspectiveTransformation',
      'ImgResize',
      'ImgSaturation',
      'ImgShadowRemove',
      'ImgSharpness',
      'ImgSimilarImagesCleaner',
      'ImgTypeUnify'
  );
-- Pair the FileExporter operator (if present in t_operator) with categories 7, 8 and 11.
-- INSERT IGNORE downgrades duplicate-key errors to warnings
-- (assumes a unique key on (category_id, operator_id) -- confirm against the DDL).
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    cat.id,
    op.id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (7, 8, 11)
  AND op.id = 'FileExporter';

View File

@@ -1,33 +0,0 @@
# Stage 1: build DataX from source with Maven.
FROM maven:3-openjdk-8-slim AS builder

# Point apt at the Aliyun mirror, install git and clone the upstream DataX sources.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y git && \
    git clone https://github.com/alibaba/DataX.git

# Overlay the repository's runtime/datax/ content onto the cloned tree.
COPY runtime/datax/ DataX/

# Rewrite the MySQL JDBC driver class name from com.mysql.jdbc.Driver to
# com.mysql.cj.jdbc.Driver, then build the DataX distribution (tests skipped).
RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true

# Stage 2: runtime image with a JDK, Python 3 and the HTTP wrapper.
FROM openjdk:8-jdk-slim

# Install runtime tools from the Aliyun mirror and drop the apt cache and
# package lists in the same layer so they do not end up in the image.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y python3 python3-pip python-is-python3 vim wget curl nfs-common rsync && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the web framework used by app.py. The extras spec is quoted so the
# shell can never treat the brackets as a glob pattern.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
    pip config --user set global.trusted-host mirrors.aliyun.com && \
    pip install fastapi 'uvicorn[standard]' && \
    pip cache purge

# Copy the built DataX distribution and the HTTP wrapper into place.
COPY --from=builder /DataX/target/datax/datax /opt/datax
COPY scripts/images/datax/app.py /opt/datax/bin/app.py

View File

@@ -1,52 +0,0 @@
import subprocess
import tempfile
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI(title="datax")
class CreateJobParam(BaseModel):
    """Request body for POST /process."""

    # Text written verbatim to a temporary .json file that is handed to datax.py.
    content: str
@app.post("/process", tags=["run datax.py"])
async def process(job: CreateJobParam):
    """Run datax.py on the submitted job content and report the outcome.

    The content is written to a temporary ``.json`` file whose path is passed
    to ``/opt/datax/bin/datax.py``.

    Args:
        job: Request body; ``job.content`` is the text of the job file.

    Returns:
        A dict with ``status`` (0 on success, 408 on timeout, 500 on any other
        failure) and, on failure, ``stderr`` describing what went wrong.
    """
    # Local import: this module does not import asyncio at file level.
    import asyncio

    output = {
        "status": "failed",
    }
    try:
        # Temporary file holding the job content; deleted when the block exits.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=True) as f:
            f.write(job.content)
            # Push buffered text to disk so the child process sees the whole file.
            f.flush()
            cmd_args = ["python3", "/opt/datax/bin/datax.py", f.name]
            # subprocess.run blocks until the child exits; run it in the default
            # executor so the event loop can keep serving other requests.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    cmd_args,
                    capture_output=True,
                    text=True,
                    check=True,
                ),
            )
        # check=True raises CalledProcessError on a non-zero exit, so the
        # return code is always 0 when execution reaches this line.
        output["status"] = result.returncode
    except subprocess.TimeoutExpired as e:
        # Only reachable if a timeout is ever passed to subprocess.run.
        output["status"] = 408
        output["stderr"] = f"The script execution timed out: {e.stderr}"
    except subprocess.CalledProcessError as e:
        output["status"] = 500
        # Report the child's stdout as before and append its stderr when
        # present, so diagnostics written to either stream are not lost.
        detail = f"{e.stdout}"
        if e.stderr:
            detail = f"{detail}\n{e.stderr}"
        output["stderr"] = f"Script execution failed: {detail}"
    except Exception as e:
        output["status"] = 500
        output["stderr"] = f"Server error: {str(e)}"
    return output
# When executed directly, serve the app with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -10,7 +10,7 @@ RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi && \
FROM nginx:1.29 AS runner
COPY --from=builder /app/dist /opt/frontend
COPY scripts/images/frontend/edm.conf /etc/nginx/conf.d/default.conf
COPY scripts/images/frontend/backend.conf /etc/nginx/conf.d/default.conf
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

View File

@@ -2,8 +2,13 @@ server {
listen 80;
server_name 0.0.0.0;
access_log /var/log/datamate/frontend/access.log main;
error_log /var/log/datamate/frontend/error.log notice;
client_max_body_size 1024M;
location /api/ {
proxy_pass http://backend:8080/api/;
proxy_pass http://datamate-backend:8080/api/;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

View File

@@ -1,22 +0,0 @@
# Base image: slim Python 3.10.
FROM python:3.10-slim
# System packages; the apt cache and package lists are removed in the same layer.
RUN apt-get update && \
apt-get install -y curl vim libgl1 libgl1-mesa-glx libglib2.0-0 procps && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Python packages: configure the Aliyun PyPI mirror, then install mineru,
# the +cpu builds of torch/torchvision and a set of pinned support packages
# (including torch_npu); the pip cache is purged at the end of the layer.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
pip config --user set global.trusted-host mirrors.aliyun.com && \
pip install --upgrade setuptools && \
pip install -U 'mineru[core]==2.1.0' --break-system-packages && \
pip install torch==2.7.1+cpu -f https://download.pytorch.org/whl/torch/ && \
pip install torchvision==0.22.1+cpu -f https://download.pytorch.org/whl/torchvision && \
pip install requests==2.27.1 torch_npu==2.7.1rc1 numpy==1.26.0 decorator==5.2.1 einops==0.8.1 attrs==25.3.0 && \
pip cache purge
# NOTE(review): an empty CURL_CA_BUNDLE presumably relaxes TLS certificate
# verification for the model download below -- confirm this is intended.
ENV CURL_CA_BUNDLE=""
# NOTE(review): presumably stops torch from auto-loading device backends such
# as torch_npu at import time -- confirm.
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
# Download all MinerU models from the modelscope source at build time.
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Select the local model source (presumably the models downloaded above -- confirm).
ENV MINERU_MODEL_SOURCE=local

View File

@@ -17,8 +17,6 @@ ENV HF_HUB_DISABLE_XET=1
RUN pip install -e . -i https://mirrors.huaweicloud.com/repository/pypi/simple \
&& pip install -r /opt/runtime/datamate/ops/requirements.txt -i https://mirrors.huaweicloud.com/repository/pypi/simple \
&& pip cache purge \
&& python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" \
&& python -c "from unstructured_inference.models.base import get_model; get_model()"
&& pip cache purge
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

View File

@@ -1,9 +0,0 @@
# Base image: the upstream unstructured image (no tag specified).
FROM downloads.unstructured.io/unstructured-io/unstructured
# Configure pip for the Huawei Cloud PyPI mirror, install the web framework
# used by app.py, and purge the pip cache in the same layer.
RUN pip config --user set global.index https://mirrors.huaweicloud.com/repository/pypi && \
pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple && \
pip config --user set global.trusted-host mirrors.huaweicloud.com && \
pip install fastapi uvicorn && \
pip cache purge
# Add the HTTP wrapper around unstructured's partition().
COPY scripts/images/unstructured/app.py /app/app.py

View File

@@ -1,61 +0,0 @@
import asyncio
import os
from typing import Optional
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from unstructured.partition.auto import partition
app = FastAPI(title="unstructured")
class FileProcessingRequest(BaseModel):
    """Request model for file processing."""

    # Path of the file to process. Optional in the schema; the /process
    # handler rejects a missing or empty value with HTTP 400.
    file_path: Optional[str] = None
    # Other optional fields can be added here.
@app.post("/process", tags=["文件处理"])
async def process_file(request_data: FileProcessingRequest):
    """Extract the text content of the file named in the request.

    Args:
        request_data: Request body carrying ``file_path``.

    Returns:
        A dict with ``filePath`` (the input path), ``text`` (the extracted
        content) and ``status`` (``"success"``).

    Raises:
        HTTPException: 400 if ``file_path`` is missing or empty, 404 if the
            path does not exist, 500 if processing fails for any other reason.
    """
    try:
        file_path = request_data.file_path
        if not file_path:
            # Name the field exactly as the request model declares it, so the
            # message tells the client which key to send.
            raise HTTPException(status_code=400, detail="缺少必要参数: file_path")
        if not os.path.exists(file_path):
            raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")
        # Await the potentially slow extraction without blocking the handler.
        text_content = await process_file_async(file_path)
        # Return the processing result.
        return {
            "filePath": file_path,
            "text": text_content,
            "status": "success"
        }
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"process failed: {str(e)}")
async def process_file_async(file_path: str) -> str:
    """Run the blocking partition step in the event loop's default executor.

    Args:
        file_path: Path of the file to process.

    Returns:
        The text produced by ``partition_file_sync`` for ``file_path``.
    """
    # Inside a coroutine the running loop is the one to use;
    # get_running_loop() states that intent and never creates a new loop.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partition_file_sync, file_path)
def partition_file_sync(file_path: str) -> str:
    """Partition the file and return its elements as text (synchronous).

    Each element is converted with ``str`` and the pieces are joined with a
    blank line between them.
    """
    parts = partition(filename=file_path)
    return "\n\n".join(map(str, parts))
# When executed directly, serve the app with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -1,103 +0,0 @@
#!/bin/bash
# ==========================================================
# Step 1: define the help function
# ==========================================================
# Script name (basename of the invoked path), used in the usage text.
SCRIPT_NAME=$(basename "$0")
# Print the usage text to stdout.
# The here-document delimiter is unquoted, so $SCRIPT_NAME and
# $TARGET_DIR_DEFAULT are expanded when the function is called.
help_message() {
cat << EOF
Usage: $SCRIPT_NAME [-d TARGET_DIR] [-h|--help]
描述:
将预定义的 Docker 镜像列表保存为 .tar 文件。
选项:
-d TARGET_DIR 指定保存镜像的目标目录。
(绝对路径或相对路径)
如果未指定,将使用默认路径: $TARGET_DIR_DEFAULT
-h, --help 显示此帮助信息并退出。
示例:
# 使用默认目录 (./dist)
$SCRIPT_NAME
# 指定保存到 /tmp/my-archive 目录
$SCRIPT_NAME -d /tmp/my-archive
EOF
}
# ==========================================================
# Step 2: define defaults and parse arguments
# ==========================================================
# Default target directory
TARGET_DIR_DEFAULT="./dist"
TARGET_DIR="$TARGET_DIR_DEFAULT"

# Parse command-line options with getopts. Option string ":d:h-:":
#   leading ':'  silent mode: getopts stores the offending option character in
#                OPTARG instead of printing its own diagnostic
#   d:           -d requires an argument (the target directory)
#   h            -h shows the help text
#   -:           lets "--help" through as option '-' with OPTARG="help"
while getopts ":d:h-:" opt; do
    case ${opt} in
        d )
            # -d was given: use its argument as the target directory
            TARGET_DIR="$OPTARG"
            ;;
        h )
            # -h: show help and exit
            help_message
            exit 0
            ;;
        - )
            # Long options: only --help is supported
            if [ "$OPTARG" = "help" ]; then
                help_message
                exit 0
            fi
            echo "错误:无效选项 --$OPTARG" >&2
            help_message
            exit 1
            ;;
        : )
            # An option that requires an argument was given without one
            echo "错误:选项 -$OPTARG 需要一个参数" >&2
            help_message
            exit 1
            ;;
        \? )
            # Unknown option; OPTARG holds the option character in silent mode
            echo "错误:无效选项 -$OPTARG" >&2
            help_message
            exit 1
            ;;
    esac
done
# Drop the parsed options so "$@" holds only the remaining positional arguments
# (this script takes none, but it keeps later additions safe).
shift $((OPTIND - 1))
# ==========================================================
# Step 3: core logic
# ==========================================================
# Make sure the target directory exists (create it if needed).
if ! mkdir -p "$TARGET_DIR"; then
    echo "❌ 致命错误:无法创建目标目录: $TARGET_DIR" >&2
    exit 1
fi
echo "目标目录已确认/创建: $TARGET_DIR"
echo "----------------------------------------"

# Image list
images=("frontend:latest" "backend:latest" "runtime:latest" "mysql:8")

# Number of images that could not be saved; decides the exit status below.
failed_count=0

for image in "${images[@]}"; do
    # Replace ':' with '_' so the image reference is usable as a file name.
    safe_name="${image//[:]/_}"
    # Full path of the archive to write.
    output_path="$TARGET_DIR/$safe_name.tar"
    echo "正在保存镜像 $image"
    echo " -> 到文件 $output_path"
    # Test the command directly rather than inspecting $? afterwards.
    if docker save -o "$output_path" "$image"; then
        echo "✅ 保存成功。"
    else
        echo "❌ 保存失败!" >&2
        failed_count=$((failed_count + 1))
    fi
    echo ""
done

# Keep going after individual failures, but report them through the exit
# status so callers (CI, wrapper scripts) can detect an incomplete export.
if [ "$failed_count" -gt 0 ]; then
    exit 1
fi