[Feature] Refactor project to use 'datamate' naming convention for services and configurations (#14)

* Enhance CleaningTaskService to track cleaning process progress and update ExecutorType to DATAMATE
* Refactor project to use 'datamate' naming convention for services and configurations
2025-10-22 17:53:16 +08:00
parent 175d9ded93
commit 31ef8bc265
39 changed files with 312 additions and 737 deletions
--- a/scripts/db/data-cleaning-init.sql
+++ b/scripts/db/data-cleaning-init.sql
@@ -22,6 +22,7 @@ CREATE TABLE IF NOT EXISTS t_clean_task
    dest_dataset_name varchar(64),
    before_size       bigint,
    after_size        bigint,
+    file_count        int,
    created_at        timestamp default current_timestamp,
    started_at        timestamp,
    finished_at       timestamp,
--- a/scripts/db/data-operator-init.sql
+++ b/scripts/db/data-operator-init.sql
@@ -111,113 +111,30 @@ VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0',


 INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
-VALUES (3, 'TextFormatter'),
-       (7, 'FileExporter'),
-       (8, 'TextFormatter'),
-       (8, 'FileExporter'),
-       (3, 'FileWithShortOrLongLengthFilter'),
-       (3, 'FileWithHighRepeatPhraseRateFilter'),
-       (3, 'FileWithHighRepeatWordRateFilter'),
-       (3, 'FileWithHighSpecialCharRateFilter'),
-       (3, 'FileWithManySensitiveWordsFilter'),
-       (3, 'DuplicateFilesFilter'),
-       (3, 'DuplicateSentencesFilter'),
-       (3, 'AnonymizedCreditCardNumber'),
-       (3, 'AnonymizedIdNumber'),
-       (3, 'AnonymizedIpAddress'),
-       (3, 'AnonymizedPhoneNumber'),
-       (3, 'AnonymizedUrlCleaner'),
-       (3, 'HtmlTagCleaner'),
-       (3, 'XMLTagCleaner'),
-       (3, 'ContentCleaner'),
-       (3, 'EmailNumberCleaner'),
-       (3, 'EmojiCleaner'),
-       (3, 'ExtraSpaceCleaner'),
-       (3, 'FullWidthCharacterCleaner'),
-       (3, 'GrableCharactersCleaner'),
-       (3, 'InvisibleCharactersCleaner'),
-       (3, 'LegendCleaner'),
-       (3, 'PoliticalWordCleaner'),
-       (3, 'SexualAndViolentWordCleaner'),
-       (3, 'TraditionalChineseCleaner'),
-       (3, 'UnicodeSpaceCleaner'),
-       (4, 'ImgFormatter'),
-       (4, 'ImgBlurredImagesCleaner'),
-       (4, 'ImgBrightness'),
-       (4, 'ImgContrast'),
-       (4, 'ImgDenoise'),
-       (4, 'ImgDuplicatedImagesCleaner'),
-       (4, 'ImgPerspectiveTransformation'),
-       (4, 'ImgResize'),
-       (4, 'ImgSaturation'),
-       (4, 'ImgShadowRemove'),
-       (4, 'ImgSharpness'),
-       (4, 'ImgSimilarImagesCleaner'),
-       (4, 'ImgTypeUnify'),
-       (8, 'FileWithShortOrLongLengthFilter'),
-       (8, 'FileWithHighRepeatPhraseRateFilter'),
-       (8, 'FileWithHighRepeatWordRateFilter'),
-       (8, 'FileWithHighSpecialCharRateFilter'),
-       (8, 'FileWithManySensitiveWordsFilter'),
-       (8, 'DuplicateFilesFilter'),
-       (8, 'DuplicateSentencesFilter'),
-       (8, 'AnonymizedCreditCardNumber'),
-       (8, 'AnonymizedIdNumber'),
-       (8, 'AnonymizedIpAddress'),
-       (8, 'AnonymizedPhoneNumber'),
-       (8, 'AnonymizedUrlCleaner'),
-       (8, 'HtmlTagCleaner'),
-       (8, 'XMLTagCleaner'),
-       (8, 'ContentCleaner'),
-       (8, 'EmailNumberCleaner'),
-       (8, 'EmojiCleaner'),
-       (8, 'ExtraSpaceCleaner'),
-       (8, 'FullWidthCharacterCleaner'),
-       (8, 'GrableCharactersCleaner'),
-       (8, 'InvisibleCharactersCleaner'),
-       (8, 'LegendCleaner'),
-       (8, 'PoliticalWordCleaner'),
-       (8, 'SexualAndViolentWordCleaner'),
-       (8, 'TraditionalChineseCleaner'),
-       (8, 'UnicodeSpaceCleaner'),
-       (11, 'TextFormatter'),
-       (11, 'FileExporter'),
-       (11, 'FileWithShortOrLongLengthFilter'),
-       (11, 'FileWithHighRepeatPhraseRateFilter'),
-       (11, 'FileWithHighRepeatWordRateFilter'),
-       (11, 'FileWithHighSpecialCharRateFilter'),
-       (11, 'FileWithManySensitiveWordsFilter'),
-       (11, 'DuplicateFilesFilter'),
-       (11, 'DuplicateSentencesFilter'),
-       (11, 'AnonymizedCreditCardNumber'),
-       (11, 'AnonymizedIdNumber'),
-       (11, 'AnonymizedIpAddress'),
-       (11, 'AnonymizedPhoneNumber'),
-       (11, 'AnonymizedUrlCleaner'),
-       (11, 'HtmlTagCleaner'),
-       (11, 'XMLTagCleaner'),
-       (11, 'ContentCleaner'),
-       (11, 'EmailNumberCleaner'),
-       (11, 'EmojiCleaner'),
-       (11, 'ExtraSpaceCleaner'),
-       (11, 'FullWidthCharacterCleaner'),
-       (11, 'GrableCharactersCleaner'),
-       (11, 'InvisibleCharactersCleaner'),
-       (11, 'LegendCleaner'),
-       (11, 'PoliticalWordCleaner'),
-       (11, 'SexualAndViolentWordCleaner'),
-       (11, 'TraditionalChineseCleaner'),
-       (11, 'UnicodeSpaceCleaner'),
-       (11, 'ImgFormatter'),
-       (11, 'ImgBlurredImagesCleaner'),
-       (11, 'ImgBrightness'),
-       (11, 'ImgContrast'),
-       (11, 'ImgDenoise'),
-       (11, 'ImgDuplicatedImagesCleaner'),
-       (11, 'ImgPerspectiveTransformation'),
-       (11, 'ImgResize'),
-       (11, 'ImgSaturation'),
-       (11, 'ImgShadowRemove'),
-       (11, 'ImgSharpness'),
-       (11, 'ImgSimilarImagesCleaner'),
-       (11, 'ImgTypeUnify');
+SELECT c.id, o.id
+FROM t_operator_category c
+CROSS JOIN t_operator o
+WHERE c.id IN (3, 8, 11)
+AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
+            'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
+            'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
+            'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
+            'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
+            'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
+            'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
+
+INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
+SELECT c.id, o.id
+FROM t_operator_category c
+       CROSS JOIN t_operator o
+WHERE c.id IN (4, 8, 11)
+  AND o.id IN ('ImgFormatter', 'ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
+               'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
+               'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify');
+
+INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
+SELECT c.id, o.id
+FROM t_operator_category c
+       CROSS JOIN t_operator o
+WHERE c.id IN (7, 8, 11)
+  AND o.id IN ('FileExporter');
--- a/scripts/images/datax/Dockerfile
+++ b/scripts/images/datax/Dockerfile
@@ -1,33 +0,0 @@
-FROM maven:3-openjdk-8-slim AS builder
-
-RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
-    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
-    apt-get update && \
-    apt-get install -y git && \
-    git clone https://github.com/alibaba/DataX.git
-
-COPY runtime/datax/ DataX/
-
-RUN cd DataX && \
-    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
-    plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
-    mvn -U clean package assembly:assembly -Dmaven.test.skip=true
-
-
-FROM openjdk:8-jdk-slim
-
-RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
-    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
-    apt-get update && \
-    apt-get install -y python3 python3-pip python-is-python3 vim wget curl nfs-common rsync && \
-    apt-get clean && \
-    rm -rf /var/lib/apy/lists/*
-
-RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
-    pip config --user set global.trusted-host mirrors.aliyun.com && \
-    pip install fastapi uvicorn[standard] && \
-    pip cache purge
-
-COPY --from=builder /DataX/target/datax/datax /opt/datax
-
-COPY scripts/images/datax/app.py /opt/datax/bin/app.py
--- a/scripts/images/datax/app.py
+++ b/scripts/images/datax/app.py
@@ -1,52 +0,0 @@
-import subprocess
-import tempfile
-
-from fastapi import FastAPI
-from pydantic import BaseModel
-
-app = FastAPI(title="datax")
-
-
-class CreateJobParam(BaseModel):
-    content: str
-
-
-@app.post("/process", tags=["run datax.py"])
-async def process(job: CreateJobParam):
-    output = {
-        "status": "failed",
-    }
-    try:
-        # 创建临时文件存储Python脚本
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=True) as f:
-            f.write(job.content)
-            f.seek(0)
-
-            cmd_args = ["python3", "/opt/datax/bin/datax.py", f.name]
-            result = subprocess.run(
-                cmd_args,
-                capture_output=True,
-                text=True,
-                check=True
-            )
-
-            output["status"] = result.returncode
-            if result.returncode != 0:
-                output["stdout"] = result.stdout
-                output["stderr"] = result.stderr
-    except subprocess.TimeoutExpired as e:
-        output["status"] = 408
-        output["stderr"] = f"The script execution timed out: {e.stderr}"
-    except subprocess.CalledProcessError as e:
-        output["status"] = 500
-        output["stderr"] = f"Script execution failed: {e.stdout}"
-    except Exception as e:
-        output["status"] = 500
-        output["stderr"] = f"Server error: {str(e)}"
-    return output
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/scripts/images/frontend/Dockerfile
+++ b/scripts/images/frontend/Dockerfile
@@ -10,7 +10,7 @@ RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi && \
 FROM nginx:1.29 AS runner

 COPY --from=builder /app/dist /opt/frontend
-COPY scripts/images/frontend/edm.conf /etc/nginx/conf.d/default.conf
+COPY scripts/images/frontend/backend.conf /etc/nginx/conf.d/default.conf

 RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

--- a/scripts/images/frontend/backend.conf
+++ b/scripts/images/frontend/backend.conf
@@ -2,8 +2,13 @@ server {
    listen       80;
    server_name  0.0.0.0;

+    access_log /var/log/datamate/frontend/access.log main;
+    error_log /var/log/datamate/frontend/error.log notice;
+
+    client_max_body_size 1024M;
+
    location /api/ {
-        proxy_pass http://backend:8080/api/;
+        proxy_pass http://datamate-backend:8080/api/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
--- a/scripts/images/mineru/Dockerfile
+++ b/scripts/images/mineru/Dockerfile
@@ -1,22 +0,0 @@
-FROM python:3.10-slim
-
-RUN apt-get update && \
-    apt-get install -y curl vim libgl1 libgl1-mesa-glx libglib2.0-0 procps && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
-    pip config --user set global.trusted-host mirrors.aliyun.com && \
-    pip install --upgrade setuptools && \
-    pip install -U 'mineru[core]==2.1.0' --break-system-packages && \
-    pip install torch==2.7.1+cpu -f https://download.pytorch.org/whl/torch/ && \
-    pip install torchvision==0.22.1+cpu -f https://download.pytorch.org/whl/torchvision && \
-    pip install requests==2.27.1 torch_npu==2.7.1rc1 numpy==1.26.0 decorator==5.2.1 einops==0.8.1 attrs==25.3.0 && \
-    pip cache purge
-
-ENV CURL_CA_BUNDLE=""
-ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
-
-RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
-
-ENV MINERU_MODEL_SOURCE=local
--- a/scripts/images/runtime/Dockerfile
+++ b/scripts/images/runtime/Dockerfile
@@ -17,8 +17,6 @@ ENV HF_HUB_DISABLE_XET=1

 RUN pip install -e . -i https://mirrors.huaweicloud.com/repository/pypi/simple \
    && pip install -r /opt/runtime/datamate/ops/requirements.txt -i https://mirrors.huaweicloud.com/repository/pypi/simple \
-    && pip cache purge \
-    && python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" \
-    && python -c "from unstructured_inference.models.base import get_model; get_model()"
+    && pip cache purge

-RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
+RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
--- a/scripts/images/unstructured/Dockerfile
+++ b/scripts/images/unstructured/Dockerfile
@@ -1,9 +0,0 @@
-FROM downloads.unstructured.io/unstructured-io/unstructured
-
-RUN pip config --user set global.index https://mirrors.huaweicloud.com/repository/pypi && \
-    pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple && \
-    pip config --user set global.trusted-host mirrors.huaweicloud.com && \
-    pip install fastapi uvicorn && \
-    pip cache purge
-
-COPY scripts/images/unstructured/app.py /app/app.py
--- a/scripts/images/unstructured/app.py
+++ b/scripts/images/unstructured/app.py
@@ -1,61 +0,0 @@
-import asyncio
-import os
-from typing import Optional
-
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from unstructured.partition.auto import partition
-
-app = FastAPI(title="unstructured")
-
-
-class FileProcessingRequest(BaseModel):
-    """文件处理请求模型"""
-    file_path: Optional[str] = None
-    # 可添加其他可选字段
-
-
-@app.post("/process", tags=["文件处理"])
-async def process_file(request_data: FileProcessingRequest):
-    """处理文件并返回提取的文本内容"""
-    try:
-        file_path = request_data.file_path
-
-        if not file_path:
-            raise HTTPException(status_code=400, detail="缺少必要参数: filePath")
-
-        if not os.path.exists(file_path):
-            raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")
-
-        # 异步执行可能耗时的文件处理操作
-        text_content = await process_file_async(file_path)
-
-        # 返回处理结果
-        return {
-            "filePath": file_path,
-            "text": text_content,
-            "status": "success"
-        }
-
-    except HTTPException as e:
-        raise e
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"process failed: {str(e)}")
-
-
-async def process_file_async(file_path: str) -> str:
-    """异步处理文件内容"""
-    loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(None, partition_file_sync, file_path)
-
-
-def partition_file_sync(file_path: str) -> str:
-    """同步处理文件内容（由异步函数调用）"""
-    elements = partition(filename=file_path)
-    return "\n\n".join([str(el) for el in elements])
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/scripts/save_images.sh
+++ b/scripts/save_images.sh
@@ -1,103 +0,0 @@
-#!/bin/bash
-
-# ==========================================================
-# 步骤 1: 定义帮助函数
-# ==========================================================
-
-# 脚本名称
-SCRIPT_NAME=$(basename "$0")
-
-help_message() {
-    cat << EOF
-Usage: $SCRIPT_NAME [-d TARGET_DIR] [-h|--help]
-
-描述: 
-  将预定义的 Docker 镜像列表保存为 .tar 文件。
-
-选项:
-  -d TARGET_DIR  指定保存镜像的目标目录。
-                 (绝对路径或相对路径)
-                 如果未指定，将使用默认路径: $TARGET_DIR_DEFAULT
-  -h, --help     显示此帮助信息并退出。
-
-示例:
-  # 使用默认目录 (./dist)
-  $SCRIPT_NAME
-
-  # 指定保存到 /tmp/my-archive 目录
-  $SCRIPT_NAME -d /tmp/my-archive
-EOF
-}
-
-# ==========================================================
-# 步骤 2: 定义默认值和处理参数
-# ==========================================================
-
-# 默认目标目录
-TARGET_DIR_DEFAULT="./dist"
-TARGET_DIR="$TARGET_DIR_DEFAULT"
-
-# 使用 getopts 处理命令行选项。
-# d: 表示 -d 选项后需要一个参数（目标目录）。
-while getopts "d:h" opt; do
-    case ${opt} in
-        d )
-            # 如果 -d 选项被指定，使用传入的参数作为目标目录
-            TARGET_DIR="$OPTARG"
-            ;;
-        h )
-            # 如果是 -h 选项，显示帮助并退出
-            help_message
-            exit 0
-            ;;
-        \? )
-            # 处理无效的选项
-            echo "错误：无效选项 -$OPTARG" >&2
-            help_message
-            exit 1
-            ;;
-    esac
-done
-
-# 移动到下一个非选项参数 (通常此脚本没有其他参数，但这是最佳实践)
-shift $((OPTIND -1))
-
-
-# ==========================================================
-# 步骤 3: 脚本核心逻辑
-# ==========================================================
-
-# 检查/创建目标文件夹
-if ! mkdir -p "$TARGET_DIR"; then
-    echo "❌ 致命错误：无法创建目标目录: $TARGET_DIR" >&2
-    exit 1
-fi
-echo "目标目录已确认/创建: $TARGET_DIR"
-echo "----------------------------------------"
-
-# Image list
-images=("frontend:latest" "backend:latest" "runtime:latest" "mysql:8")
-
-for image in "${images[@]}"; do
-
-    # 清理镜像名称，用 '_' 替换 ':'，以创建安全的文件名。
-    safe_name="${image//[:]/_}"
-    
-    # 构造完整的输出文件路径。
-    output_path="$TARGET_DIR/$safe_name.tar"
-
-    echo "正在保存镜像 $image"
-    echo "  -> 到文件 $output_path"
-    
-    # 执行 docker save 命令
-    docker save -o "$output_path" "$image"
-
-    # 检查保存是否成功 ($? 存储上一个命令的退出状态)
-    if [ $? -eq 0 ]; then
-        echo "✅ 保存成功。"
-    else
-        echo "❌ 保存失败！"
-    fi
-    echo ""
-
-done