diff --git a/Makefile.offline.mk b/Makefile.offline.mk new file mode 100644 index 0000000..ecf885b --- /dev/null +++ b/Makefile.offline.mk @@ -0,0 +1,257 @@ +# ============================================================================ +# Makefile 离线构建扩展 +# 将此文件内容追加到主 Makefile 末尾,或单独包含使用 +# ============================================================================ + +# 离线构建配置 +CACHE_DIR ?= ./build-cache +OFFLINE_VERSION ?= latest + +# 创建 buildx 构建器(如果不存在) +.PHONY: ensure-buildx +ensure-buildx: + @if ! docker buildx inspect offline-builder > /dev/null 2>&1; then \ + echo "创建 buildx 构建器..."; \ + docker buildx create --name offline-builder --driver docker-container --use 2>/dev/null || docker buildx use offline-builder; \ + else \ + docker buildx use offline-builder 2>/dev/null || true; \ + fi + +# ========== 离线缓存导出(有网环境) ========== + +.PHONY: offline-export +offline-export: ensure-buildx + @echo "======================================" + @echo "导出离线构建缓存..." + @echo "======================================" + @mkdir -p $(CACHE_DIR)/buildkit $(CACHE_DIR)/images $(CACHE_DIR)/resources + @$(MAKE) _offline-export-base-images + @$(MAKE) _offline-export-cache + @$(MAKE) _offline-export-resources + @$(MAKE) _offline-package + +.PHONY: _offline-export-base-images +_offline-export-base-images: + @echo "" + @echo "1. 导出基础镜像..." + @bash -c 'images=( \ + "maven:3-eclipse-temurin-21" \ + "maven:3-eclipse-temurin-8" \ + "eclipse-temurin:21-jdk" \ + "mysql:8" \ + "node:20-alpine" \ + "nginx:1.29" \ + "ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm" \ + "ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm" \ + "ghcr.nju.edu.cn/astral-sh/uv:latest" \ + "python:3.12-slim" \ + "python:3.11-slim" \ + "gcr.io/distroless/nodejs20-debian12" \ + ); for img in "$${images[@]}"; do echo " Pulling $$img..."; docker pull "$$img" 2>/dev/null || true; done' + @echo " Saving base images..." 
+ @docker save -o $(CACHE_DIR)/images/base-images.tar \ + maven:3-eclipse-temurin-21 \ + maven:3-eclipse-temurin-8 \ + eclipse-temurin:21-jdk \ + mysql:8 \ + node:20-alpine \ + nginx:1.29 \ + ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm \ + ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm \ + ghcr.nju.edu.cn/astral-sh/uv:latest \ + python:3.12-slim \ + python:3.11-slim \ + gcr.io/distroless/nodejs20-debian12 2>/dev/null || echo " Warning: Some images may not exist" + +.PHONY: _offline-export-cache +_offline-export-cache: + @echo "" + @echo "2. 导出 BuildKit 缓存..." + @echo " backend..." + @docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/backend-cache,mode=max -f scripts/images/backend/Dockerfile -t datamate-backend:cache . 2>/dev/null || echo " Warning: backend cache export failed" + @echo " backend-python..." + @docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/backend-python-cache,mode=max -f scripts/images/backend-python/Dockerfile -t datamate-backend-python:cache . 2>/dev/null || echo " Warning: backend-python cache export failed" + @echo " database..." + @docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/database-cache,mode=max -f scripts/images/database/Dockerfile -t datamate-database:cache . 2>/dev/null || echo " Warning: database cache export failed" + @echo " frontend..." + @docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/frontend-cache,mode=max -f scripts/images/frontend/Dockerfile -t datamate-frontend:cache . 2>/dev/null || echo " Warning: frontend cache export failed" + @echo " gateway..." + @docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/gateway-cache,mode=max -f scripts/images/gateway/Dockerfile -t datamate-gateway:cache . 2>/dev/null || echo " Warning: gateway cache export failed" + @echo " runtime..." 
+	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/runtime-cache,mode=max -f scripts/images/runtime/Dockerfile -t datamate-runtime:cache . 2>/dev/null || echo " Warning: runtime cache export failed"
+	@echo " deer-flow-backend..."
+	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/deer-flow-backend-cache,mode=max -f scripts/images/deer-flow-backend/Dockerfile -t deer-flow-backend:cache . 2>/dev/null || echo " Warning: deer-flow-backend cache export failed"
+	@echo " deer-flow-frontend..."
+	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/deer-flow-frontend-cache,mode=max -f scripts/images/deer-flow-frontend/Dockerfile -t deer-flow-frontend:cache . 2>/dev/null || echo " Warning: deer-flow-frontend cache export failed"
+	@echo " mineru..."
+	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/mineru-cache,mode=max -f scripts/images/mineru/Dockerfile -t datamate-mineru:cache . 2>/dev/null || echo " Warning: mineru cache export failed"
+
+.PHONY: _offline-export-resources
+_offline-export-resources: # Step 3: pre-download models and third-party sources into $(CACHE_DIR)/resources. Each download goes to "<file>.part" and is renamed only on success, so a failed run never truncates a good copy or leaves an empty file to be packaged.
+	@echo ""
+	@echo "3. 预下载外部资源..."
+	@mkdir -p $(CACHE_DIR)/resources/models
+	@echo " PaddleOCR model..."
+	@f="$(CACHE_DIR)/resources/models/ch_ppocr_mobile_v2.0_cls_infer.tar"; wget -q -O "$$f.part" \
+		https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar 2>/dev/null && mv -f "$$f.part" "$$f" || { rm -f "$$f.part"; echo " Warning: PaddleOCR model download failed"; }
+	@echo " spaCy model..."
+	@f="$(CACHE_DIR)/resources/models/zh_core_web_sm-3.8.0-py3-none-any.whl"; wget -q -O "$$f.part" \
+		https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl 2>/dev/null && mv -f "$$f.part" "$$f" || { rm -f "$$f.part"; echo " Warning: spaCy model download failed"; }
+	@echo " DataX source..."
+	@if [ ! -d "$(CACHE_DIR)/resources/DataX" ]; then \
+		git clone --depth 1 https://gitee.com/alibaba/DataX.git $(CACHE_DIR)/resources/DataX 2>/dev/null || echo " Warning: DataX clone failed"; \
+	fi
+	@echo " deer-flow source..." 
+	@if [ ! -d "$(CACHE_DIR)/resources/deer-flow" ]; then \
+		git clone --depth 1 https://ghproxy.net/https://github.com/ModelEngine-Group/deer-flow.git $(CACHE_DIR)/resources/deer-flow 2>/dev/null || echo " Warning: deer-flow clone failed"; \
+	fi
+
+.PHONY: _offline-package
+_offline-package: # Step 4: pack buildkit/, images/ and resources/ (no top-level directory inside the archive) into $(CACHE_DIR)/build-cache-YYYYMMDD.tar.gz.
+	@echo ""
+	@echo "4. 打包缓存..."
+	@tar -czf "$(CACHE_DIR)/build-cache-$$(date +%Y%m%d).tar.gz" -C "$(CACHE_DIR)" buildkit images resources
+	@echo ""
+	@echo "======================================"
+	@echo "✓ 缓存导出完成!"
+	@echo "======================================"
+	@echo "传输文件: $(CACHE_DIR)/build-cache-$$(date +%Y%m%d).tar.gz"
+
+# ========== Offline build (air-gapped machine) ==========
+
+.PHONY: offline-setup
+offline-setup: # Unpack the newest build-cache-*.tar.gz from the current directory into $(CACHE_DIR) (only when that directory is missing), load the base images, and make sure the buildx builder exists.
+	@echo "======================================"
+	@echo "设置离线构建环境..."
+	@echo "======================================"
+	@if [ ! -d "$(CACHE_DIR)" ]; then \
+		echo "查找并解压缓存包..."; \
+		cache_file=$$(ls -t build-cache-*.tar.gz 2>/dev/null | head -1); \
+		if [ -z "$$cache_file" ]; then \
+			echo "错误: 未找到缓存压缩包 (build-cache-*.tar.gz)"; \
+			exit 1; \
+		fi; \
+		echo "解压 $$cache_file..."; \
+		mkdir -p "$(CACHE_DIR)" && tar -xzf "$$cache_file" -C "$(CACHE_DIR)"; \
+	else \
+		echo "缓存目录已存在: $(CACHE_DIR)"; \
+	fi
+	@echo ""
+	@echo "加载基础镜像..."
+	@if [ -f "$(CACHE_DIR)/images/base-images.tar" ]; then \
+		docker load -i "$(CACHE_DIR)/images/base-images.tar"; \
+	else \
+		echo "警告: 基础镜像文件不存在,假设已手动加载"; \
+	fi
+	@$(MAKE) ensure-buildx
+	@echo ""
+	@echo "✓ 离线环境准备完成"
+
+.PHONY: offline-build
+offline-build: offline-setup # Prepare the offline environment, then build every service image from the local cache.
+	@echo ""
+	@echo "======================================"
+	@echo "开始离线构建..."
+	@echo "======================================"
+	@$(MAKE) _offline-build-services
+
+.PHONY: _offline-build-services
+_offline-build-services: ensure-buildx # Build each service image from its BuildKit cache directory under $(CACHE_DIR)/buildkit.
+	@echo ""
+	@echo "构建 datamate-database..."
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/database-cache \
+		--network=none \
+		-f scripts/images/database/Dockerfile \
+		-t datamate-database:$(OFFLINE_VERSION) \
+		--load . 
2>/dev/null || { echo " Warning: database build may need network, retrying without --network=none..."; \
+	docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/database-cache \
+		-f scripts/images/database/Dockerfile \
+		-t datamate-database:$(OFFLINE_VERSION) \
+		--load . || { echo " Failed"; exit 1; }; }
+
+	@echo ""
+	@echo "构建 datamate-gateway..."
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/gateway-cache \
+		-f scripts/images/gateway/Dockerfile \
+		-t datamate-gateway:$(OFFLINE_VERSION) \
+		--load . || { echo " Failed"; exit 1; }
+
+	@echo ""
+	@echo "构建 datamate-backend..."
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/backend-cache \
+		-f scripts/images/backend/Dockerfile \
+		-t datamate-backend:$(OFFLINE_VERSION) \
+		--load . || { echo " Failed"; exit 1; }
+
+	@echo ""
+	@echo "构建 datamate-frontend..."
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/frontend-cache \
+		-f scripts/images/frontend/Dockerfile \
+		-t datamate-frontend:$(OFFLINE_VERSION) \
+		--load . || { echo " Failed"; exit 1; }
+
+	@echo ""
+	@echo "构建 datamate-runtime..."
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/runtime-cache \
+		--build-arg RESOURCES_DIR=$(CACHE_DIR)/resources \
+		-f scripts/images/runtime/Dockerfile \
+		-t datamate-runtime:$(OFFLINE_VERSION) \
+		--load . || { echo " Failed"; exit 1; }
+
+	@echo ""
+	@echo "构建 datamate-backend-python..."
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/backend-python-cache \
+		--build-arg RESOURCES_DIR=$(CACHE_DIR)/resources \
+		-f scripts/images/backend-python/Dockerfile \
+		-t datamate-backend-python:$(OFFLINE_VERSION) \
+		--load . || { echo " Failed"; exit 1; }
+
+	@echo ""
+	@echo "======================================"
+	@echo "✓ 离线构建完成"
+	@echo "======================================"
+
+# Build a single service offline: `make <service>-offline-build` ($* is the service name, e.g. backend, runtime).
+# NOTE: .PHONY does not accept pattern targets, so none is declared here; the phony prerequisites below already force this recipe to run every time.
+%-offline-build: offline-setup ensure-buildx
+	@echo "离线构建 $*..."
+	@if [ ! 
-d "$(CACHE_DIR)/buildkit/$*-cache" ]; then \
+		echo "错误: $* 的缓存不存在"; \
+		exit 1; \
+	fi
+# Image name: deer-flow-* services keep their own name, every other service gets the datamate- prefix (computed inline on the -t line instead of via $(eval), which would leak a global variable).
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/$*-cache \
+		$(if $(filter runtime backend-python deer-flow%,$*),--build-arg RESOURCES_DIR=$(CACHE_DIR)/resources,) \
+		-f scripts/images/$*/Dockerfile \
+		-t $(if $(filter deer-flow%,$*),$*,datamate-$*):$(OFFLINE_VERSION) \
+		--load .
+
+# ========== Help ==========
+
+.PHONY: help-offline
+help-offline: # Print the offline-build commands and the end-to-end workflow.
+	@echo "离线构建命令:"
+	@echo " make offline-export [CACHE_DIR=./build-cache] - 在有网环境导出构建缓存"
+	@echo " make offline-setup [CACHE_DIR=./build-cache] - 解压并准备离线缓存"
+	@echo " make offline-build [CACHE_DIR=./build-cache] - 在无网环境构建所有服务"
+	@echo " make <service>-offline-build - 离线构建单个服务"
+	@echo " (如: make backend-offline-build)"
+	@echo ""
+	@echo "完整工作流程:"
+	@echo " # 1. 有网环境导出缓存"
+	@echo " make offline-export"
+	@echo ""
+	@echo " # 2. 传输缓存到无网环境"
+	@echo " scp $(CACHE_DIR)/build-cache-*.tar.gz user@offline-server:/path/to/project/"
+	@echo ""
+	@echo " # 3. 
无网环境构建"
+	@echo " mkdir -p $(CACHE_DIR) && tar -xzf build-cache-*.tar.gz -C $(CACHE_DIR)"
+	@echo " make offline-build"
diff --git a/scripts/offline/Dockerfile.backend-python.offline b/scripts/offline/Dockerfile.backend-python.offline
new file mode 100644
index 0000000..c47bdb5
--- /dev/null
+++ b/scripts/offline/Dockerfile.backend-python.offline
@@ -0,0 +1,93 @@
+# backend-python Dockerfile, offline variant.
+# Difference from the online Dockerfile: DataX is copied from a local checkout instead of being git-cloned.
+
+FROM maven:3-eclipse-temurin-8 AS datax-builder
+
+# Write a minimal Maven settings.xml that routes every repository through the Aliyun public mirror.
+RUN mkdir -p /root/.m2 && \
+    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
+<settings>\n\
+  <mirrors>\n\
+    <mirror>\n\
+      <id>aliyunmaven</id>\n\
+      <mirrorOf>*</mirrorOf>\n\
+      <name>阿里云公共仓库</name>\n\
+      <url>https://maven.aliyun.com/repository/public</url>\n\
+    </mirror>\n\
+  </mirrors>\n\
+</settings>' > /root/.m2/settings.xml
+
+# Offline mode: the local DataX path (relative to the build context) comes from a build argument.
+ARG DATAX_LOCAL_PATH=./build-cache/resources/DataX
+
+# Copy the pre-downloaded DataX sources into the build stage.
+COPY ${DATAX_LOCAL_PATH} /DataX
+
+COPY runtime/datax/ DataX/
+
+RUN cd DataX && \
+    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
+    plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
+    mvn -U clean package assembly:assembly -Dmaven.test.skip=true
+
+FROM python:3.12-slim
+
+# Switch apt to the Aliyun mirror, then install the system packages.
+RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
+        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
+    elif [ -f /etc/apt/sources.list ]; then \
+        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
+    fi && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends vim openjdk-21-jre nfs-common glusterfs-client rsync && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    POETRY_VERSION=2.2.1 \
+    POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_CREATE=false \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+
+ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk
+
+ENV PATH="/root/.local/bin:$JAVA_HOME/bin:$PATH"
+
+WORKDIR /app
+
+# Point pip at the Aliyun PyPI mirror and install Poetry through pipx.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \
+    
pip config set global.trusted-host mirrors.aliyun.com && \ + pip install --upgrade --root-user-action=ignore pip \ + && pip install --root-user-action=ignore pipx \ + && pipx install "poetry==$POETRY_VERSION" + +COPY --from=datax-builder /DataX/target/datax/datax /opt/datax +RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/ + +# Copy only dependency files first +COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock* /app/ + +# Install dependencies +RUN --mount=type=cache,target=$POETRY_CACHE_DIR \ + poetry install --no-root --only main + +# 离线模式: 使用本地 NLTK 数据 +ARG NLTK_DATA_LOCAL_PATH=./build-cache/resources/nltk_data +COPY ${NLTK_DATA_LOCAL_PATH} /usr/local/nltk_data + +ENV NLTK_DATA=/usr/local/nltk_data + +# Copy the rest of the application +COPY runtime/datamate-python /app + +COPY runtime/datamate-python/deploy/docker-entrypoint.sh /docker-entrypoint.sh +RUN chmod +x /docker-entrypoint.sh || true + +# Expose the application port +EXPOSE 18000 + +ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/scripts/offline/Dockerfile.deer-flow-backend.offline b/scripts/offline/Dockerfile.deer-flow-backend.offline new file mode 100644 index 0000000..9076ae4 --- /dev/null +++ b/scripts/offline/Dockerfile.deer-flow-backend.offline @@ -0,0 +1,44 @@ +# deer-flow-backend Dockerfile 离线版本 +# 修改点: 使用本地 deer-flow 源码替代 git clone + +FROM ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm + +# Install uv. 
+COPY --from=ghcr.nju.edu.cn/astral-sh/uv:latest /uv /bin/uv + +# 配置 apt 阿里云镜像源并安装系统依赖 +RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ + sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \ + elif [ -f /etc/apt/sources.list ]; then \ + sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \ + fi && \ + apt-get update && apt-get install -y \ + libpq-dev git \ + && rm -rf /var/lib/apt/lists/* + +# 配置 uv 使用阿里云 PyPI 镜像 +ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/" + +WORKDIR /app + +# 离线模式: 本地 deer-flow 路径 +ARG RESOURCES_DIR=./build-cache/resources +ARG DEERFLOW_DIR=${RESOURCES_DIR}/deer-flow + +# 复制本地 deer-flow 源码(离线环境预先下载) +COPY ${DEERFLOW_DIR} /app +COPY runtime/deer-flow/.env /app/.env +COPY runtime/deer-flow/conf.yaml /app/conf.yaml + +# Pre-cache the application dependencies. +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --locked --no-install-project + +# Install the application dependencies. +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --locked + +EXPOSE 8000 + +# Run the application. 
+CMD ["uv", "run", "--no-sync", "python", "server.py", "--host", "0.0.0.0", "--port", "8000"] diff --git a/scripts/offline/Dockerfile.deer-flow-frontend.offline b/scripts/offline/Dockerfile.deer-flow-frontend.offline new file mode 100644 index 0000000..e16aad6 --- /dev/null +++ b/scripts/offline/Dockerfile.deer-flow-frontend.offline @@ -0,0 +1,75 @@ +# deer-flow-frontend Dockerfile 离线版本 +# 修改点: 使用本地 deer-flow 源码替代 git clone + +##### DEPENDENCIES + +FROM node:20-alpine AS deps +RUN apk add --no-cache libc6-compat openssl +WORKDIR /app + +# 离线模式: 本地 deer-flow 路径 +ARG RESOURCES_DIR=./build-cache/resources +ARG DEERFLOW_DIR=${RESOURCES_DIR}/deer-flow + +# 复制本地 deer-flow 源码 +COPY ${DEERFLOW_DIR}/web /app + +# 配置 npm 淘宝镜像并安装依赖 +RUN npm config set registry https://registry.npmmirror.com && \ + if [ -f yarn.lock ]; then yarn config set registry https://registry.npmmirror.com && yarn --frozen-lockfile; \ + elif [ -f package-lock.json ]; then npm ci; \ + elif [ -f pnpm-lock.yaml ]; then npm install -g pnpm && pnpm config set registry https://registry.npmmirror.com && pnpm i; \ + else echo "Lockfile not found." 
&& exit 1; \ + fi + +##### BUILDER + +FROM node:20-alpine AS builder + +RUN apk add --no-cache git + +WORKDIR /app +ARG NEXT_PUBLIC_API_URL="/deer-flow-backend" + +# 离线模式: 复制本地源码 +ARG RESOURCES_DIR=./build-cache/resources +ARG DEERFLOW_DIR=${RESOURCES_DIR}/deer-flow + +COPY ${DEERFLOW_DIR} /deer-flow + +RUN cd /deer-flow \ + && mv /deer-flow/web/* /app \ + && rm -rf /deer-flow + +COPY --from=deps /app/node_modules ./node_modules + +ENV NEXT_TELEMETRY_DISABLED=1 + +# 配置 npm 淘宝镜像 +RUN npm config set registry https://registry.npmmirror.com && \ + if [ -f yarn.lock ]; then yarn config set registry https://registry.npmmirror.com && SKIP_ENV_VALIDATION=1 yarn build; \ + elif [ -f package-lock.json ]; then SKIP_ENV_VALIDATION=1 npm run build; \ + elif [ -f pnpm-lock.yaml ]; then npm install -g pnpm && pnpm config set registry https://registry.npmmirror.com && SKIP_ENV_VALIDATION=1 pnpm run build; \ + else echo "Lockfile not found." && exit 1; \ + fi + +##### RUNNER + +FROM gcr.io/distroless/nodejs20-debian12 AS runner +WORKDIR /app + +ENV NODE_ENV=production + +ENV NEXT_TELEMETRY_DISABLED=1 + +COPY --from=builder /app/next.config.js ./ +COPY --from=builder /app/public ./public +COPY --from=builder /app/package.json ./package.json + +COPY --from=builder /app/.next/standalone ./ +COPY --from=builder /app/.next/static ./.next/static + +EXPOSE 3000 +ENV PORT=3000 + +CMD ["server.js"] diff --git a/scripts/offline/Dockerfile.runtime.offline b/scripts/offline/Dockerfile.runtime.offline new file mode 100644 index 0000000..4cd29cc --- /dev/null +++ b/scripts/offline/Dockerfile.runtime.offline @@ -0,0 +1,54 @@ +# runtime Dockerfile 离线版本 +# 修改点: 使用本地模型文件替代 wget 下载 + +FROM ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm + +# 配置 apt 阿里云镜像源 +RUN --mount=type=cache,target=/var/cache/apt \ + --mount=type=cache,target=/var/lib/apt \ + if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ + sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \ + elif 
[ -f /etc/apt/sources.list ]; then \ + sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \ + fi \ + && apt update \ + && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig poppler-utils tesseract-ocr + +# 离线模式: 本地模型文件路径 +ARG RESOURCES_DIR=./build-cache/resources +ARG MODELS_DIR=${RESOURCES_DIR}/models + +# 复制本地 PaddleOCR 模型(离线环境预先下载) +RUN mkdir -p /home/models +COPY ${MODELS_DIR}/ch_ppocr_mobile_v2.0_cls_infer.tar /home/models/ +RUN tar -xf /home/models/ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models + +COPY runtime/python-executor /opt/runtime +COPY runtime/ops /opt/runtime/datamate/ops +COPY runtime/ops/user /opt/runtime/user +COPY scripts/images/runtime/start.sh /opt/runtime/start.sh + +ENV PYTHONPATH=/opt/runtime/datamate/ +ENV UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV UV_INDEX_STRATEGY=unsafe-best-match +# 配置 uv 使用阿里云 PyPI 镜像 +ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/" + +WORKDIR /opt/runtime + +# 复制本地 spaCy 模型(离线环境预先下载) +COPY ${MODELS_DIR}/zh_core_web_sm-3.8.0-py3-none-any.whl /tmp/ + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e .[all] --system \ + && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \ + && uv pip install /tmp/zh_core_web_sm-3.8.0-py3-none-any.whl --system \ + && echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth + +RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \ + && chmod +x /opt/runtime/start.sh \ + && dos2unix /opt/runtime/start.sh + +EXPOSE 8081 + +ENTRYPOINT ["/opt/runtime/start.sh"] diff --git a/scripts/offline/Makefile.offline b/scripts/offline/Makefile.offline new file mode 100644 index 0000000..f83c42f --- /dev/null +++ b/scripts/offline/Makefile.offline @@ -0,0 +1,76 @@ +# Makefile 离线构建扩展 +# 将此内容追加到主 Makefile 或单独使用 +# 使用方法: make -f Makefile.offline + +# 离线构建配置 +CACHE_DIR ?= ./build-cache +VERSION ?= latest + +# ========== 离线构建目标 ========== + +.PHONY: 
offline-export
+offline-export: # Run the export script from the project root; it writes the cache into $(CACHE_DIR).
+	@echo "导出离线构建缓存..."
+	@bash scripts/offline/export-cache.sh "$(CACHE_DIR)"
+
+.PHONY: offline-build
+offline-build: # Build all images from the cache via the offline build script.
+	@echo "使用缓存进行离线构建..."
+	@bash scripts/offline/build-offline.sh "$(CACHE_DIR)" "$(VERSION)"
+
+.PHONY: offline-setup
+offline-setup: # Unpack the newest build-cache-*.tar.gz into $(CACHE_DIR) when that directory is missing. The archive has no top-level directory, hence the explicit -C.
+	@echo "解压并设置离线缓存..."
+	@if [ ! -d "$(CACHE_DIR)" ]; then \
+		echo "查找缓存压缩包..."; \
+		cache_file=$$(ls -t build-cache-*.tar.gz 2>/dev/null | head -1); \
+		if [ -z "$$cache_file" ]; then \
+			echo "错误: 未找到缓存压缩包 (build-cache-*.tar.gz)"; \
+			exit 1; \
+		fi; \
+		echo "解压 $$cache_file..."; \
+		mkdir -p "$(CACHE_DIR)" && tar -xzf "$$cache_file" -C "$(CACHE_DIR)"; \
+	fi
+	@echo "✓ 离线缓存准备完成"
+
+# Offline build of a single service: `make <service>-offline-build` ($* is the service name).
+# NOTE: .PHONY does not accept pattern targets, so none is declared for this rule.
+%-offline-build:
+	@echo "离线构建 $*..."
+# Cache directory for the service is $(CACHE_DIR)/buildkit/$*-cache.
+# Image name: deer-flow-* services keep their own name, every other service gets the datamate- prefix.
+	@if [ ! -d "$(CACHE_DIR)/buildkit/$*-cache" ]; then \
+		echo "错误: $* 的缓存不存在于 $(CACHE_DIR)/buildkit/$*-cache"; \
+		exit 1; \
+	fi
+	@docker buildx build \
+		--cache-from type=local,src=$(CACHE_DIR)/buildkit/$*-cache \
+		--network=none \
+		-f scripts/images/$*/Dockerfile \
+		-t $(if $(filter deer-flow%,$*),$*,datamate-$*):$(VERSION) \
+		--load \
+		. 
|| echo "警告: $* 离线构建失败" + +# 兼容原 Makefile 的构建目标(离线模式) +.PHONY: build-offline +build-offline: offline-setup + @$(MAKE) offline-build + +.PHONY: help-offline +help-offline: + @echo "离线构建命令:" + @echo " make offline-export - 在有网环境导出构建缓存" + @echo " make offline-setup - 解压并准备离线缓存" + @echo " make offline-build - 在无网环境使用缓存构建" + @echo " make -offline-build - 离线构建单个服务" + @echo "" + @echo "示例:" + @echo " # 有网环境导出缓存" + @echo " make offline-export" + @echo "" + @echo " # 传输 build-cache-*.tar.gz 到无网环境" + @echo " scp build-cache-20250202.tar.gz user@offline-server:/path/" + @echo "" + @echo " # 无网环境构建" + @echo " make offline-setup" + @echo " make offline-build" diff --git a/scripts/offline/README.md b/scripts/offline/README.md new file mode 100644 index 0000000..1d7e4da --- /dev/null +++ b/scripts/offline/README.md @@ -0,0 +1,245 @@ +# BuildKit 离线构建方案 + +本方案使用 Docker BuildKit 的缓存机制,实现在弱网/无网环境下的镜像构建。 + +## 方案概述 + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ 有网环境 (Build Machine) │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ 基础镜像 │ │ BuildKit │ │ 外部资源 │ │ +│ │ docker pull │ + │ 缓存导出 │ + │ (模型/源码) │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────┘ │ +│ │ │ │ │ +│ └──────────────────┼──────────────────┘ │ +│ ▼ │ +│ ┌──────────────────┐ │ +│ │ build-cache.tar.gz│ │ +│ └────────┬─────────┘ │ +└─────────────────────────────┼───────────────────────────────────┘ + │ 传输到无网环境 + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 无网环境 (Offline Machine) │ +│ ┌──────────────────┐ │ +│ │ build-cache.tar.gz│ │ +│ └────────┬─────────┘ │ +│ ▼ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ docker load │ │ BuildKit │ │ 本地资源挂载 │ │ +│ │ 基础镜像 │ + │ 缓存导入 │ + │ (模型/源码) │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────┘ │ +│ │ │ │ │ +│ └──────────────────┼──────────────────┘ │ +│ ▼ │ +│ 构建成功! 
│
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 快速开始
+
+### 方法一:使用 Makefile 扩展(推荐)
+
+#### 1. 合并 Makefile
+
+将 `Makefile.offline.mk` 追加到主 Makefile:
+
+```bash
+# Linux/Mac
+cat Makefile.offline.mk >> Makefile
+
+# Windows (PowerShell)
+Get-Content Makefile.offline.mk | Add-Content Makefile
+```
+
+#### 2. 有网环境导出缓存
+
+```bash
+# 导出所有缓存(包括基础镜像、BuildKit 缓存、外部资源)
+make offline-export
+
+# 或者指定输出目录
+make offline-export CACHE_DIR=/path/to/cache
+```
+
+执行完成后,会在缓存目录内生成压缩包:`build-cache/build-cache-YYYYMMDD.tar.gz`
+
+#### 3. 传输到无网环境
+
+```bash
+# 使用 scp 或其他方式传输
+scp build-cache/build-cache-20250202.tar.gz user@offline-server:/opt/datamate/
+
+# 或者使用 U 盘等物理介质
+```
+
+#### 4. 无网环境构建
+
+```bash
+# 解压缓存到 build-cache/(压缩包内不含顶层目录,必须用 -C 指定目标目录)
+mkdir -p build-cache && tar -xzf build-cache-20250202.tar.gz -C build-cache
+
+# 设置环境并构建
+make offline-setup
+make offline-build
+
+# 或者指定版本号
+make offline-build OFFLINE_VERSION=v1.0.0
+```
+
+### 方法二:使用独立脚本
+
+#### 导出缓存
+
+```bash
+# 在项目根目录执行(脚本内使用相对于项目根目录的路径)
+bash scripts/offline/export-cache.sh /path/to/output
+```
+
+#### 离线构建
+
+```bash
+# 在项目根目录执行(脚本内使用相对于项目根目录的路径)
+bash scripts/offline/build-offline.sh /path/to/cache [version]
+```
+
+## 详细说明
+
+### 缓存内容
+
+缓存目录结构:
+
+```
+build-cache/
+├── buildkit/                    # BuildKit 缓存
+│   ├── backend-cache/
+│   ├── backend-python-cache/
+│   ├── database-cache/
+│   ├── frontend-cache/
+│   ├── gateway-cache/
+│   ├── runtime-cache/
+│   ├── deer-flow-backend-cache/
+│   ├── deer-flow-frontend-cache/
+│   └── mineru-cache/
+├── images/
+│   └── base-images.tar          # 基础镜像集合
+└── resources/                   # 外部资源
+    ├── models/
+    │   ├── ch_ppocr_mobile_v2.0_cls_infer.tar       # PaddleOCR 模型
+    │   └── zh_core_web_sm-3.8.0-py3-none-any.whl    # spaCy 模型
+    ├── DataX/                   # DataX 源码
+    └── deer-flow/               # deer-flow 源码
+```
+
+### 单个服务构建
+
+```bash
+# 仅构建 backend
+make backend-offline-build
+
+# 仅构建 runtime
+make runtime-offline-build
+
+# 仅构建 deer-flow-backend
+make deer-flow-backend-offline-build
+```
+
+### 增量更新
+
+如果只有部分服务代码变更,可以只导出该服务的缓存:
+
+```bash
+# 重新导出 backend 缓存
+docker buildx build \
+  --cache-to type=local,dest=./build-cache/buildkit/backend-cache,mode=max \
+  -f 
scripts/images/backend/Dockerfile \ + -t datamate-backend:cache . + +# 传输并重新构建 +tar -czf build-cache-partial.tar.gz build-cache/buildkit/backend-cache +# ... 传输到无网环境 ... +make backend-offline-build +``` + +## 故障排查 + +### 问题 1: 缓存导入失败 + +``` +ERROR: failed to solve: failed to read cache metadata +``` + +**解决**: 缓存目录可能损坏,重新在有网环境导出。 + +### 问题 2: 基础镜像不存在 + +``` +ERROR: pull access denied +``` + +**解决**: 先执行 `make offline-setup` 加载基础镜像。 + +### 问题 3: 网络连接错误(无网环境) + +``` +ERROR: failed to do request: dial tcp: lookup ... +``` + +**解决**: 检查 Dockerfile 中是否还有网络依赖,可能需要修改 Dockerfile 使用本地资源。 + +### 问题 4: 内存不足 + +BuildKit 缓存可能占用大量内存,可以设置资源限制: + +```bash +# 创建带资源限制的 buildx 构建器 +docker buildx create --name offline-builder \ + --driver docker-container \ + --driver-opt memory=8g \ + --use +``` + +## 限制说明 + +1. **镜像版本**: 基础镜像版本必须与缓存导出时一致 +2. **Dockerfile 变更**: 如果 Dockerfile 发生较大变更,可能需要重新导出缓存 +3. **资源文件**: mineru 镜像中的模型下载(`mineru-models-download`)仍需要网络,如果需要在完全无网环境使用,需要预先将模型文件挂载到镜像中 + +## 高级用法 + +### 自定义缓存位置 + +```bash +make offline-export CACHE_DIR=/mnt/nas/build-cache +make offline-build CACHE_DIR=/mnt/nas/build-cache +``` + +### 导出特定平台缓存 + +```bash +# 导出 ARM64 平台的缓存 +docker buildx build \ + --platform linux/arm64 \ + --cache-to type=local,dest=./build-cache/buildkit/backend-cache,mode=max \ + -f scripts/images/backend/Dockerfile . +``` + +### 使用远程缓存(有网环境) + +```bash +# 导出到 S3/MinIO +docker buildx build \ + --cache-to type=s3,region=us-east-1,bucket=mybucket,name=backend-cache \ + -f scripts/images/backend/Dockerfile . + +# 从 S3 导入 +docker buildx build \ + --cache-from type=s3,region=us-east-1,bucket=mybucket,name=backend-cache \ + -f scripts/images/backend/Dockerfile . 
+``` + +## 参考 + +- [Docker BuildKit Documentation](https://docs.docker.com/build/buildkit/) +- [Cache Storage Backends](https://docs.docker.com/build/cache/backends/) diff --git a/scripts/offline/build-offline.sh b/scripts/offline/build-offline.sh new file mode 100644 index 0000000..2192271 --- /dev/null +++ b/scripts/offline/build-offline.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# BuildKit 离线构建脚本 - 在无网环境执行 +# Usage: ./build-offline.sh [cache-dir] [version] + +set -e + +CACHE_DIR="${1:-./build-cache}" +VERSION="${2:-latest}" +BUILDKIT_CACHE_DIR="$CACHE_DIR/buildkit" +IMAGES_DIR="$CACHE_DIR/images" +RESOURCES_DIR="$CACHE_DIR/resources" + +# 检查缓存目录 +if [ ! -d "$CACHE_DIR" ]; then + echo "错误: 缓存目录 $CACHE_DIR 不存在" + echo "请先解压缓存包: tar -xzf build-cache-*.tar.gz" + exit 1 +fi + +# 确保 buildx 构建器存在 +if ! docker buildx inspect offline-builder > /dev/null 2>&1; then + echo "创建 buildx 构建器..." + docker buildx create --name offline-builder --driver docker-container --use +else + docker buildx use offline-builder +fi + +echo "======================================" +echo "1. 加载基础镜像" +echo "======================================" + +if [ -f "$IMAGES_DIR/base-images.tar" ]; then + echo "从 $IMAGES_DIR/base-images.tar 加载基础镜像..." + docker load -i "$IMAGES_DIR/base-images.tar" + echo "✓ 基础镜像加载完成" +else + echo "警告: 基础镜像文件不存在,假设镜像已存在" +fi + +echo "" +echo "======================================" +echo "2. 
离线构建服务"
+echo "======================================"
+
+# Service table, "name:image:dockerfile" per entry (keep in sync with export-cache.sh).
+SERVICES=(
+    "backend:datamate-backend:scripts/images/backend/Dockerfile"
+    "backend-python:datamate-backend-python:scripts/images/backend-python/Dockerfile"
+    "database:datamate-database:scripts/images/database/Dockerfile"
+    "frontend:datamate-frontend:scripts/images/frontend/Dockerfile"
+    "gateway:datamate-gateway:scripts/images/gateway/Dockerfile"
+    "runtime:datamate-runtime:scripts/images/runtime/Dockerfile"
+    "deer-flow-backend:deer-flow-backend:scripts/images/deer-flow-backend/Dockerfile"
+    "deer-flow-frontend:deer-flow-frontend:scripts/images/deer-flow-frontend/Dockerfile"
+    "mineru:datamate-mineru:scripts/images/mineru/Dockerfile"
+)
+
+# When the resources directory exists, pass it to every build as RESOURCES_DIR (kept as an array so the path survives word splitting).
+MOUNT_ARGS=()
+if [ -d "$RESOURCES_DIR" ]; then
+    echo "检测到资源目录,将用于本地资源挂载"
+    MOUNT_ARGS=(--build-arg "RESOURCES_DIR=$RESOURCES_DIR")
+fi
+
+for service_config in "${SERVICES[@]}"; do
+    IFS=':' read -r service_name image_name dockerfile <<< "$service_config"
+    cache_file="$BUILDKIT_CACHE_DIR/$service_name-cache"
+
+    echo ""
+    echo "--------------------------------------"
+    echo "构建 [$service_name] -> $image_name:$VERSION"
+    echo "--------------------------------------"
+
+    if [ ! -d "$cache_file" ]; then
+        echo "警告: $service_name 的缓存不存在,跳过..."
+        continue
+    fi
+
+    # First attempt: build from the local cache with networking disabled.
+    # If that fails, retry once with networking allowed; a second failure aborts the script (set -e).
+    docker buildx build \
+        --cache-from "type=local,src=$cache_file" \
+        --network=none \
+        -f "$dockerfile" \
+        -t "$image_name:$VERSION" "${MOUNT_ARGS[@]}" \
+        --load \
+        . || {
+        echo "错误: $service_name 构建失败"
+        echo "尝试不使用 --network=none 重新构建..."
+        docker buildx build \
+            --cache-from "type=local,src=$cache_file" \
+            -f "$dockerfile" \
+            -t "$image_name:$VERSION" "${MOUNT_ARGS[@]}" \
+            --load \
+            .
+    }
+
+    echo "✓ $service_name 构建完成"
+done
+
+echo ""
+echo "======================================"
+echo "✓ 离线构建完成!" 
+echo "======================================" +echo "" +echo "构建的镜像列表:" +docker images | grep -E "(datamate-|deer-flow-)" || true diff --git a/scripts/offline/export-cache.sh b/scripts/offline/export-cache.sh new file mode 100644 index 0000000..49f6bf1 --- /dev/null +++ b/scripts/offline/export-cache.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# BuildKit 缓存导出脚本 - 在有网环境执行 +# Usage: ./export-cache.sh [output-dir] + +set -e + +OUTPUT_DIR="${1:-./build-cache}" +BUILDKIT_CACHE_DIR="$OUTPUT_DIR/buildkit" +IMAGES_DIR="$OUTPUT_DIR/images" +RESOURCES_DIR="$OUTPUT_DIR/resources" + +# 确保 buildx 构建器存在 +if ! docker buildx inspect offline-builder > /dev/null 2>&1; then + echo "创建 buildx 构建器..." + docker buildx create --name offline-builder --driver docker-container --use +else + docker buildx use offline-builder +fi + +mkdir -p "$BUILDKIT_CACHE_DIR" "$IMAGES_DIR" "$RESOURCES_DIR" + +echo "======================================" +echo "1. 导出基础镜像" +echo "======================================" + +BASE_IMAGES=( + "maven:3-eclipse-temurin-21" + "maven:3-eclipse-temurin-8" + "eclipse-temurin:21-jdk" + "mysql:8" + "node:20-alpine" + "nginx:1.29" + "ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm" + "ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm" + "ghcr.nju.edu.cn/astral-sh/uv:latest" + "python:3.12-slim" + "python:3.11-slim" + "gcr.io/distroless/nodejs20-debian12" +) + +for img in "${BASE_IMAGES[@]}"; do + echo "拉取: $img" + docker pull "$img" || echo "警告: $img 拉取失败,可能已存在" +done + +echo "" +echo "保存基础镜像到 $IMAGES_DIR/base-images.tar..." +docker save -o "$IMAGES_DIR/base-images.tar" "${BASE_IMAGES[@]}" +echo "✓ 基础镜像保存完成" + +echo "" +echo "======================================" +echo "2. 
导出 BuildKit 构建缓存"
+echo "======================================"
+
+# Service table, "name:image:dockerfile" per entry.
+SERVICES=(
+    "backend:datamate-backend:scripts/images/backend/Dockerfile"
+    "backend-python:datamate-backend-python:scripts/images/backend-python/Dockerfile"
+    "database:datamate-database:scripts/images/database/Dockerfile"
+    "frontend:datamate-frontend:scripts/images/frontend/Dockerfile"
+    "gateway:datamate-gateway:scripts/images/gateway/Dockerfile"
+    "runtime:datamate-runtime:scripts/images/runtime/Dockerfile"
+    "deer-flow-backend:deer-flow-backend:scripts/images/deer-flow-backend/Dockerfile"
+    "deer-flow-frontend:deer-flow-frontend:scripts/images/deer-flow-frontend/Dockerfile"
+    "mineru:datamate-mineru:scripts/images/mineru/Dockerfile"
+)
+
+for service_config in "${SERVICES[@]}"; do
+    IFS=':' read -r service_name image_name dockerfile <<< "$service_config"
+    cache_file="$BUILDKIT_CACHE_DIR/$service_name-cache"
+
+    echo ""
+    echo "导出 [$service_name] 缓存到 $cache_file..."
+
+    # Build once with --cache-to so the local cache directory gets populated; report success only when the build actually succeeded.
+    docker buildx build \
+        --cache-to "type=local,dest=$cache_file,mode=max" \
+        -f "$dockerfile" \
+        -t "$image_name:cache" \
+        . \
+        && echo "✓ $service_name 缓存导出完成" \
+        || echo "警告: $service_name 缓存导出失败"
+done
+
+echo ""
+echo "======================================"
+echo "3. 预下载外部资源"
+echo "======================================"
+
+# PaddleOCR model: download to "<file>.part" and rename on success, so a failed download never leaves a file that the -f guard would later treat as complete.
+mkdir -p "$RESOURCES_DIR/models"
+if [ ! -f "$RESOURCES_DIR/models/ch_ppocr_mobile_v2.0_cls_infer.tar" ]; then
+    echo "下载 PaddleOCR 模型..."
+    wget -O "$RESOURCES_DIR/models/ch_ppocr_mobile_v2.0_cls_infer.tar.part" "https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar" \
+        && mv -f "$RESOURCES_DIR/models/ch_ppocr_mobile_v2.0_cls_infer.tar.part" "$RESOURCES_DIR/models/ch_ppocr_mobile_v2.0_cls_infer.tar" || { rm -f "$RESOURCES_DIR/models/ch_ppocr_mobile_v2.0_cls_infer.tar.part"; echo "警告: PaddleOCR 模型下载失败"; }
+fi
+
+# spaCy model
+if [ ! -f "$RESOURCES_DIR/models/zh_core_web_sm-3.8.0-py3-none-any.whl" ]; then
+    echo "下载 spaCy 模型..." 
+ wget -O "$RESOURCES_DIR/models/zh_core_web_sm-3.8.0-py3-none-any.whl" \ + "https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl" || true +fi + +# DataX 源码 +if [ ! -d "$RESOURCES_DIR/DataX" ]; then + echo "克隆 DataX 源码..." + git clone --depth 1 "https://gitee.com/alibaba/DataX.git" "$RESOURCES_DIR/DataX" || true +fi + +# deer-flow 源码(用于 deer-flow 构建) +if [ ! -d "$RESOURCES_DIR/deer-flow" ]; then + echo "克隆 deer-flow 源码..." + git clone --depth 1 "https://ghproxy.net/https://github.com/ModelEngine-Group/deer-flow.git" "$RESOURCES_DIR/deer-flow" || true +fi + +echo "" +echo "======================================" +echo "4. 打包缓存" +echo "======================================" + +cd "$OUTPUT_DIR" +tar -czf "build-cache-$(date +%Y%m%d).tar.gz" buildkit images resources +cd - > /dev/null + +echo "" +echo "======================================" +echo "✓ 缓存导出完成!" +echo "======================================" +echo "缓存位置: $OUTPUT_DIR" +echo "传输文件: $OUTPUT_DIR/build-cache-$(date +%Y%m%d).tar.gz" +echo "" +echo "请将此压缩包传输到无网环境后解压使用"