# DataMate backend image (multi-stage: DataX builder + Python 3.12 runtime).
#
# Change notes:
# - Cache-mount the DataX sources to avoid repeated clones and speed up builds
# - Cache-mount NLTK data and add a download-failure check
# - Cache PaddleOCR model downloads for offline reuse
# - Cache spaCy models to make installation more stable
# - Tune the build flow for dependency downloads on weak networks
FROM maven:3-eclipse-temurin-8 AS datax-builder

# Configure Maven to use the Aliyun mirror (faster and more reliable from
# CN networks). printf is used instead of echo because echo's handling of
# backslash escapes such as \n is shell-dependent (dash vs bash); printf's
# behavior is specified by POSIX, so the generated XML is identical on any
# /bin/sh.
RUN mkdir -p /root/.m2 && \
    printf '%s\n' \
        '<?xml version="1.0" encoding="UTF-8"?>' \
        '<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"' \
        '          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' \
        '          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">' \
        '  <mirrors>' \
        '    <mirror>' \
        '      <id>aliyunmaven</id>' \
        '      <mirrorOf>*</mirrorOf>' \
        '      <name>阿里云公共仓库</name>' \
        '      <url>https://maven.aliyun.com/repository/public</url>' \
        '    </mirror>' \
        '  </mirrors>' \
        '</settings>' > /root/.m2/settings.xml
# Fetch the DataX sources through a BuildKit cache mount so a previously
# cloned repository is reused across builds (helps on slow or flaky
# networks). The cache mount itself never becomes part of the image, so the
# checkout is copied out to /DataX at the end of the layer.
# Fixes: --no-install-recommends and apt list cleanup now happen in the
# same layer as apt-get update/install (avoids bloat and stale-cache bugs);
# ca-certificates is installed explicitly so the https clone cannot fail on
# a minimal base.
RUN --mount=type=cache,target=/tmp/datax-git \
    apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates git && \
    rm -rf /var/lib/apt/lists/* && \
    if [ -d /tmp/datax-git/.git ]; then \
        echo "Using cached DataX repository"; \
        cd /tmp/datax-git && git pull || true; \
    else \
        echo "Cloning DataX repository..."; \
        git clone https://gitee.com/alibaba/DataX.git /tmp/datax-git || exit 1; \
    fi && \
    cp -r /tmp/datax-git /DataX
# Overlay local plugin customizations on top of the upstream checkout.
# WORKDIR replaces the previous `RUN cd DataX && …` (hadolint DL3003) and
# keeps the COPY destination explicit; the resulting paths are identical
# (/DataX, since the stage's previous working directory was /).
WORKDIR /DataX
COPY runtime/datax/ ./

# Switch the RDBMS plugin to the modern MySQL Connector/J driver class,
# then build the DataX distribution. Tests are skipped: this stage only
# produces the /DataX/target/datax artifact consumed by the runtime stage.
RUN sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true
FROM python:3.12-slim

# Runtime stage. Build-cache optimization below relies on BuildKit cache
# mounts; you must build with BuildKit enabled:
#   DOCKER_BUILDKIT=1 docker build . -f scripts/images/datamate-python/Dockerfile -t datamate-backend-python
# NOTE(review): an earlier comment here said "Single-stage image", but this
# file is multi-stage (DataX is compiled in the datax-builder stage above).

# Point apt at the Aliyun mirror. Debian 12 ships the deb822-format
# /etc/apt/sources.list.d/debian.sources; older layouts use
# /etc/apt/sources.list, hence the two branches. update + install + list
# cleanup are kept in one layer so the package index never goes stale.
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && \
    apt-get install -y --no-install-recommends vim openjdk-21-jre nfs-common glusterfs-client rsync && \
    rm -rf /var/lib/apt/lists/*
# Python/Poetry runtime configuration. POETRY_VIRTUALENVS_CREATE=false makes
# Poetry install into the system interpreter (appropriate inside a
# container); POETRY_CACHE_DIR matches the cache-mount target used by the
# dependency-install layer below.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_VERSION=2.2.1 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false \
    POETRY_CACHE_DIR=/tmp/poetry_cache

# On Debian, openjdk-21-jre installs under an arch-suffixed directory
# (e.g. /usr/lib/jvm/java-21-openjdk-amd64), so the bare path previously
# assigned to JAVA_HOME did not exist. Keep the stable, arch-independent
# path for JAVA_HOME and create it as a symlink to the real JVM directory
# when it is missing (the glob matches exactly one directory per arch).
ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk
RUN [ -d "$JAVA_HOME" ] || ln -s /usr/lib/jvm/java-21-openjdk-* "$JAVA_HOME"

# /root/.local/bin is where pipx places the poetry shim.
ENV PATH="/root/.local/bin:$JAVA_HOME/bin:$PATH"

WORKDIR /app
# Point pip at the Aliyun PyPI mirror, then install Poetry in an isolated
# environment via pipx (so Poetry's own dependencies never collide with the
# application's). The pip wheel cache lives in a BuildKit cache mount, so
# repeated builds skip re-downloading packages without baking the cache
# into a layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \
    pip config set global.trusted-host mirrors.aliyun.com && \
    pip install --upgrade --root-user-action=ignore pip && \
    pip install --root-user-action=ignore pipx && \
    pipx install "poetry==$POETRY_VERSION"
# Bring the built DataX distribution from the builder stage into the
# runtime image.
COPY --from=datax-builder /DataX/target/datax/datax /opt/datax

# Copy the MySQL JDBC jars bundled with the mysqlreader plugin into the
# starrocksreader plugin's lib directory — presumably because the StarRocks
# reader needs a MySQL-protocol driver it does not ship with; TODO confirm
# against the plugin's plugin.json driver class.
RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/
# Copy only the dependency manifests first, so the expensive dependency
# layer below stays cached until pyproject.toml / poetry.lock actually
# change (the application source is copied later).
COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock* /app/

# Install dependencies with Poetry using a BuildKit cache mount.
#   --no-root:    don't install the project itself yet (only dependencies)
#   --only main:  skip dev dependencies
# The mount target is written out literally (it equals POETRY_CACHE_DIR)
# because environment-variable expansion inside --mount options is only
# guaranteed by newer Dockerfile frontends, and this file carries no
# "# syntax=" directive to pin one.
RUN --mount=type=cache,target=/tmp/poetry_cache \
    poetry install --no-root --only main
# Download NLTK data through a cache mount, then copy it into the image.
# Bug fixed here: the previous version mounted the cache directly at
# /usr/local/nltk_data, but cache mounts exist only at build time and are
# never committed to a layer — the final image's /usr/local/nltk_data was
# EMPTY at runtime. Downloading into /tmp/nltk_cache (cached across builds
# for weak networks) and cp-ing into the layer persists the data. If the
# download fails but a previous build's cache is usable, the build
# continues; otherwise it fails fast.
RUN --mount=type=cache,target=/tmp/nltk_cache \
    ( python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/tmp/nltk_cache')" || \
      ( echo "Warning: Failed to download NLTK data, checking if cache exists..." && \
        [ -f /tmp/nltk_cache/tokenizers/punkt_tab/english/collocations.tab ] ) ) && \
    mkdir -p /usr/local/nltk_data && \
    cp -r /tmp/nltk_cache/. /usr/local/nltk_data/
# nltk consults NLTK_DATA at runtime to locate the data copied above.
ENV NLTK_DATA=/usr/local/nltk_data
# Copy the application source last, so all dependency layers above stay
# cached across source-only changes.
COPY runtime/datamate-python /app

# COPY --chmod sets the execute bit in the same instruction, replacing the
# previous `RUN chmod +x … || true`, whose `|| true` silently swallowed a
# chmod failure and would have surfaced only as a container start failure.
# --chmod requires BuildKit, which this file already mandates for its cache
# mounts.
COPY --chmod=755 runtime/datamate-python/deploy/docker-entrypoint.sh /docker-entrypoint.sh

# Documentation only — the port is actually published at `docker run` time.
EXPOSE 18000

# NOTE(review): the container runs as root (no USER directive). The
# nfs-common/glusterfs tooling may genuinely require it; if not, add a
# non-root USER before this line.
ENTRYPOINT ["/docker-entrypoint.sh"]