feat(docker): 优化 Dockerfile 支持弱网环境缓存

- 使用缓存挂载 DataX 源码,避免重复克隆提高构建效率
- 添加 NLTK 数据缓存挂载并增加失败检查机制
- 实现 PaddleOCR 模型下载缓存,支持离线重用
- 集成 spaCy 模型缓存机制,提升安装稳定性
- 优化构建流程适配弱网环境下的依赖下载
This commit is contained in:
2026-01-31 14:31:47 +08:00
parent 2bc48fd465
commit 8fdc7d99b8
2 changed files with 35 additions and 9 deletions

View File

@@ -16,9 +16,18 @@ RUN mkdir -p /root/.m2 && \
</mirrors>\n\
</settings>' > /root/.m2/settings.xml
# Keep the DataX checkout in a BuildKit cache mount so rebuilds on weak networks can reuse it
# instead of cloning again. sharing=locked makes concurrent builds take turns, so two builds
# never run git inside the same cache directory at once.
# The cache is trusted only when HEAD resolves; a partial clone left by an interrupted build
# is wiped and cloned again. A failed pull is tolerated on purpose (offline reuse).
# The checkout is copied out of the mount because cache mount contents are not stored in the image.
# NOTE(review): the copy goes to the absolute path /DataX; confirm this matches the
# WORKDIR-relative "DataX/" destination of the COPY that follows.
RUN --mount=type=cache,target=/tmp/datax-git,sharing=locked \
    apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates git && \
    rm -rf /var/lib/apt/lists/* && \
    if git -C /tmp/datax-git rev-parse --verify -q HEAD > /dev/null 2>&1; then \
        echo "Using cached DataX repository"; \
        git -C /tmp/datax-git pull --ff-only || \
            echo "Warning: git pull failed, using cached DataX repository as-is"; \
    else \
        echo "Cloning DataX repository..."; \
        find /tmp/datax-git -mindepth 1 -delete && \
        git clone https://gitee.com/alibaba/DataX.git /tmp/datax-git || exit 1; \
    fi && \
    mkdir -p /DataX && \
    cp -a /tmp/datax-git/. /DataX/
# Copy the build context's runtime/datax/ directory into DataX/ (resolved relative to the current WORKDIR).
# NOTE(review): presumably this overlays project files onto the DataX checkout — confirm WORKDIR makes DataX/ the same directory.
COPY runtime/datax/ DataX/
@@ -76,8 +85,11 @@ COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock*
# Install only the "main" dependency group, without installing the project package itself (--no-root).
# The cache mount at $POETRY_CACHE_DIR (presumably Poetry's cache directory, set earlier in the file — confirm)
# keeps downloaded packages out of the image layer and reusable across builds.
RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
poetry install --no-root --only main
# Download NLTK data through a BuildKit cache mount so weak-network rebuilds can reuse it.
# Cache mount contents are NOT stored in the image, so the data is downloaded into a separate
# cache directory and then copied to /usr/local/nltk_data, where it persists in the layer.
# nltk.download() returns False on failure instead of raising, so its result is turned into
# the process exit status. After a failed download the build continues only if both packages
# are already present in the cache.
RUN --mount=type=cache,target=/var/cache/nltk_data \
    python -c "import sys, nltk; sys.exit(0 if nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/var/cache/nltk_data') else 1)" \
        || echo "Warning: Failed to download NLTK data, checking if cache exists..."; \
    [ -f /var/cache/nltk_data/tokenizers/punkt_tab/english/collocations.tab ] \
        && [ -d /var/cache/nltk_data/taggers/averaged_perceptron_tagger_eng ] \
        && mkdir -p /usr/local/nltk_data \
        && cp -a /var/cache/nltk_data/. /usr/local/nltk_data/
# Tell NLTK to look for its data under /usr/local/nltk_data at runtime.
ENV NLTK_DATA=/usr/local/nltk_data
# Copy the rest of the application