You've already forked DataMate
feat(docker): 优化 Dockerfile 支持弱网环境缓存
- 使用缓存挂载 DataX 源码，避免重复克隆，提高构建效率
- 添加 NLTK 数据缓存挂载并增加失败检查机制
- 实现 PaddleOCR 模型下载缓存，支持离线重用
- 集成 spaCy 模型缓存机制，提升安装稳定性
- 优化构建流程，适配弱网环境下的依赖下载
This commit is contained in:
@@ -16,9 +16,18 @@ RUN mkdir -p /root/.m2 && \
|
||||
</mirrors>\n\
|
||||
</settings>' > /root/.m2/settings.xml
|
||||
|
||||
RUN apt-get update && \
|
||||
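# Keep the DataX git checkout in a BuildKit cache mount (/tmp/datax-git) so that
# rebuilds on slow or unreliable networks reuse it instead of cloning again.
# A failed `git pull` is tolerated and the cached checkout is used as-is; the
# result is then copied to /DataX.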
|
||||
RUN --mount=type=cache,target=/tmp/datax-git \
|
||||
apt-get update && \
|
||||
apt-get install -y git && \
|
||||
git clone https://gitee.com/alibaba/DataX.git
|
||||
if [ -d /tmp/datax-git/.git ]; then \
|
||||
echo "Using cached DataX repository"; \
|
||||
cd /tmp/datax-git && git pull || true; \
|
||||
else \
|
||||
echo "Cloning DataX repository..."; \
|
||||
git clone https://gitee.com/alibaba/DataX.git /tmp/datax-git || exit 1; \
|
||||
fi \
|
||||
&& cp -r /tmp/datax-git /DataX
|
||||
|
||||
COPY runtime/datax/ DataX/
|
||||
|
||||
@@ -76,8 +85,11 @@ COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock*
|
||||
RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
|
||||
poetry install --no-root --only main
|
||||
|
||||
# Download NLTK data
|
||||
RUN python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/usr/local/nltk_data')"
|
||||
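# Download NLTK data through a BuildKit cache mount so a weak network can fall
# back to a previously cached copy; the build fails only if the download fails
# and the cached punkt_tab data is also missing.
# NOTE(review): the cache mount target is the same path as NLTK_DATA. Files
# written under a cache mount are normally kept in the build cache and not
# committed to the image layer — confirm the data is actually present in the
# final image, or copy it to a non-mounted path.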
|
||||
RUN --mount=type=cache,target=/usr/local/nltk_data \
|
||||
python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/usr/local/nltk_data')" || \
|
||||
(echo "Warning: Failed to download NLTK data, checking if cache exists..." && \
|
||||
[ -f /usr/local/nltk_data/tokenizers/punkt_tab/english/collocations.tab ] || exit 1)
|
||||
ENV NLTK_DATA=/usr/local/nltk_data
|
||||
|
||||
# Copy the rest of the application
|
||||
|
||||
@@ -11,10 +11,16 @@ RUN --mount=type=cache,target=/var/cache/apt \
|
||||
&& apt update \
|
||||
&& apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig poppler-utils tesseract-ocr
|
||||
|
||||
RUN mkdir -p /home/models \
|
||||
&& wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
|
||||
&& tar -xf ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models \
|
||||
&& rm -f ch_*.tar
|
||||
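# Download the PaddleOCR model archive into a BuildKit cache mount (/tmp/models)
# so it is fetched once and reused on later builds, then extract it into
# /home/models.
# NOTE(review): `wget -O` may leave an empty or partial file behind when the
# download fails; the `-f` check would then treat it as a valid cache on the
# next build — consider downloading to a temporary name and renaming on success.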
|
||||
RUN --mount=type=cache,target=/tmp/models \
|
||||
mkdir -p /home/models \
|
||||
&& if [ ! -f /tmp/models/ch_ppocr_mobile_v2.0_cls_infer.tar ]; then \
|
||||
echo "Downloading PaddleOCR model..."; \
|
||||
wget -O /tmp/models/ch_ppocr_mobile_v2.0_cls_infer.tar https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar || exit 1; \
|
||||
else \
|
||||
echo "Using cached PaddleOCR model"; \
|
||||
fi \
|
||||
&& tar -xf /tmp/models/ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models
|
||||
|
||||
COPY runtime/python-executor /opt/runtime
|
||||
COPY runtime/ops /opt/runtime/datamate/ops
|
||||
@@ -29,10 +35,18 @@ ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/"
|
||||
|
||||
WORKDIR /opt/runtime
|
||||
|
||||
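# Install the Python dependencies with uv, and keep the spaCy model wheel in a
# BuildKit cache mount (/tmp/spacy-models) so it is downloaded once and then
# installed from the cached file on later builds.
# NOTE(review): `wget -O` may leave an empty or partial wheel behind when the
# download fails; the `-f` check would then reuse it on the next build —
# consider downloading to a temporary name and renaming on success.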
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/tmp/spacy-models \
|
||||
uv pip install -e .[all] --system \
|
||||
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
|
||||
&& uv pip install https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl --system \
|
||||
&& if [ ! -f /tmp/spacy-models/zh_core_web_sm-3.8.0-py3-none-any.whl ]; then \
|
||||
echo "Downloading spaCy model..."; \
|
||||
wget -O /tmp/spacy-models/zh_core_web_sm-3.8.0-py3-none-any.whl https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl || exit 1; \
|
||||
else \
|
||||
echo "Using cached spaCy model"; \
|
||||
fi \
|
||||
&& uv pip install /tmp/spacy-models/zh_core_web_sm-3.8.0-py3-none-any.whl --system \
|
||||
&& echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth
|
||||
|
||||
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
|
||||
|
||||
Reference in New Issue
Block a user