diff --git a/scripts/images/backend-python/Dockerfile b/scripts/images/backend-python/Dockerfile
index aa9ffc3..64ad02e 100644
--- a/scripts/images/backend-python/Dockerfile
+++ b/scripts/images/backend-python/Dockerfile
@@ -16,9 +16,18 @@
 RUN mkdir -p /root/.m2 && \
 \n\
 ' > /root/.m2/settings.xml
 
-RUN apt-get update && \
+# Cache the DataX checkout in a BuildKit cache mount (weak-network support); copy to ./DataX (relative) to match the original clone destination.
+RUN --mount=type=cache,target=/tmp/datax-git \
+    apt-get update && \
     apt-get install -y git && \
-    git clone https://gitee.com/alibaba/DataX.git
+    if [ -d /tmp/datax-git/.git ]; then \
+        echo "Using cached DataX repository"; \
+        git -C /tmp/datax-git pull || true; \
+    else \
+        echo "Cloning DataX repository..."; \
+        git clone https://gitee.com/alibaba/DataX.git /tmp/datax-git || exit 1; \
+    fi \
+    && cp -r /tmp/datax-git ./DataX
 
 COPY runtime/datax/ DataX/
@@ -76,8 +85,15 @@ COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock*
 RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
     poetry install --no-root --only main
 
-# Download NLTK data
-RUN python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/usr/local/nltk_data')"
+# Cache NLTK downloads for weak networks. Download into the cache mount, then
+# copy into the image in the same layer: a cache mount lives on the build host
+# only, so files left under its target are NOT persisted in the image.
+RUN --mount=type=cache,target=/tmp/nltk-cache \
+    (python -c "import nltk; nltk.download(['punkt_tab','averaged_perceptron_tagger_eng'], download_dir='/tmp/nltk-cache')" || \
+     { echo "Warning: Failed to download NLTK data, checking if cache exists..."; \
+       [ -d /tmp/nltk-cache/tokenizers/punkt_tab ] || exit 1; }) \
+    && mkdir -p /usr/local/nltk_data \
+    && cp -a /tmp/nltk-cache/. /usr/local/nltk_data/
 
 ENV NLTK_DATA=/usr/local/nltk_data
 # Copy the rest of the application
diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile
index 875a6fc..8d24747 100644
--- a/scripts/images/runtime/Dockerfile
+++ b/scripts/images/runtime/Dockerfile
@@ -11,10 +11,16 @@ RUN --mount=type=cache,target=/var/cache/apt \
 && apt update \
 && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig poppler-utils tesseract-ocr
 
-RUN mkdir -p /home/models \
-    && wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
-    && tar -xf ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models \
-    && rm -f ch_*.tar
+# Cache the PaddleOCR model tarball for weak networks; drop partial files on failed downloads so a broken file is never reused as "cached".
+RUN --mount=type=cache,target=/tmp/models \
+    mkdir -p /home/models \
+    && if [ ! -f /tmp/models/ch_ppocr_mobile_v2.0_cls_infer.tar ]; then \
+        echo "Downloading PaddleOCR model..."; \
+        wget -O /tmp/models/ch_ppocr_mobile_v2.0_cls_infer.tar https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar || { rm -f /tmp/models/ch_ppocr_mobile_v2.0_cls_infer.tar; exit 1; }; \
+    else \
+        echo "Using cached PaddleOCR model"; \
+    fi \
+    && tar -xf /tmp/models/ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models
 
 COPY runtime/python-executor /opt/runtime
 COPY runtime/ops /opt/runtime/datamate/ops
@@ -29,10 +35,18 @@ ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/"
 
 WORKDIR /opt/runtime
 
+# Cache the spaCy model wheel for weak networks; drop partial files on failed downloads so a broken wheel is never reused as "cached".
 RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/tmp/spacy-models \
     uv pip install -e .[all] --system \
     && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
-    && uv pip install https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl --system \
+    && if [ ! -f /tmp/spacy-models/zh_core_web_sm-3.8.0-py3-none-any.whl ]; then \
+        echo "Downloading spaCy model..."; \
+        wget -O /tmp/spacy-models/zh_core_web_sm-3.8.0-py3-none-any.whl https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl || { rm -f /tmp/spacy-models/zh_core_web_sm-3.8.0-py3-none-any.whl; exit 1; }; \
+    else \
+        echo "Using cached spaCy model"; \
+    fi \
+    && uv pip install /tmp/spacy-models/zh_core_web_sm-3.8.0-py3-none-any.whl --system \
     && echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth
 
 RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \