You've already forked DataMate
feat: add operator-packages-volume to docker-compose and update Dockerfile for site-packages path (#179)
* feat: add operator-packages-volume to docker-compose and update Dockerfile for site-packages path * feat: add retry
This commit is contained in:
@@ -90,6 +90,7 @@ services:
|
|||||||
- dataset_volume:/dataset
|
- dataset_volume:/dataset
|
||||||
- flow_volume:/flow
|
- flow_volume:/flow
|
||||||
- operator-runtime-volume:/opt/runtime/datamate/ops/user
|
- operator-runtime-volume:/opt/runtime/datamate/ops/user
|
||||||
|
- operator-packages-volume:/usr/local/lib/ops/site-packages
|
||||||
networks: [ datamate ]
|
networks: [ datamate ]
|
||||||
|
|
||||||
# 4) mineru
|
# 4) mineru
|
||||||
@@ -150,6 +151,8 @@ volumes:
|
|||||||
name: datamate-operator-upload-volume
|
name: datamate-operator-upload-volume
|
||||||
operator-runtime-volume:
|
operator-runtime-volume:
|
||||||
name: datamate-operator-runtime-volume
|
name: datamate-operator-runtime-volume
|
||||||
|
operator-packages-volume:
|
||||||
|
name: datamate-operator-packages-volume
|
||||||
mineru_log_volume:
|
mineru_log_volume:
|
||||||
name: datamate-mineru_log_volume
|
name: datamate-mineru_log_volume
|
||||||
|
|
||||||
|
|||||||
@@ -170,6 +170,9 @@ runtime:
|
|||||||
- mountPath: /opt/runtime/datamate/ops/user
|
- mountPath: /opt/runtime/datamate/ops/user
|
||||||
name: operator-volume
|
name: operator-volume
|
||||||
subPath: extract
|
subPath: extract
|
||||||
|
- mountPath: /usr/local/lib/ops/site-packages
|
||||||
|
name: operator-volume
|
||||||
|
subPath: site-packages
|
||||||
|
|
||||||
ray-cluster:
|
ray-cluster:
|
||||||
enabled: true
|
enabled: true
|
||||||
@@ -214,6 +217,9 @@ ray-cluster:
|
|||||||
- mountPath: /opt/runtime/datamate/ops/user
|
- mountPath: /opt/runtime/datamate/ops/user
|
||||||
name: operator-volume
|
name: operator-volume
|
||||||
subPath: extract
|
subPath: extract
|
||||||
|
- mountPath: /usr/local/lib/ops/site-packages
|
||||||
|
name: operator-volume
|
||||||
|
subPath: site-packages
|
||||||
sidecarContainers:
|
sidecarContainers:
|
||||||
- name: runtime
|
- name: runtime
|
||||||
image: datamate-runtime
|
image: datamate-runtime
|
||||||
@@ -262,3 +268,6 @@ ray-cluster:
|
|||||||
- mountPath: /opt/runtime/datamate/ops/user
|
- mountPath: /opt/runtime/datamate/ops/user
|
||||||
name: operator-volume
|
name: operator-volume
|
||||||
subPath: extract
|
subPath: extract
|
||||||
|
- mountPath: /usr/local/lib/ops/site-packages
|
||||||
|
name: operator-volume
|
||||||
|
subPath: site-packages
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ class MineruFormatter(Mapper):
|
|||||||
self.server_url = "http://datamate-mineru:8000"
|
self.server_url = "http://datamate-mineru:8000"
|
||||||
self.backend = "vlm-http-client"
|
self.backend = "vlm-http-client"
|
||||||
self.output_dir = "/dataset/outputs"
|
self.output_dir = "/dataset/outputs"
|
||||||
|
self.max_retries = 3
|
||||||
|
|
||||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
start = time.time()
|
start = time.time()
|
||||||
@@ -51,16 +52,29 @@ class MineruFormatter(Mapper):
|
|||||||
content = ""
|
content = ""
|
||||||
for page in range(0, total_page, 10):
|
for page in range(0, total_page, 10):
|
||||||
logger.info(f"fileName: {filename}, total_page: {total_page}, page: {page}.")
|
logger.info(f"fileName: {filename}, total_page: {total_page}, page: {page}.")
|
||||||
await aio_do_parse(
|
for attempt in range(self.max_retries):
|
||||||
output_dir=self.output_dir,
|
try:
|
||||||
pdf_file_names=[filename_without_ext],
|
await aio_do_parse(
|
||||||
pdf_bytes_list=[pdf_bytes],
|
output_dir=self.output_dir,
|
||||||
p_lang_list=["ch"],
|
pdf_file_names=[filename_without_ext],
|
||||||
backend=self.backend,
|
pdf_bytes_list=[pdf_bytes],
|
||||||
server_url=self.server_url,
|
p_lang_list=["ch"],
|
||||||
start_page_id=page,
|
backend=self.backend,
|
||||||
end_page_id=min(page + 9, total_page - 1),
|
server_url=self.server_url,
|
||||||
)
|
start_page_id=page,
|
||||||
|
end_page_id=min(page + 9, total_page - 1),
|
||||||
|
)
|
||||||
|
break # 成功则跳出重试循环
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Extract {filename} [{page}-{page + 9}] failed (attempt {attempt + 1}/{self.max_retries}). "
|
||||||
|
f"Error: {e}. Retrying in 5s..."
|
||||||
|
)
|
||||||
|
if attempt < self.max_retries - 1:
|
||||||
|
await asyncio.sleep(5)
|
||||||
|
else:
|
||||||
|
logger.error(f"aio_do_parse failed after {self.max_retries} attempts.")
|
||||||
|
raise # 耗尽次数后抛出异常,交给上层 execute 处理
|
||||||
if os.path.exists(parse_dir):
|
if os.path.exists(parse_dir):
|
||||||
content += get_infer_result(".md", filename_without_ext, parse_dir)
|
content += get_infer_result(".md", filename_without_ext, parse_dir)
|
||||||
shutil.rmtree(parse_dir)
|
shutil.rmtree(parse_dir)
|
||||||
|
|||||||
@@ -23,7 +23,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -e . --system --index-strategy unsafe-best-match \
|
UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -e . --system --index-strategy unsafe-best-match \
|
||||||
&& UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
|
&& UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
|
||||||
&& uv pip uninstall torch torchvision --system \
|
&& uv pip uninstall torch torchvision --system \
|
||||||
&& python -m spacy download zh_core_web_sm
|
&& python -m spacy download zh_core_web_sm \
|
||||||
|
&& echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth
|
||||||
|
|
||||||
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
|
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
|
||||||
&& chmod +x /opt/runtime/start.sh \
|
&& chmod +x /opt/runtime/start.sh \
|
||||||
|
|||||||
Reference in New Issue
Block a user