You've already forked DataMate
feat: add operator-packages-volume to docker-compose and update Dockerfile for site-packages path (#179)
* feat: add operator-packages-volume to docker-compose and update Dockerfile for site-packages path
* feat: add retry
This commit is contained in:
@@ -26,6 +26,7 @@ class MineruFormatter(Mapper):
|
||||
self.server_url = "http://datamate-mineru:8000"
|
||||
self.backend = "vlm-http-client"
|
||||
self.output_dir = "/dataset/outputs"
|
||||
self.max_retries = 3
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
@@ -51,16 +52,29 @@ class MineruFormatter(Mapper):
|
||||
content = ""
|
||||
for page in range(0, total_page, 10):
|
||||
logger.info(f"fileName: {filename}, total_page: {total_page}, page: {page}.")
|
||||
await aio_do_parse(
|
||||
output_dir=self.output_dir,
|
||||
pdf_file_names=[filename_without_ext],
|
||||
pdf_bytes_list=[pdf_bytes],
|
||||
p_lang_list=["ch"],
|
||||
backend=self.backend,
|
||||
server_url=self.server_url,
|
||||
start_page_id=page,
|
||||
end_page_id=min(page + 9, total_page - 1),
|
||||
)
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
await aio_do_parse(
|
||||
output_dir=self.output_dir,
|
||||
pdf_file_names=[filename_without_ext],
|
||||
pdf_bytes_list=[pdf_bytes],
|
||||
p_lang_list=["ch"],
|
||||
backend=self.backend,
|
||||
server_url=self.server_url,
|
||||
start_page_id=page,
|
||||
end_page_id=min(page + 9, total_page - 1),
|
||||
)
|
||||
break # 成功则跳出重试循环
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Extract {filename} [{page}-{page + 9}] failed (attempt {attempt + 1}/{self.max_retries}). "
|
||||
f"Error: {e}. Retrying in 5s..."
|
||||
)
|
||||
if attempt < self.max_retries - 1:
|
||||
await asyncio.sleep(5)
|
||||
else:
|
||||
logger.error(f"aio_do_parse failed after {self.max_retries} attempts.")
|
||||
raise # 耗尽次数后抛出异常,交给上层 execute 处理
|
||||
if os.path.exists(parse_dir):
|
||||
content += get_infer_result(".md", filename_without_ext, parse_dir)
|
||||
shutil.rmtree(parse_dir)
|
||||
|
||||
Reference in New Issue
Block a user