feat: 支持运行data-juicer算子 (#215)

* feature: 增加data-juicer算子

* feat: 支持运行data-juicer算子

* feat: 支持data-juicer任务下发

* feat: 支持data-juicer结果数据集归档

* feat: 支持data-juicer结果数据集归档
This commit is contained in:
hhhhsc701
2025-12-31 09:20:41 +08:00
committed by GitHub
parent 63f4e3e447
commit 6a1eb85e8e
26 changed files with 709 additions and 120 deletions

View File

@@ -3,21 +3,18 @@
import base64
import json
import time
from typing import Dict
import ray
import yaml
from jsonargparse import dict_to_namespace, ArgumentParser
from jsonargparse import ArgumentParser
from loguru import logger
from datamate.common.utils import check_valid_path
from datamate.core.dataset import RayDataset
from datamate.sql_manager.persistence_atction import TaskInfoPersistence
from datamate.wrappers.executor import RayExecutor
import datamate.ops
class RayExecutor:
class DataMateExecutor(RayExecutor):
"""
基于Ray的执行器.
@@ -25,38 +22,8 @@ class RayExecutor:
2. 当前仅加载json文件类型的数据集。
"""
def __init__(self, cfg=None, meta=None):
    """Build the executor from a config mapping and start Ray.

    :param cfg: pipeline configuration; must be a dict. It is converted to a
        namespace, but the raw ``process`` list is kept as-is on the namespace.
    :param meta: optional task metadata, stored untouched.
    :raises TypeError: when ``cfg`` is not a dict.
    """
    # Guard clause: reject anything that is not a mapping before converting.
    if not isinstance(cfg, Dict):
        logger.error(f"Please set param: cfg as type Dict, but given cfg as type {type(cfg).__name__}")
        raise TypeError(f"To params cfg, Dict type is required, but type {type(cfg).__name__} is given!")
    self.cfg = dict_to_namespace(cfg)
    # Keep the original process list (dict_to_namespace may wrap it).
    self.cfg.process = cfg['process']
    self.meta = meta
    # init ray
    logger.info('Initing Ray ...')
    ray.init()
def load_meta(self, line):
meta = json.loads(line)
if meta.get("fileId"):
meta["sourceFileId"] = meta.get("fileId")
if meta.get("fileName"):
meta["sourceFileName"] = meta.get("fileName")
if meta.get("fileType"):
meta["sourceFileType"] = meta.get("fileType")
if meta.get("fileSize"):
meta["sourceFileSize"] = meta.get("fileSize")
if not meta.get("totalPageNum"):
meta["totalPageNum"] = 0
if not meta.get("extraFilePath"):
meta["extraFilePath"] = None
if not meta.get("extraFileType"):
meta["extraFileType"] = None
meta["dataset_id"] = self.cfg.dataset_id
return meta
def __init__(self, cfg = None, meta = None):
    # Pure delegation: the parent RayExecutor (imported from
    # datamate.wrappers.executor) owns all construction logic.
    super().__init__(cfg, meta)
def run(self):
# 1. 加载数据集
@@ -77,36 +44,13 @@ class RayExecutor:
tend = time.time()
logger.info(f'All Ops are done in {tend - tstart:.3f}s.')
dataset.data.materialize()
def load_dataset(self):
    """Load the JSONL metadata file at ``self.cfg.dataset_path`` into a Ray dataset.

    Waits for the path to become valid, retrying up to 5 times with a
    linearly increasing back-off (1 s, 2 s, ... 5 s) — the file may be
    written asynchronously by an upstream step.

    :return: a ``ray.data.Dataset`` with one normalized record per line
        (see ``load_meta``).
    :raises RuntimeError: when the path never becomes valid.
    """
    jsonl_file_path = self.cfg.dataset_path
    retry = 0
    while not check_valid_path(jsonl_file_path):
        if retry >= 5:
            # Bug fix: the original logged a placeholder-less f-string that
            # omitted the path; include it so the failure is diagnosable.
            logger.error(f"can not load dataset from dataset_path: {jsonl_file_path}")
            raise RuntimeError(f"Load dataset Failed!, dataset_path: {self.cfg.dataset_path}.")
        retry += 1
        time.sleep(retry)  # linear back-off: 1..5 seconds
    with open(jsonl_file_path, "r", encoding='utf-8') as fp:
        lines = fp.readlines()
    # One record per JSONL line, normalized by load_meta.
    return ray.data.from_items([self.load_meta(line) for line in lines])
def update_db(self, status):
    """Write the task's result status back to the database.

    :param status: result status to record for this dataset/instance pair.
    """
    TaskInfoPersistence().update_result(
        self.cfg.dataset_id, self.cfg.instance_id, status
    )
for _ in dataset.data.iter_batches():
pass
if __name__ == '__main__':
parser = ArgumentParser(description="Create API for Submitting Job to Data-juicer")
parser = ArgumentParser(description="Create API for Submitting Job to ray")
parser.add_argument("--config_path", type=str, required=False, default="../configs/demo.yaml")
parser.add_argument("--flow_config", type=str, required=False, default=None)
@@ -119,10 +63,10 @@ if __name__ == '__main__':
if flow_config:
m_cfg = yaml.safe_load(base64.b64decode(flow_config))
else:
with open(config_path, "r", encoding='utf-8') as cfg:
m_cfg = yaml.safe_load(cfg)
with open(config_path, "r", encoding='utf-8') as f:
m_cfg = yaml.safe_load(f)
executor = RayExecutor(m_cfg)
executor = DataMateExecutor(m_cfg)
try:
executor.run()
except Exception as e: