You've already forked DataMate
feat: 支持npu自动扩缩容 (#197)
* feat: npu动态调度 * feat: 数据集分页优化 * feat: 支持npu自动扩缩容 * feat: 支持npu自动扩缩容 * feat: 支持npu自动扩缩容 * feat: clean code
This commit is contained in:
@@ -22,17 +22,6 @@ from core.base_op import Filter as RELATIVE_Filter, Mapper as RELATIVE_Mapper, S
|
||||
rd.DataContext.get_current().enable_progress_bars = False
|
||||
|
||||
|
||||
def is_valid_path(item, dataset_dir):
|
||||
full_path = os.path.abspath(os.path.join(dataset_dir, item))
|
||||
return os.path.exists(full_path)
|
||||
|
||||
|
||||
def new_get_num_npus(init_kwargs):
|
||||
if init_kwargs.get("accelerator", "cpu") != "npu":
|
||||
return 0.0
|
||||
return 0.1
|
||||
|
||||
|
||||
class Formatters(Enum):
|
||||
"""
|
||||
抽取算子和落盘算子枚举类
|
||||
@@ -163,22 +152,19 @@ class RayDataset(BasicDataset):
|
||||
return res
|
||||
|
||||
def _run_single_op(self, operators_cls, init_kwargs, **kwargs):
|
||||
|
||||
num_npus = new_get_num_npus(init_kwargs)
|
||||
max_actor_nums = os.getenv("MAX_ACTOR_NUMS", "20")
|
||||
|
||||
# 分辨是否是onnx算子,如果是需要限制Actor并发数量
|
||||
if self._use_onnx_model(init_kwargs['op_name']):
|
||||
max_actor_nums = 4
|
||||
|
||||
resources = {}
|
||||
|
||||
if num_npus > 0:
|
||||
resources["node_npu"] = 0.1
|
||||
if init_kwargs.get("npu", 0) > 0:
|
||||
resources["npu"] = init_kwargs.get("npu")
|
||||
|
||||
if init_kwargs.get("arch", "arm").startswith("x86"):
|
||||
resources["arch"] = "x86"
|
||||
|
||||
cpu = init_kwargs.get("cpu", 0.05)
|
||||
memory = init_kwargs.get("memory", None)
|
||||
|
||||
kwargs.update({"ext_params": {}, "failed_reason": {}, "target_type": None})
|
||||
try:
|
||||
if issubclass(operators_cls, (Mapper, RELATIVE_Mapper)):
|
||||
@@ -186,7 +172,8 @@ class RayDataset(BasicDataset):
|
||||
fn_constructor_kwargs=init_kwargs,
|
||||
fn_kwargs=kwargs,
|
||||
resources=resources,
|
||||
num_cpus=0.05,
|
||||
num_cpus=cpu,
|
||||
memory=memory,
|
||||
compute=rd.ActorPoolStrategy(min_size=1,
|
||||
max_size=int(max_actor_nums)))
|
||||
|
||||
@@ -195,7 +182,8 @@ class RayDataset(BasicDataset):
|
||||
fn_constructor_kwargs=init_kwargs,
|
||||
fn_kwargs=kwargs,
|
||||
resources=resources,
|
||||
num_cpus=0.05,
|
||||
num_cpus=cpu,
|
||||
memory=memory,
|
||||
compute=rd.ActorPoolStrategy(min_size=1,
|
||||
max_size=int(max_actor_nums)))
|
||||
|
||||
@@ -204,7 +192,8 @@ class RayDataset(BasicDataset):
|
||||
fn_constructor_kwargs=init_kwargs,
|
||||
fn_kwargs=kwargs,
|
||||
resources=resources,
|
||||
num_cpus=0.05,
|
||||
num_cpus=cpu,
|
||||
memory=memory,
|
||||
compute=rd.ActorPoolStrategy(min_size=1,
|
||||
max_size=int(max_actor_nums)))
|
||||
else:
|
||||
@@ -214,13 +203,3 @@ class RayDataset(BasicDataset):
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
raise Exception("Error! Ops Details:") from e
|
||||
|
||||
def _use_onnx_model(self, ops_name):
|
||||
if ops_name in self.onnx_ops_name:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _use_npu_model(self, ops_name):
|
||||
if ops_name in self.npu_ops_name:
|
||||
return True
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user