feat: 支持npu自动扩缩容 (#197)

* feat: npu动态调度

* feat: 数据集分页优化

* feat: 支持npu自动扩缩容

* feat: 支持npu自动扩缩容

* feat: 支持npu自动扩缩容

* feat: clean code
This commit is contained in:
hhhhsc701
2025-12-24 18:03:30 +08:00
committed by GitHub
parent de7f853c83
commit 1c507ac98a
6 changed files with 239 additions and 91 deletions

View File

@@ -22,17 +22,6 @@ from core.base_op import Filter as RELATIVE_Filter, Mapper as RELATIVE_Mapper, S
rd.DataContext.get_current().enable_progress_bars = False
def is_valid_path(item, dataset_dir):
full_path = os.path.abspath(os.path.join(dataset_dir, item))
return os.path.exists(full_path)
def new_get_num_npus(init_kwargs):
if init_kwargs.get("accelerator", "cpu") != "npu":
return 0.0
return 0.1
class Formatters(Enum):
"""
抽取算子和落盘算子枚举类
@@ -163,22 +152,19 @@ class RayDataset(BasicDataset):
return res
def _run_single_op(self, operators_cls, init_kwargs, **kwargs):
num_npus = new_get_num_npus(init_kwargs)
max_actor_nums = os.getenv("MAX_ACTOR_NUMS", "20")
# 分辨是否是onnx算子,如果是需要限制Actor并发数量
if self._use_onnx_model(init_kwargs['op_name']):
max_actor_nums = 4
resources = {}
if num_npus > 0:
resources["node_npu"] = 0.1
if init_kwargs.get("npu", 0) > 0:
resources["npu"] = init_kwargs.get("npu")
if init_kwargs.get("arch", "arm").startswith("x86"):
resources["arch"] = "x86"
cpu = init_kwargs.get("cpu", 0.05)
memory = init_kwargs.get("memory", None)
kwargs.update({"ext_params": {}, "failed_reason": {}, "target_type": None})
try:
if issubclass(operators_cls, (Mapper, RELATIVE_Mapper)):
@@ -186,7 +172,8 @@ class RayDataset(BasicDataset):
fn_constructor_kwargs=init_kwargs,
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
num_cpus=cpu,
memory=memory,
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
@@ -195,7 +182,8 @@ class RayDataset(BasicDataset):
fn_constructor_kwargs=init_kwargs,
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
num_cpus=cpu,
memory=memory,
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
@@ -204,7 +192,8 @@ class RayDataset(BasicDataset):
fn_constructor_kwargs=init_kwargs,
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
num_cpus=cpu,
memory=memory,
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
else:
@@ -214,13 +203,3 @@ class RayDataset(BasicDataset):
except Exception as e:
logger.error(e)
raise Exception("Error! Ops Details:") from e
def _use_onnx_model(self, ops_name):
if ops_name in self.onnx_ops_name:
return True
return False
def _use_npu_model(self, ops_name):
if ops_name in self.npu_ops_name:
return True
return False