feat: 支持npu自动扩缩容 (#197)

* feat: npu动态调度

* feat: 数据集分页优化

* feat: 支持npu自动扩缩容

* feat: 支持npu自动扩缩容

* feat: 支持npu自动扩缩容

* feat: clean code
This commit is contained in:
hhhhsc701
2025-12-24 18:03:30 +08:00
committed by GitHub
parent de7f853c83
commit 1c507ac98a
6 changed files with 239 additions and 91 deletions

View File

@@ -126,7 +126,7 @@ worker:
groupName: workergroup
replicas: 1
minReplicas: 1
maxReplicas: 3
maxReplicas: 1
labels: {}
serviceAccountName: ""
restartPolicy: ""

View File

@@ -189,8 +189,20 @@ runtime:
ray-cluster:
enabled: true
head:
enableInTreeAutoscaling: true
autoscalerOptions:
upscalingMode: Default
idleTimeoutSeconds: 60
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: "500m"
memory: "512Mi"
requests:
cpu: "500m"
memory: "512Mi"
rayStartParams:
num-cpus: '0'
num-cpus: "0"
containerEnv:
- name: RAY_DEDUP_LOGS
value: "0"
@@ -206,6 +218,8 @@ ray-cluster:
value: *dbPass
- name: MYSQL_DATABASE
value: "datamate"
- name: RAY_enable_autoscaler_v2
value: "1"
resources:
limits:
cpu: "4"
@@ -283,3 +297,58 @@ ray-cluster:
- mountPath: /usr/local/lib/ops/site-packages
name: operator-volume
subPath: site-packages
# Worker groups keyed by group name; this section defines a single group, npuGroup.
additionalWorkerGroups:
# The resource entries further down request huawei.com/Ascend910 devices for this group.
npuGroup:
disabled: false
# The group starts with zero workers; replica bounds are 0 (min) to 8 (max).
replicas: 0
minReplicas: 0
maxReplicas: 8
rayStartParams:
# Single-quoted YAML scalar, so the inner double quotes and backslashes are kept
# literally: the value is a quoted, escaped JSON object mapping "npu" to 1.
# NOTE(review): presumably passed through to `ray start --resources` to advertise
# one custom "npu" resource per worker — confirm against the chart templates.
resources: '"{\"npu\": 1}"'
containerEnv:
# Both RAY_* switches below are set to the string "0".
- name: RAY_DEDUP_LOGS
value: "0"
- name: RAY_TQDM_PATCH_PRINT
value: "0"
# MySQL connection settings handed to the worker container.
- name: MYSQL_HOST
value: "datamate-database"
- name: MYSQL_PORT
value: "3306"
# NOTE(review): connects as root — confirm a least-privileged account is not expected here.
- name: MYSQL_USER
value: "root"
# Alias to the dbPass anchor, which is defined outside this view.
- name: MYSQL_PASSWORD
value: *dbPass
- name: MYSQL_DATABASE
value: "datamate"
# POD_NAME is filled from the pod's own metadata.name; it is expanded in the
# log-volume subPathExpr below.
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
# cpu and memory requests (1 / 2G) are lower than their limits (8 / 64G);
# the Ascend910 count is 1 in both limits and requests.
# NOTE(review): memory uses the decimal suffix "G" here, while the head resources
# in this same diff use "Mi" — confirm which unit family is intended.
resources:
limits:
cpu: "8"
memory: "64G"
huawei.com/Ascend910: 1
requests:
cpu: "1"
memory: "2G"
huawei.com/Ascend910: 1
# All four volumes are aliases to anchors defined outside this view.
volumes:
- *datasetVolume
- *flowVolume
- *logVolume
- *operatorVolume
volumeMounts:
# /tmp/ray is mounted from a per-pod sub-directory, ray/<POD_NAME>, of log-volume.
- mountPath: /tmp/ray
name: log-volume
subPathExpr: ray/$(POD_NAME)
- mountPath: /dataset
name: dataset-volume
- mountPath: /flow
name: flow-volume
# operator-volume is mounted twice, using two different sub paths
# (extract and site-packages).
- mountPath: /opt/runtime/datamate/ops/user
name: operator-volume
subPath: extract
- mountPath: /usr/local/lib/ops/site-packages
name: operator-volume
subPath: site-packages