feat: support NPU autoscaling (#197)

* feat: NPU dynamic scheduling

* feat: dataset pagination optimization

* feat: support NPU autoscaling

* feat: clean code
Author: hhhhsc701
Date: 2025-12-24 18:03:30 +08:00
Committed by: GitHub
Parent: de7f853c83
Commit: 1c507ac98a
6 changed files with 239 additions and 91 deletions

View File

@@ -181,7 +181,7 @@ build-%: %-docker-build
@:
.PHONY: build
build: database-docker-build backend-docker-build frontend-docker-build runtime-docker-build backend-python-docker-build
build: database-docker-build gateway-docker-build backend-docker-build frontend-docker-build runtime-docker-build backend-python-docker-build
# ========== Utility Targets ==========

View File

@@ -20,12 +20,19 @@ import com.datamate.datamanagement.common.enums.DatasetType;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
import com.datamate.operator.domain.repository.OperatorRepository;
import com.datamate.operator.infrastructure.exception.OperatorErrorCode;
import com.datamate.operator.interfaces.dto.OperatorDto;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.PropertyNamingStrategies;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.yaml.snakeyaml.DumperOptions;
@@ -39,6 +46,8 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@@ -52,6 +61,8 @@ public class CleaningTaskService {
private final OperatorInstanceRepository operatorInstanceRepo;
private final OperatorRepository operatorRepo;
private final CleaningResultRepository cleaningResultRepo;
private final CleaningTaskScheduler taskScheduler;
@@ -66,11 +77,16 @@ public class CleaningTaskService {
private final String FLOW_PATH = "/flow";
private final Pattern LEVEL_PATTERN = Pattern.compile(
"\\b(TRACE|DEBUG|INFO|WARN|WARNING|ERROR|FATAL)\\b",
Pattern.CASE_INSENSITIVE
private static final Pattern STANDARD_LEVEL_PATTERN = Pattern.compile(
"\\b(DEBUG|Debug|INFO|Info|WARN|Warn|WARNING|Warning|ERROR|Error|FATAL|Fatal)\\b"
);
private static final Pattern EXCEPTION_SUFFIX_PATTERN = Pattern.compile(
"\\b\\w+(Warning|Error|Exception)\\b"
);
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public List<CleaningTaskDto> getTasks(String status, String keywords, Integer page, Integer size) {
List<CleaningTaskDto> tasks = cleaningTaskRepo.findTasks(status, keywords, page, size);
tasks.forEach(this::setProcess);
@@ -133,6 +149,7 @@ public class CleaningTaskService {
}
public List<CleaningTaskLog> getTaskLog(String taskId) {
cleanTaskValidator.checkTaskId(taskId);
String logPath = FLOW_PATH + "/" + taskId + "/output.log";
try (Stream<String> lines = Files.lines(Paths.get(logPath))) {
List<CleaningTaskLog> logs = new ArrayList<>();
@@ -156,18 +173,31 @@ public class CleaningTaskService {
return defaultLevel;
}
Matcher matcher = LEVEL_PATTERN.matcher(logLine);
if (matcher.find()) {
return matcher.group(1).toUpperCase();
Matcher stdMatcher = STANDARD_LEVEL_PATTERN.matcher(logLine);
if (stdMatcher.find()) {
return stdMatcher.group(1).toUpperCase();
}
Matcher exMatcher = EXCEPTION_SUFFIX_PATTERN.matcher(logLine);
if (exMatcher.find()) {
String match = exMatcher.group(1).toUpperCase();
if ("WARNING".equals(match)) return "WARN";
if ("ERROR".equals(match) || "EXCEPTION".equals(match)) return "ERROR";
}
return defaultLevel;
}
@Transactional
public void deleteTask(String taskId) {
cleanTaskValidator.checkTaskId(taskId);
cleaningTaskRepo.deleteTaskById(taskId);
operatorInstanceRepo.deleteByInstanceId(taskId);
cleaningResultRepo.deleteByInstanceId(taskId);
try {
FileUtils.deleteDirectory(new File(FLOW_PATH + "/" + taskId));
} catch (IOException e) {
log.warn("Can't delete flow path with task id: {}.", taskId, e);
}
}
public void executeTask(String taskId) {
@@ -180,6 +210,11 @@ public class CleaningTaskService {
}
private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances) {
List<OperatorDto> allOperators = operatorRepo.findAllOperators();
Map<String, OperatorDto> defaultSettings = allOperators.stream()
.filter(operatorDto -> StringUtils.isNotBlank(operatorDto.getSettings()))
.collect(Collectors.toMap(OperatorDto::getId, Function.identity()));
TaskProcess process = new TaskProcess();
process.setInstanceId(task.getId());
process.setDatasetId(task.getDestDatasetId());
@@ -187,7 +222,14 @@ public class CleaningTaskService {
process.setExportPath(DATASET_PATH + "/" + task.getDestDatasetId());
process.setExecutorType(ExecutorType.DATAMATE.getValue());
process.setProcess(instances.stream()
.map(instance -> Map.of(instance.getId(), instance.getOverrides()))
.map(instance -> {
OperatorDto operatorDto = defaultSettings.get(instance.getId());
Map<String, Object> stringObjectMap = getDefaultValue(operatorDto);
stringObjectMap.putAll(instance.getOverrides());
Map<String, Object> runtime = getRuntime(operatorDto);
stringObjectMap.putAll(runtime);
return Map.of(instance.getId(), stringObjectMap);
})
.toList());
ObjectMapper jsonMapper = new ObjectMapper(new YAMLFactory());
@@ -210,67 +252,113 @@ public class CleaningTaskService {
}
}
private void scanDataset(String taskId, String srcDatasetId) {
int pageNumber = 0;
int pageSize = 500;
PagingQuery pageRequest = new PagingQuery(pageNumber, pageSize);
PagedResponse<DatasetFile> datasetFiles;
do {
datasetFiles = datasetFileService.getDatasetFiles(srcDatasetId, null, null,null, pageRequest);
if (datasetFiles.getContent().isEmpty()) {
break;
private Map<String, Object> getDefaultValue(OperatorDto operatorDto) {
if (StringUtils.isBlank(operatorDto.getSettings())) {
return new HashMap<>();
}
Map<String, Object> defaultSettings = new HashMap<>();
try {
Map<String, Map<String, Object>> settings = OBJECT_MAPPER.readValue(operatorDto.getSettings(), Map.class);
for (Map.Entry<String, Map<String, Object>> entry : settings.entrySet()) {
String key = entry.getKey();
Map<String, Object> setting = entry.getValue();
String type = setting.get("type").toString();
switch (type) {
case "slider":
case "switch":
case "select":
case "input":
case "radio":
case "checkbox":
if (setting.containsKey("defaultVal")) {
defaultSettings.put(key, setting.get("defaultVal"));
}
break;
case "range":
List<Object> rangeDefault = getRangeDefault(setting);
if (CollectionUtils.isNotEmpty(rangeDefault)) {
defaultSettings.put(key, rangeDefault);
}
break;
default:
}
}
List<Map<String, Object>> files = datasetFiles.getContent().stream()
.map(content -> Map.of("fileName", (Object) content.getFileName(),
"fileSize", content.getFileSize(),
"filePath", content.getFilePath(),
"fileType", content.getFileType(),
"fileId", content.getId()))
.toList();
writeListMapToJsonlFile(files, FLOW_PATH + "/" + taskId + "/dataset.jsonl");
pageNumber += 1;
} while (pageNumber < datasetFiles.getTotalPages());
return defaultSettings;
} catch (JsonProcessingException e) {
throw BusinessException.of(OperatorErrorCode.SETTINGS_PARSE_FAILED, e.getMessage());
}
}
private List<Object> getRangeDefault(Map<String, Object> setting) {
List<Object> defaultValue = new ArrayList<>();
Object properties = setting.get("properties");
if (properties instanceof List<?> list) {
for (Object o : list) {
Map<String, Object> map = OBJECT_MAPPER.convertValue(o, Map.class);
if (map.containsKey("defaultVal")) {
defaultValue.add(map.get("defaultVal"));
}
}
}
return defaultValue;
}
private Map<String, Object> getRuntime(OperatorDto operatorDto) {
if (StringUtils.isBlank(operatorDto.getRuntime())) {
return new HashMap<>();
}
try {
return OBJECT_MAPPER.readValue(operatorDto.getRuntime(), Map.class);
} catch (JsonProcessingException e) {
throw BusinessException.of(OperatorErrorCode.SETTINGS_PARSE_FAILED, e.getMessage());
}
}
private void scanDataset(String taskId, String srcDatasetId) {
doScan(taskId, srcDatasetId, file -> true);
}
private void scanDataset(String taskId, String srcDatasetId, Set<String> succeedFiles) {
doScan(taskId, srcDatasetId, file -> !succeedFiles.contains(file.getId()));
}
private void doScan(String taskId, String srcDatasetId, Predicate<DatasetFile> filterCondition) {
cleanTaskValidator.checkTaskId(taskId);
String targetFilePath = FLOW_PATH + "/" + taskId + "/dataset.jsonl";
File targetFile = new File(targetFilePath);
if (targetFile.getParentFile() != null && !targetFile.getParentFile().exists()) {
targetFile.getParentFile().mkdirs();
}
int pageNumber = 0;
int pageSize = 500;
PagingQuery pageRequest = new PagingQuery(pageNumber, pageSize);
PagedResponse<DatasetFile> datasetFiles;
do {
datasetFiles = datasetFileService.getDatasetFiles(srcDatasetId, null, null,null, pageRequest);
if (datasetFiles.getContent().isEmpty()) {
break;
}
List<Map<String, Object>> files = datasetFiles.getContent().stream()
.filter(content -> !succeedFiles.contains(content.getId()))
.map(content -> Map.of("fileName", (Object) content.getFileName(),
try (BufferedWriter writer = new BufferedWriter(new FileWriter(targetFile))) {
PagedResponse<DatasetFile> datasetFiles;
do {
PagingQuery pageRequest = new PagingQuery(pageNumber, pageSize);
datasetFiles = datasetFileService.getDatasetFiles(srcDatasetId, null, null, null, pageRequest);
if (datasetFiles.getContent().isEmpty()) {
break;
}
for (DatasetFile content : datasetFiles.getContent()) {
if (!filterCondition.test(content)) {
continue;
}
Map<String, Object> fileMap = Map.of(
"fileName", content.getFileName(),
"fileSize", content.getFileSize(),
"filePath", content.getFilePath(),
"fileType", content.getFileType(),
"fileId", content.getId()))
.toList();
writeListMapToJsonlFile(files, FLOW_PATH + "/" + taskId + "/dataset.jsonl");
pageNumber += 1;
} while (pageNumber < datasetFiles.getTotalPages());
}
private void writeListMapToJsonlFile(List<Map<String, Object>> mapList, String fileName) {
ObjectMapper objectMapper = new ObjectMapper();
try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName))) {
if (!mapList.isEmpty()) { // check that the list is not empty to avoid an exception
String jsonString = objectMapper.writeValueAsString(mapList.getFirst());
writer.write(jsonString);
for (int i = 1; i < mapList.size(); i++) {
"fileId", content.getId()
);
writer.write(OBJECT_MAPPER.writeValueAsString(fileMap));
writer.newLine();
jsonString = objectMapper.writeValueAsString(mapList.get(i));
writer.write(jsonString);
}
}
pageNumber++;
} while (pageNumber < datasetFiles.getTotalPages());
} catch (IOException e) {
log.error("Failed to prepare dataset.jsonl.", e);
log.error("Failed to write dataset.jsonl for taskId: {}", taskId, e);
throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
}
}
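The reworked scan opens the JSONL writer once and streams each 500-file page into it, instead of re-opening the target file per page as the old `writeListMapToJsonlFile` did. A minimal sketch of the same paging + streaming-write pattern, written in Python purely for illustration (the `fetch_page` helper, its return shape, and the filter callback are assumptions, not part of this change):

```python
import json

PAGE_SIZE = 500  # mirrors the pageSize used in doScan

def write_dataset_jsonl(fetch_page, target_path, keep=lambda record: True):
    """Stream paged file records into a JSONL file, one record per line."""
    page = 0
    with open(target_path, "w", encoding="utf-8") as out:  # writer stays open across pages
        while True:
            content, total_pages = fetch_page(page, PAGE_SIZE)  # hypothetical pager
            if not content:
                break
            for record in content:
                if not keep(record):  # e.g. skip files that already succeeded on retry
                    continue
                out.write(json.dumps(record, ensure_ascii=False))
                out.write("\n")
            page += 1
            if page >= total_pages:
                break
```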

View File

@@ -4,12 +4,14 @@ import com.datamate.cleaning.common.exception.CleanErrorCode;
import com.datamate.cleaning.domain.repository.CleaningTaskRepository;
import com.datamate.cleaning.interfaces.dto.OperatorInstanceDto;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import lombok.RequiredArgsConstructor;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
@Component
@@ -17,6 +19,10 @@ import java.util.Locale;
public class CleanTaskValidator {
private final CleaningTaskRepository cleaningTaskRepo;
private final Pattern UUID_PATTERN = Pattern.compile(
"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
);
public void checkNameDuplication(String name) {
if (cleaningTaskRepo.isNameExist(name)) {
throw BusinessException.of(CleanErrorCode.DUPLICATE_TASK_NAME);
@@ -39,4 +45,10 @@ public class CleanTaskValidator {
front.getName(), back.getName()));
}
}
public void checkTaskId(String id) {
if (id == null || !UUID_PATTERN.matcher(id).matches()) {
throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER);
}
}
}

View File

@@ -126,7 +126,7 @@ worker:
groupName: workergroup
replicas: 1
minReplicas: 1
maxReplicas: 3
maxReplicas: 1
labels: {}
serviceAccountName: ""
restartPolicy: ""

View File

@@ -189,8 +189,20 @@ runtime:
ray-cluster:
enabled: true
head:
enableInTreeAutoscaling: true
autoscalerOptions:
upscalingMode: Default
idleTimeoutSeconds: 60
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: "500m"
memory: "512Mi"
requests:
cpu: "500m"
memory: "512Mi"
rayStartParams:
num-cpus: '0'
num-cpus: "0"
containerEnv:
- name: RAY_DEDUP_LOGS
value: "0"
@@ -206,6 +218,8 @@ ray-cluster:
value: *dbPass
- name: MYSQL_DATABASE
value: "datamate"
- name: RAY_enable_autoscaler_v2
value: "1"
resources:
limits:
cpu: "4"
@@ -283,3 +297,58 @@ ray-cluster:
- mountPath: /usr/local/lib/ops/site-packages
name: operator-volume
subPath: site-packages
additionalWorkerGroups:
npuGroup:
disabled: false
replicas: 0
minReplicas: 0
maxReplicas: 8
rayStartParams:
resources: '"{\"npu\": 1}"'
containerEnv:
- name: RAY_DEDUP_LOGS
value: "0"
- name: RAY_TQDM_PATCH_PRINT
value: "0"
- name: MYSQL_HOST
value: "datamate-database"
- name: MYSQL_PORT
value: "3306"
- name: MYSQL_USER
value: "root"
- name: MYSQL_PASSWORD
value: *dbPass
- name: MYSQL_DATABASE
value: "datamate"
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
resources:
limits:
cpu: "8"
memory: "64G"
huawei.com/Ascend910: 1
requests:
cpu: "1"
memory: "2G"
huawei.com/Ascend910: 1
volumes:
- *datasetVolume
- *flowVolume
- *logVolume
- *operatorVolume
volumeMounts:
- mountPath: /tmp/ray
name: log-volume
subPathExpr: ray/$(POD_NAME)
- mountPath: /dataset
name: dataset-volume
- mountPath: /flow
name: flow-volume
- mountPath: /opt/runtime/datamate/ops/user
name: operator-volume
subPath: extract
- mountPath: /usr/local/lib/ops/site-packages
name: operator-volume
subPath: site-packages
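With `enableInTreeAutoscaling: true` on the head and the `npuGroup` worker group declared with `minReplicas: 0`, the KubeRay autoscaler only provisions NPU pods when pending tasks or actors demand the custom `npu` resource advertised through `rayStartParams`, and scales them back down after `idleTimeoutSeconds`. A minimal sketch of how such demand could be created from a Ray client, assuming a connected cluster; the actor class and workload are placeholders, not part of this change:

```python
import ray

ray.init()  # connect to the cluster deployed by this chart

@ray.remote(resources={"npu": 1})  # matches the custom resource in rayStartParams
class NpuWorker:
    def run(self, batch):
        # placeholder workload; a real operator would run NPU inference here
        return len(batch)

# Scheduling this actor creates demand for the "npu" resource; the autoscaler
# then scales npuGroup up from 0 replicas to satisfy it.
worker = NpuWorker.remote()
print(ray.get(worker.run.remote(["sample"])))
```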

View File

@@ -22,17 +22,6 @@ from core.base_op import Filter as RELATIVE_Filter, Mapper as RELATIVE_Mapper, S
rd.DataContext.get_current().enable_progress_bars = False
def is_valid_path(item, dataset_dir):
full_path = os.path.abspath(os.path.join(dataset_dir, item))
return os.path.exists(full_path)
def new_get_num_npus(init_kwargs):
if init_kwargs.get("accelerator", "cpu") != "npu":
return 0.0
return 0.1
class Formatters(Enum):
"""
Enum of extraction operators and disk-dump (sink) operators
@@ -163,22 +152,19 @@ class RayDataset(BasicDataset):
return res
def _run_single_op(self, operators_cls, init_kwargs, **kwargs):
num_npus = new_get_num_npus(init_kwargs)
max_actor_nums = os.getenv("MAX_ACTOR_NUMS", "20")
# Check whether this is an ONNX operator; if so, limit the number of concurrent actors
if self._use_onnx_model(init_kwargs['op_name']):
max_actor_nums = 4
resources = {}
if num_npus > 0:
resources["node_npu"] = 0.1
if init_kwargs.get("npu", 0) > 0:
resources["npu"] = init_kwargs.get("npu")
if init_kwargs.get("arch", "arm").startswith("x86"):
resources["arch"] = "x86"
cpu = init_kwargs.get("cpu", 0.05)
memory = init_kwargs.get("memory", None)
kwargs.update({"ext_params": {}, "failed_reason": {}, "target_type": None})
try:
if issubclass(operators_cls, (Mapper, RELATIVE_Mapper)):
@@ -186,7 +172,8 @@ class RayDataset(BasicDataset):
fn_constructor_kwargs=init_kwargs,
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
num_cpus=cpu,
memory=memory,
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
@@ -195,7 +182,8 @@ class RayDataset(BasicDataset):
fn_constructor_kwargs=init_kwargs,
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
num_cpus=cpu,
memory=memory,
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
@@ -204,7 +192,8 @@ class RayDataset(BasicDataset):
fn_constructor_kwargs=init_kwargs,
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
num_cpus=cpu,
memory=memory,
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
else:
@@ -214,13 +203,3 @@ class RayDataset(BasicDataset):
except Exception as e:
logger.error(e)
raise Exception("Error! Ops Details:") from e
def _use_onnx_model(self, ops_name):
if ops_name in self.onnx_ops_name:
return True
return False
def _use_npu_model(self, ops_name):
if ops_name in self.npu_ops_name:
return True
return False
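In the updated `_run_single_op`, per-operator scheduling hints now come from `init_kwargs` instead of the fixed `num_cpus=0.05` and the old `node_npu` hack. A minimal sketch of how those keys could translate into the arguments passed to `map_batches`; the helper name and defaults are assumptions that only mirror the values visible in the diff:

```python
import ray.data as rd

def build_schedule_opts(init_kwargs, max_actor_nums=20):
    """Derive Ray Data scheduling options from an operator's init kwargs."""
    resources = {}
    if init_kwargs.get("npu", 0) > 0:                  # NPU operators request the custom "npu" resource
        resources["npu"] = init_kwargs["npu"]
    if init_kwargs.get("arch", "arm").startswith("x86"):
        resources["arch"] = "x86"
    return {
        "resources": resources,
        "num_cpus": init_kwargs.get("cpu", 0.05),      # falls back to the previous fixed value
        "memory": init_kwargs.get("memory", None),
        "compute": rd.ActorPoolStrategy(min_size=1, max_size=int(max_actor_nums)),
    }

# e.g. an NPU-bound operator requesting one whole NPU, one CPU, and 2 GiB of memory
opts = build_schedule_opts({"npu": 1, "cpu": 1, "memory": 2 * 1024**3, "arch": "arm"})
```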