You've already forked DataMate
feat: 支持运行data-juicer算子 (#215)
* feature: 增加data-juicer算子 * feat: 支持运行data-juicer算子 * feat: 支持data-juicer任务下发 * feat: 支持data-juicer结果数据集归档 * feat: 支持data-juicer结果数据集归档
This commit is contained in:
@@ -107,6 +107,8 @@ public class CleaningTaskService {
|
||||
cleanTaskValidator.checkNameDuplication(request.getName());
|
||||
cleanTaskValidator.checkInputAndOutput(request.getInstance());
|
||||
|
||||
ExecutorType executorType = cleanTaskValidator.checkAndGetExecutorType(request.getInstance());
|
||||
|
||||
CreateDatasetRequest createDatasetRequest = new CreateDatasetRequest();
|
||||
createDatasetRequest.setName(request.getDestDatasetName());
|
||||
createDatasetRequest.setDatasetType(DatasetType.valueOf(request.getDestDatasetType()));
|
||||
@@ -131,7 +133,7 @@ public class CleaningTaskService {
|
||||
|
||||
operatorInstanceRepo.insertInstance(taskId, request.getInstance());
|
||||
|
||||
prepareTask(task, request.getInstance());
|
||||
prepareTask(task, request.getInstance(), executorType);
|
||||
scanDataset(taskId, request.getSrcDatasetId());
|
||||
taskScheduler.executeTask(taskId);
|
||||
return task;
|
||||
@@ -209,20 +211,20 @@ public class CleaningTaskService {
|
||||
taskScheduler.executeTask(taskId);
|
||||
}
|
||||
|
||||
private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances) {
|
||||
private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances, ExecutorType executorType) {
|
||||
List<OperatorDto> allOperators = operatorRepo.findAllOperators();
|
||||
Map<String, OperatorDto> defaultSettings = allOperators.stream()
|
||||
Map<String, OperatorDto> operatorDtoMap = allOperators.stream()
|
||||
.collect(Collectors.toMap(OperatorDto::getId, Function.identity()));
|
||||
|
||||
TaskProcess process = new TaskProcess();
|
||||
process.setInstanceId(task.getId());
|
||||
process.setDatasetId(task.getDestDatasetId());
|
||||
process.setExecutorType(executorType.getValue());
|
||||
process.setDatasetPath(FLOW_PATH + "/" + task.getId() + "/dataset.jsonl");
|
||||
process.setExportPath(DATASET_PATH + "/" + task.getDestDatasetId());
|
||||
process.setExecutorType(ExecutorType.DATAMATE.getValue());
|
||||
process.setProcess(instances.stream()
|
||||
.map(instance -> {
|
||||
OperatorDto operatorDto = defaultSettings.get(instance.getId());
|
||||
OperatorDto operatorDto = operatorDtoMap.get(instance.getId());
|
||||
Map<String, Object> stringObjectMap = getDefaultValue(operatorDto);
|
||||
stringObjectMap.putAll(instance.getOverrides());
|
||||
Map<String, Object> runtime = getRuntime(operatorDto);
|
||||
@@ -240,7 +242,7 @@ public class CleaningTaskService {
|
||||
options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK);
|
||||
Yaml yaml = new Yaml(options);
|
||||
|
||||
File file = new File(FLOW_PATH + "/" + process.getInstanceId() + "/process.yaml");
|
||||
File file = new File(FLOW_PATH + "/" + task.getId() + "/process.yaml");
|
||||
file.getParentFile().mkdirs();
|
||||
|
||||
try (FileWriter writer = new FileWriter(file)) {
|
||||
|
||||
@@ -77,6 +77,7 @@ public class CleaningTemplateService {
|
||||
@Transactional
|
||||
public CleaningTemplateDto createTemplate(CreateCleaningTemplateRequest request) {
|
||||
cleanTaskValidator.checkInputAndOutput(request.getInstance());
|
||||
cleanTaskValidator.checkAndGetExecutorType(request.getInstance());
|
||||
CleaningTemplateDto template = new CleaningTemplateDto();
|
||||
String templateId = UUID.randomUUID().toString();
|
||||
template.setId(templateId);
|
||||
|
||||
@@ -12,7 +12,11 @@ public enum CleanErrorCode implements ErrorCode {
|
||||
*/
|
||||
DUPLICATE_TASK_NAME("clean.0001", "清洗任务名称重复"),
|
||||
|
||||
IN_AND_OUT_NOT_MATCH("clean.0002", "算子输入输出不匹配");
|
||||
OPERATOR_LIST_EMPTY("clean.0002", "任务列表为空"),
|
||||
|
||||
IN_AND_OUT_NOT_MATCH("clean.0003", "算子输入输出不匹配"),
|
||||
|
||||
EXECUTOR_NOT_MATCH("clean.0004", "算子执行器不匹配");
|
||||
|
||||
private final String code;
|
||||
private final String message;
|
||||
|
||||
@@ -1,11 +1,15 @@
|
||||
package com.datamate.cleaning.infrastructure.validator;
|
||||
|
||||
import com.datamate.cleaning.common.enums.ExecutorType;
|
||||
import com.datamate.cleaning.common.exception.CleanErrorCode;
|
||||
import com.datamate.cleaning.domain.repository.CleaningTaskRepository;
|
||||
import com.datamate.cleaning.interfaces.dto.OperatorInstanceDto;
|
||||
import com.datamate.common.infrastructure.exception.BusinessException;
|
||||
import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
||||
import com.datamate.common.setting.application.SysParamApplicationService;
|
||||
import com.datamate.operator.domain.contants.OperatorConstant;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@@ -19,6 +23,8 @@ import java.util.regex.Pattern;
|
||||
public class CleanTaskValidator {
|
||||
private final CleaningTaskRepository cleaningTaskRepo;
|
||||
|
||||
private final SysParamApplicationService sysParamApplicationService;
|
||||
|
||||
private final Pattern UUID_PATTERN = Pattern.compile(
|
||||
"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
|
||||
);
|
||||
@@ -51,4 +57,28 @@ public class CleanTaskValidator {
|
||||
throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER);
|
||||
}
|
||||
}
|
||||
|
||||
public ExecutorType checkAndGetExecutorType(List<OperatorInstanceDto> operators) {
|
||||
if (operators == null || operators.isEmpty()) {
|
||||
throw BusinessException.of(CleanErrorCode.OPERATOR_LIST_EMPTY);
|
||||
}
|
||||
for (int i = 1; i < operators.size(); i++) {
|
||||
OperatorInstanceDto front = operators.get(i - 1);
|
||||
OperatorInstanceDto back = operators.get(i);
|
||||
boolean frontHas = CollectionUtils.isNotEmpty(front.getCategories())
|
||||
&& front.getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID);
|
||||
boolean backHas = CollectionUtils.isNotEmpty(back.getCategories())
|
||||
&& back.getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID);
|
||||
if (frontHas == backHas) {
|
||||
continue;
|
||||
}
|
||||
throw BusinessException.of(CleanErrorCode.EXECUTOR_NOT_MATCH,
|
||||
String.format(Locale.ROOT, "ops(name: [%s, %s]) executor does not match",
|
||||
front.getName(), back.getName()));
|
||||
}
|
||||
if (operators.getFirst().getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID)) {
|
||||
return ExecutorType.fromValue(sysParamApplicationService.getParamByKey("DATA_JUICER_EXECUTOR"));
|
||||
}
|
||||
return ExecutorType.DATAMATE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ public class OperatorInstanceDto {
|
||||
|
||||
private String outputs;
|
||||
|
||||
private List<Integer> categories;
|
||||
private List<String> categories;
|
||||
|
||||
private Map<String, Object> overrides = new HashMap<>();
|
||||
}
|
||||
|
||||
@@ -32,6 +32,8 @@ public class OperatorConstant {
|
||||
|
||||
public static String CATEGORY_PREDEFINED_ID = "96a3b07a-3439-4557-a835-525faad60ca3";
|
||||
|
||||
public static String CATEGORY_DATA_JUICER_ID = "79b385b4-fde8-4617-bcba-02a176938996";
|
||||
|
||||
public static Map<String, String> CATEGORY_MAP = new HashMap<>();
|
||||
|
||||
static {
|
||||
|
||||
Reference in New Issue
Block a user