feat: 支持运行data-juicer算子 (#215)

* feature: 增加data-juicer算子

* feat: 支持运行data-juicer算子

* feat: 支持data-juicer任务下发

* feat: 支持data-juicer结果数据集归档

* feat: 支持data-juicer结果数据集归档
This commit is contained in:
hhhhsc701
2025-12-31 09:20:41 +08:00
committed by GitHub
parent 63f4e3e447
commit 6a1eb85e8e
26 changed files with 709 additions and 120 deletions

View File

@@ -107,6 +107,8 @@ public class CleaningTaskService {
cleanTaskValidator.checkNameDuplication(request.getName());
cleanTaskValidator.checkInputAndOutput(request.getInstance());
ExecutorType executorType = cleanTaskValidator.checkAndGetExecutorType(request.getInstance());
CreateDatasetRequest createDatasetRequest = new CreateDatasetRequest();
createDatasetRequest.setName(request.getDestDatasetName());
createDatasetRequest.setDatasetType(DatasetType.valueOf(request.getDestDatasetType()));
@@ -131,7 +133,7 @@ public class CleaningTaskService {
operatorInstanceRepo.insertInstance(taskId, request.getInstance());
prepareTask(task, request.getInstance());
prepareTask(task, request.getInstance(), executorType);
scanDataset(taskId, request.getSrcDatasetId());
taskScheduler.executeTask(taskId);
return task;
@@ -209,20 +211,20 @@ public class CleaningTaskService {
taskScheduler.executeTask(taskId);
}
private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances) {
private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances, ExecutorType executorType) {
List<OperatorDto> allOperators = operatorRepo.findAllOperators();
Map<String, OperatorDto> defaultSettings = allOperators.stream()
Map<String, OperatorDto> operatorDtoMap = allOperators.stream()
.collect(Collectors.toMap(OperatorDto::getId, Function.identity()));
TaskProcess process = new TaskProcess();
process.setInstanceId(task.getId());
process.setDatasetId(task.getDestDatasetId());
process.setExecutorType(executorType.getValue());
process.setDatasetPath(FLOW_PATH + "/" + task.getId() + "/dataset.jsonl");
process.setExportPath(DATASET_PATH + "/" + task.getDestDatasetId());
process.setExecutorType(ExecutorType.DATAMATE.getValue());
process.setProcess(instances.stream()
.map(instance -> {
OperatorDto operatorDto = defaultSettings.get(instance.getId());
OperatorDto operatorDto = operatorDtoMap.get(instance.getId());
Map<String, Object> stringObjectMap = getDefaultValue(operatorDto);
stringObjectMap.putAll(instance.getOverrides());
Map<String, Object> runtime = getRuntime(operatorDto);
@@ -240,7 +242,7 @@ public class CleaningTaskService {
options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK);
Yaml yaml = new Yaml(options);
File file = new File(FLOW_PATH + "/" + process.getInstanceId() + "/process.yaml");
File file = new File(FLOW_PATH + "/" + task.getId() + "/process.yaml");
file.getParentFile().mkdirs();
try (FileWriter writer = new FileWriter(file)) {

View File

@@ -77,6 +77,7 @@ public class CleaningTemplateService {
@Transactional
public CleaningTemplateDto createTemplate(CreateCleaningTemplateRequest request) {
cleanTaskValidator.checkInputAndOutput(request.getInstance());
cleanTaskValidator.checkAndGetExecutorType(request.getInstance());
CleaningTemplateDto template = new CleaningTemplateDto();
String templateId = UUID.randomUUID().toString();
template.setId(templateId);

View File

@@ -12,7 +12,11 @@ public enum CleanErrorCode implements ErrorCode {
*/
DUPLICATE_TASK_NAME("clean.0001", "清洗任务名称重复"),
IN_AND_OUT_NOT_MATCH("clean.0002", "算子输入输出不匹配");
OPERATOR_LIST_EMPTY("clean.0002", "任务列表为空"),
IN_AND_OUT_NOT_MATCH("clean.0003", "算子输入输出不匹配"),
EXECUTOR_NOT_MATCH("clean.0004", "算子执行器不匹配");
private final String code;
private final String message;

View File

@@ -1,11 +1,15 @@
package com.datamate.cleaning.infrastructure.validator;
import com.datamate.cleaning.common.enums.ExecutorType;
import com.datamate.cleaning.common.exception.CleanErrorCode;
import com.datamate.cleaning.domain.repository.CleaningTaskRepository;
import com.datamate.cleaning.interfaces.dto.OperatorInstanceDto;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.common.setting.application.SysParamApplicationService;
import com.datamate.operator.domain.contants.OperatorConstant;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
@@ -19,6 +23,8 @@ import java.util.regex.Pattern;
public class CleanTaskValidator {
private final CleaningTaskRepository cleaningTaskRepo;
private final SysParamApplicationService sysParamApplicationService;
private final Pattern UUID_PATTERN = Pattern.compile(
"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
);
@@ -51,4 +57,28 @@ public class CleanTaskValidator {
throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER);
}
}
/**
 * Checks that all operators of a cleaning pipeline run on the same executor and returns it.
 *
 * <p>An operator counts as a data-juicer operator when its category list contains
 * {@link OperatorConstant#CATEGORY_DATA_JUICER_ID}; a null or empty category list counts
 * as "not data-juicer". Adjacent operators are compared pairwise, so the first mismatching
 * pair is the one reported in the error message.
 *
 * @param operators ordered operator instances of the task or template
 * @return the executor read from the {@code DATA_JUICER_EXECUTOR} system parameter when the
 *         operators are data-juicer operators, otherwise {@link ExecutorType#DATAMATE}
 * @throws BusinessException with {@link CleanErrorCode#OPERATOR_LIST_EMPTY} when
 *         {@code operators} is null or empty, or with
 *         {@link CleanErrorCode#EXECUTOR_NOT_MATCH} when two adjacent operators disagree
 */
public ExecutorType checkAndGetExecutorType(List<OperatorInstanceDto> operators) {
    if (operators == null || operators.isEmpty()) {
        throw BusinessException.of(CleanErrorCode.OPERATOR_LIST_EMPTY);
    }
    for (int i = 1; i < operators.size(); i++) {
        OperatorInstanceDto front = operators.get(i - 1);
        OperatorInstanceDto back = operators.get(i);
        if (isDataJuicerOperator(front) != isDataJuicerOperator(back)) {
            throw BusinessException.of(CleanErrorCode.EXECUTOR_NOT_MATCH,
                    String.format(Locale.ROOT, "ops(name: [%s, %s]) executor does not match",
                            front.getName(), back.getName()));
        }
    }
    // All operators agree at this point, so the first one decides the executor.
    // Use the null-safe helper: the previous direct getCategories().contains(...) call
    // threw a NullPointerException for operators without categories.
    if (isDataJuicerOperator(operators.getFirst())) {
        return ExecutorType.fromValue(sysParamApplicationService.getParamByKey("DATA_JUICER_EXECUTOR"));
    }
    return ExecutorType.DATAMATE;
}

/** Returns whether the operator's categories contain the data-juicer category; null-safe. */
private boolean isDataJuicerOperator(OperatorInstanceDto operator) {
    return CollectionUtils.isNotEmpty(operator.getCategories())
            && operator.getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID);
}
}

View File

@@ -24,7 +24,7 @@ public class OperatorInstanceDto {
private String outputs;
private List<Integer> categories;
private List<String> categories;
private Map<String, Object> overrides = new HashMap<>();
}