From 6a1eb85e8e8336854964c3c2bda7c5d571295812 Mon Sep 17 00:00:00 2001
From: hhhhsc701 <56435672+hhhhsc701@users.noreply.github.com>
Date: Wed, 31 Dec 2025 09:20:41 +0800
Subject: [PATCH] feat: support running data-juicer operators (#215)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feature: add data-juicer operators
* feat: support running data-juicer operators
* feat: support dispatching data-juicer tasks
* feat: support archiving data-juicer result datasets
---
 Makefile                                      |  12 +-
 .../application/CleaningTaskService.java      |  14 +-
 .../application/CleaningTemplateService.java  |   1 +
 .../common/exception/CleanErrorCode.java      |   6 +-
 .../validator/CleanTaskValidator.java         |  30 ++
 .../interfaces/dto/OperatorInstanceDto.java   |   2 +-
 .../domain/contants/OperatorConstant.java     |   2 +
 deployment/docker/datamate/docker-compose.yml |  15 +
 .../charts/ray-cluster/templates/_helpers.tpl |  12 -
 .../templates/raycluster-cluster.yaml         |  10 +-
 deployment/helm/datamate/values.yaml          |   6 +-
 deployment/kubernetes/data-juicer/deploy.yaml |  74 +++++
 .../pages/DataCleansing/Create/CreateTask.tsx |   1 +
 .../DataCleansing/Create/CreateTemplate.tsx   |   1 +
 .../Create/components/OperatorLibrary.tsx     |   2 +-
 runtime/ops/pyproject.toml                    |  16 +-
 .../python-executor/datamate/core/base_op.py  |  23 +-
 .../datamate/wrappers/__init__.py             |   3 +-
 .../datamate/wrappers/data_juicer_executor.py | 150 ++++++++++
 .../datamate/wrappers/data_juicer_wrapper.py  |   7 +-
 .../datamate/wrappers/datamate_executor.py    |  78 +----
 .../datamate/wrappers/executor.py             |  80 ++++++
 runtime/python-executor/pyproject.toml        |   4 +-
 scripts/db/data-operator-init.sql             | 272 +++++++++++++++++-
 scripts/db/setting-management-init.sql        |   2 +-
 scripts/images/runtime/Dockerfile             |   6 +-
 26 files changed, 709 insertions(+), 120 deletions(-)
 create mode 100644 deployment/kubernetes/data-juicer/deploy.yaml
 create mode 100644 runtime/python-executor/datamate/wrappers/data_juicer_executor.py
 create mode 100644 runtime/python-executor/datamate/wrappers/executor.py

diff --git a/Makefile b/Makefile
index 2076d2e..4778f07 100644
--- a/Makefile
+++ b/Makefile
@@ -238,7 +238,7 @@ endif
 # ========== Docker Install/Uninstall Targets ==========

 # Valid service targets for docker install/uninstall
-VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" milvus "label-studio"
+VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" milvus "label-studio" "data-juicer" dj

 # Generic docker service install target
 .PHONY: %-docker-install
@@ -263,6 +263,8 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" mi
 		REGISTRY=$(REGISTRY) docker compose -f deployment/docker/deer-flow/docker-compose.yml up -d; \
 	elif [ "$*" = "milvus" ]; then \
 		docker compose -f deployment/docker/milvus/docker-compose.yml up -d; \
+	elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
+		REGISTRY=$(REGISTRY) docker compose -f deployment/docker/datamate/docker-compose.yml up -d datamate-data-juicer; \
 	else \
 		$(call docker-compose-service,$*,up -d,deployment/docker/datamate); \
 	fi
@@ -300,6 +302,8 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" mi
 	else \
 		docker compose -f deployment/docker/milvus/docker-compose.yml down; \
 	fi; \
+	elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
+		$(call docker-compose-service,datamate-data-juicer,down,deployment/docker/datamate); \
 	else \
 		$(call docker-compose-service,$*,down,deployment/docker/datamate); \
 	fi
@@ -307,7
@@ VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" mi # ========== Kubernetes Install/Uninstall Targets ========== # Valid k8s targets -VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio +VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer dj # Generic k8s install target .PHONY: %-k8s-install @@ -334,6 +338,8 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio helm upgrade milvus deployment/helm/milvus -n $(NAMESPACE) --install; \ elif [ "$*" = "label-studio" ]; then \ helm upgrade label-studio deployment/helm/label-studio -n $(NAMESPACE) --install; \ + elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \ + kubectl apply -f deployment/kubernetes/data-juicer/deploy.yaml -n $(NAMESPACE); \ fi # Generic k8s uninstall target @@ -357,6 +363,8 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio helm uninstall milvus -n $(NAMESPACE) --ignore-not-found; \ elif [ "$*" = "label-studio" ]; then \ helm uninstall label-studio -n $(NAMESPACE) --ignore-not-found; \ + elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \ + kubectl delete -f deployment/kubernetes/data-juicer/deploy.yaml -n $(NAMESPACE); \ fi # ========== Upgrade Targets ========== diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java index 3d9e5bf..ca94b36 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java @@ -107,6 +107,8 @@ public class CleaningTaskService { cleanTaskValidator.checkNameDuplication(request.getName()); cleanTaskValidator.checkInputAndOutput(request.getInstance()); + ExecutorType executorType = cleanTaskValidator.checkAndGetExecutorType(request.getInstance()); + CreateDatasetRequest createDatasetRequest = new CreateDatasetRequest(); createDatasetRequest.setName(request.getDestDatasetName()); createDatasetRequest.setDatasetType(DatasetType.valueOf(request.getDestDatasetType())); @@ -131,7 +133,7 @@ public class CleaningTaskService { operatorInstanceRepo.insertInstance(taskId, request.getInstance()); - prepareTask(task, request.getInstance()); + prepareTask(task, request.getInstance(), executorType); scanDataset(taskId, request.getSrcDatasetId()); taskScheduler.executeTask(taskId); return task; @@ -209,20 +211,20 @@ public class CleaningTaskService { taskScheduler.executeTask(taskId); } - private void prepareTask(CleaningTaskDto task, List instances) { + private void prepareTask(CleaningTaskDto task, List instances, ExecutorType executorType) { List allOperators = operatorRepo.findAllOperators(); - Map defaultSettings = allOperators.stream() + Map operatorDtoMap = allOperators.stream() .collect(Collectors.toMap(OperatorDto::getId, Function.identity())); TaskProcess process = new TaskProcess(); process.setInstanceId(task.getId()); process.setDatasetId(task.getDestDatasetId()); + process.setExecutorType(executorType.getValue()); process.setDatasetPath(FLOW_PATH + "/" + task.getId() + "/dataset.jsonl"); process.setExportPath(DATASET_PATH + "/" + task.getDestDatasetId()); - process.setExecutorType(ExecutorType.DATAMATE.getValue()); process.setProcess(instances.stream() .map(instance -> { - OperatorDto operatorDto = 
defaultSettings.get(instance.getId()); + OperatorDto operatorDto = operatorDtoMap.get(instance.getId()); Map stringObjectMap = getDefaultValue(operatorDto); stringObjectMap.putAll(instance.getOverrides()); Map runtime = getRuntime(operatorDto); @@ -240,7 +242,7 @@ public class CleaningTaskService { options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK); Yaml yaml = new Yaml(options); - File file = new File(FLOW_PATH + "/" + process.getInstanceId() + "/process.yaml"); + File file = new File(FLOW_PATH + "/" + task.getId() + "/process.yaml"); file.getParentFile().mkdirs(); try (FileWriter writer = new FileWriter(file)) { diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTemplateService.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTemplateService.java index ed8c9d9..e7364e4 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTemplateService.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTemplateService.java @@ -77,6 +77,7 @@ public class CleaningTemplateService { @Transactional public CleaningTemplateDto createTemplate(CreateCleaningTemplateRequest request) { cleanTaskValidator.checkInputAndOutput(request.getInstance()); + cleanTaskValidator.checkAndGetExecutorType(request.getInstance()); CleaningTemplateDto template = new CleaningTemplateDto(); String templateId = UUID.randomUUID().toString(); template.setId(templateId); diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/common/exception/CleanErrorCode.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/common/exception/CleanErrorCode.java index ff7e2d4..68a4521 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/common/exception/CleanErrorCode.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/common/exception/CleanErrorCode.java @@ -12,7 +12,11 @@ public enum CleanErrorCode implements ErrorCode { */ DUPLICATE_TASK_NAME("clean.0001", "清洗任务名称重复"), - IN_AND_OUT_NOT_MATCH("clean.0002", "算子输入输出不匹配"); + OPERATOR_LIST_EMPTY("clean.0002", "任务列表为空"), + + IN_AND_OUT_NOT_MATCH("clean.0003", "算子输入输出不匹配"), + + EXECUTOR_NOT_MATCH("clean.0004", "算子执行器不匹配"); private final String code; private final String message; diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java index 3033895..82e48c7 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java @@ -1,11 +1,15 @@ package com.datamate.cleaning.infrastructure.validator; +import com.datamate.cleaning.common.enums.ExecutorType; import com.datamate.cleaning.common.exception.CleanErrorCode; import com.datamate.cleaning.domain.repository.CleaningTaskRepository; import com.datamate.cleaning.interfaces.dto.OperatorInstanceDto; import com.datamate.common.infrastructure.exception.BusinessException; import com.datamate.common.infrastructure.exception.SystemErrorCode; +import com.datamate.common.setting.application.SysParamApplicationService; 
+import com.datamate.operator.domain.contants.OperatorConstant;
 import lombok.RequiredArgsConstructor;
+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.springframework.stereotype.Component;

@@ -19,6 +23,8 @@ import java.util.regex.Pattern;
 public class CleanTaskValidator {
     private final CleaningTaskRepository cleaningTaskRepo;

+    private final SysParamApplicationService sysParamApplicationService;
+
     private final Pattern UUID_PATTERN = Pattern.compile(
             "^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
     );
@@ -51,4 +57,28 @@ public class CleanTaskValidator {
             throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER);
         }
     }
+
+    public ExecutorType checkAndGetExecutorType(List<OperatorInstanceDto> operators) {
+        if (operators == null || operators.isEmpty()) {
+            throw BusinessException.of(CleanErrorCode.OPERATOR_LIST_EMPTY);
+        }
+        for (int i = 1; i < operators.size(); i++) {
+            OperatorInstanceDto front = operators.get(i - 1);
+            OperatorInstanceDto back = operators.get(i);
+            boolean frontHas = CollectionUtils.isNotEmpty(front.getCategories())
+                    && front.getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID);
+            boolean backHas = CollectionUtils.isNotEmpty(back.getCategories())
+                    && back.getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID);
+            if (frontHas == backHas) {
+                continue;
+            }
+            throw BusinessException.of(CleanErrorCode.EXECUTOR_NOT_MATCH,
+                    String.format(Locale.ROOT, "ops(name: [%s, %s]) executor does not match",
+                            front.getName(), back.getName()));
+        }
+        if (CollectionUtils.isNotEmpty(operators.getFirst().getCategories())
+                && operators.getFirst().getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID)) {
+            return ExecutorType.fromValue(sysParamApplicationService.getParamByKey("DATA_JUICER_EXECUTOR"));
+        }
+        return ExecutorType.DATAMATE;
+    }
 }
diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/OperatorInstanceDto.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/OperatorInstanceDto.java
index 5ea8216..3cb10dd 100644
--- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/OperatorInstanceDto.java
+++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/OperatorInstanceDto.java
@@ -24,7 +24,7 @@ public class OperatorInstanceDto {

     private String outputs;

-    private List categories;
+    private List<String> categories;

     private Map<String, Object> overrides = new HashMap<>();
 }
diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/contants/OperatorConstant.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/contants/OperatorConstant.java
index 7c88041..bf2151b 100644
--- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/contants/OperatorConstant.java
+++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/contants/OperatorConstant.java
@@ -32,6 +32,8 @@ public class OperatorConstant {

     public static String CATEGORY_PREDEFINED_ID = "96a3b07a-3439-4557-a835-525faad60ca3";

+    public static String CATEGORY_DATA_JUICER_ID = "79b385b4-fde8-4617-bcba-02a176938996";
+
     public static Map CATEGORY_MAP = new HashMap<>();

     static {
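Note: the category id above is what the validator keys on. As a rough illustration, the all-or-none rule enforced by CleanTaskValidator.checkAndGetExecutorType can be sketched in standalone Python (hypothetical helper; the id and error cases mirror the Java above):

    # Hypothetical sketch of the executor-selection rule: a pipeline must be
    # either entirely data-juicer operators or entirely datamate operators.
    CATEGORY_DATA_JUICER_ID = "79b385b4-fde8-4617-bcba-02a176938996"

    def resolve_executor(operators: list[dict]) -> str:
        if not operators:
            raise ValueError("operator list is empty")        # clean.0002
        flags = [CATEGORY_DATA_JUICER_ID in (op.get("categories") or [])
                 for op in operators]
        if any(flags) and not all(flags):
            raise ValueError("ops executor does not match")   # clean.0004
        # all data-juicer: executor comes from the DATA_JUICER_EXECUTOR system
        # parameter; otherwise the built-in datamate executor is used
        return "data-juicer" if flags[0] else "datamate"

    assert resolve_executor([{"categories": []}]) == "datamate"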
diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml
index cc32413..8ee90ab 100644
--- a/deployment/docker/datamate/docker-compose.yml
+++ b/deployment/docker/datamate/docker-compose.yml
@@ -139,6 +139,21 @@ services:
       - "6379:6379"
     networks: [ datamate ]

+  datamate-data-juicer:
+    container_name: datamate-data-juicer
+    image: datajuicer/data-juicer:v1.4.4
+    restart: on-failure
+    command:
+      - uvicorn
+      - service:app
+      - --host
+      - "0.0.0.0"
+    volumes:
+      - dataset_volume:/dataset
+      - flow_volume:/flow
+    networks: [ datamate ]
+    profiles: [ data-juicer ]
+
 volumes:
   dataset_volume:
     name: datamate-dataset-volume
diff --git a/deployment/helm/datamate/charts/ray-cluster/templates/_helpers.tpl b/deployment/helm/datamate/charts/ray-cluster/templates/_helpers.tpl
index 019ba06..79fe3bb 100644
--- a/deployment/helm/datamate/charts/ray-cluster/templates/_helpers.tpl
+++ b/deployment/helm/datamate/charts/ray-cluster/templates/_helpers.tpl
@@ -66,15 +66,3 @@ Name of image
 {{- $name }}:{{ $tag }}
 {{- end }}
 {{- end }}
-
-{{/*
-Name of sidecar image
-*/}}
-{{- define "ray-cluster-sidecar.image" -}}
-{{- $name := default (printf "%s:%s" .Values.image.repository .Values.image.tag) .Values.head.sidecarContainers.image }}
-{{- if .Values.global.image.repository }}
-{{- .Values.global.image.repository | trimSuffix "/" }}/{{ $name }}
-{{- else }}
-{{- $name }}
-{{- end }}
-{{- end }}
\ No newline at end of file
diff --git a/deployment/helm/datamate/charts/ray-cluster/templates/raycluster-cluster.yaml b/deployment/helm/datamate/charts/ray-cluster/templates/raycluster-cluster.yaml
index 5f26211..cf7af6c 100644
--- a/deployment/helm/datamate/charts/ray-cluster/templates/raycluster-cluster.yaml
+++ b/deployment/helm/datamate/charts/ray-cluster/templates/raycluster-cluster.yaml
@@ -96,7 +96,7 @@ spec:
         securityContext:
           {{- toYaml . | nindent 12 }}
       {{- end }}
-      {{- $defult := printf "%s:%s" .Values.image.repository .Values.image.tag }}
+      {{- $defult := include "ray-cluster.image" . -}}
      {{- with .Values.head.sidecarContainers }}
      {{- range $index, $container := .
}} {{- $image := default $defult $container.image -}} @@ -313,10 +313,14 @@ spec: - name: ray-worker {{- if $values.image }} image: {{ $values.image.repository }}:{{ $values.image.tag }} + {{- if $values.image.pullPolicy }} imagePullPolicy: {{ $values.image.pullPolicy }} {{- else }} - image: {{ $.Values.image.repository }}:{{ $.Values.image.tag }} - imagePullPolicy: {{ $.Values.image.pullPolicy }} + imagePullPolicy: {{ default $.Values.image.pullPolicy $.Values.global.image.pullPolicy }} + {{- end }} + {{- else }} + image: {{ include "ray-cluster.image" $ }} + imagePullPolicy: {{ default $.Values.image.pullPolicy $.Values.global.image.pullPolicy }} {{- end }} {{- with $values.command }} command: diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml index 5290d6b..3e7e0ed 100644 --- a/deployment/helm/datamate/values.yaml +++ b/deployment/helm/datamate/values.yaml @@ -248,7 +248,6 @@ ray-cluster: subPath: site-packages sidecarContainers: - name: runtime - image: datamate-runtime imagePullPolicy: IfNotPresent args: *runtimeArgs env: *runtimeEnv @@ -338,6 +337,9 @@ ray-cluster: - *flowVolume - *logVolume - *operatorVolume + - name: ascend + hostPath: + path: /usr/local/Ascend volumeMounts: - mountPath: /tmp/ray name: log-volume @@ -352,3 +354,5 @@ ray-cluster: - mountPath: /usr/local/lib/ops/site-packages name: operator-volume subPath: site-packages + - mountPath: /usr/local/Ascend + name: ascend diff --git a/deployment/kubernetes/data-juicer/deploy.yaml b/deployment/kubernetes/data-juicer/deploy.yaml new file mode 100644 index 0000000..f5b53c6 --- /dev/null +++ b/deployment/kubernetes/data-juicer/deploy.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: datamate-data-juicer + labels: + app: datamate + tier: data-juicer +spec: + replicas: 1 + selector: + matchLabels: + app: datamate + tier: data-juicer + template: + metadata: + labels: + app: datamate + tier: data-juicer + spec: + containers: + - name: data-juicer + image: datajuicer/data-juicer:v1.4.4 + imagePullPolicy: IfNotPresent + command: + - uvicorn + args: + - service:app + - --host + - "0.0.0.0" + ports: + - containerPort: 8000 + resources: + limits: + cpu: 8 + memory: 32Gi + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - name: dataset-volume + mountPath: /dataset + - name: log-volume + mountPath: /var/log/datamate/data-juicer + subPath: data-juicer + - name: flow-volume + mountPath: /flow + volumes: + - name: dataset-volume + persistentVolumeClaim: + claimName: datamate-dataset-pvc + - name: flow-volume + persistentVolumeClaim: + claimName: datamate-flow-pvc + - name: log-volume + persistentVolumeClaim: + claimName: datamate-log-pvc + +--- +apiVersion: v1 +kind: Service +metadata: + name: datamate-data-juicer + labels: + app: datamate + tier: data-juicer +spec: + type: ClusterIP + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP + selector: + app: datamate + tier: data-juicer diff --git a/frontend/src/pages/DataCleansing/Create/CreateTask.tsx b/frontend/src/pages/DataCleansing/Create/CreateTask.tsx index b0fbe57..8d9d91c 100644 --- a/frontend/src/pages/DataCleansing/Create/CreateTask.tsx +++ b/frontend/src/pages/DataCleansing/Create/CreateTask.tsx @@ -38,6 +38,7 @@ export default function CleansingTaskCreate() { ...item.defaultParams, ...item.overrides, }, + categories: item.categories, inputs: item.inputs, outputs: item.outputs, })), diff --git a/frontend/src/pages/DataCleansing/Create/CreateTemplate.tsx 
b/frontend/src/pages/DataCleansing/Create/CreateTemplate.tsx index ca8694b..b664a89 100644 --- a/frontend/src/pages/DataCleansing/Create/CreateTemplate.tsx +++ b/frontend/src/pages/DataCleansing/Create/CreateTemplate.tsx @@ -44,6 +44,7 @@ export default function CleansingTemplateCreate() { ...item.defaultParams, ...item.overrides, }, + categories: item.categories, inputs: item.inputs, outputs: item.outputs, })), diff --git a/frontend/src/pages/DataCleansing/Create/components/OperatorLibrary.tsx b/frontend/src/pages/DataCleansing/Create/components/OperatorLibrary.tsx index 0792e9a..3d1f403 100644 --- a/frontend/src/pages/DataCleansing/Create/components/OperatorLibrary.tsx +++ b/frontend/src/pages/DataCleansing/Create/components/OperatorLibrary.tsx @@ -195,7 +195,7 @@ const OperatorLibrary: React.FC = ({
- 算子库({filteredOperators.length}) + 算子库
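Both create pages now forward each operator's categories, which is what lets the backend validator classify an instance as data-juicer or datamate. A hypothetical instance payload, with field names following OperatorInstanceDto (values illustrative only):

    # Hypothetical operator-instance payload as sent by CreateTask/CreateTemplate
    # (field names follow OperatorInstanceDto; values are illustrative only).
    instance = {
        "id": "clean_email_mapper",
        "name": "邮箱清洗映射器",
        "inputs": "text",
        "outputs": "text",
        "categories": ["79b385b4-fde8-4617-bcba-02a176938996"],  # data-juicer category id
        "overrides": {},
    }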
diff --git a/runtime/ops/pyproject.toml b/runtime/ops/pyproject.toml
index ce1afc4..cd681b6 100644
--- a/runtime/ops/pyproject.toml
+++ b/runtime/ops/pyproject.toml
@@ -3,7 +3,7 @@ name = "ops"
 version = "0.0.1"
 description = "Add your description here"
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 dependencies = [
     "beautifulsoup4>=4.14.3",
     "datasketch>=1.8.0",
@@ -12,14 +12,14 @@ dependencies = [
     "jieba>=0.42.1",
     "loguru>=0.7.3",
     "mineru>=2.6.5",
-    "numpy==1.24.3",
+    "numpy>=2.2.6",
     "python-multipart>=0.0.20",
-    "opencv-contrib-python-headless==4.7.0.72",
-    "opencv-python-headless==4.7.0.72",
+    "opencv-contrib-python-headless>=4.12.0.88",
+    "opencv-python-headless>=4.12.0.88",
     "openslide-python>=1.4.3",
-    "paddleocr==2.8.1",
-    "paddlepaddle==2.6.2",
-    "pandas==1.5.3",
+    "paddleocr==3.3.0",
+    "paddlepaddle==3.2.2",
+    "pandas>=2.2.3",
     "presidio-analyzer==2.2.25",
     "presidio-anonymizer==2.2.25",
     "pycryptodome>=3.23.0",
@@ -27,7 +27,7 @@ dependencies = [
     "python-docx>=1.2.0",
     "pytz>=2025.2",
     "six>=1.17.0",
-    "spacy==3.7.0",
+    "spacy>=3.7.0",
     "sqlalchemy>=2.0.44",
     "xmltodict>=1.0.2",
     "zhconv>=1.4.3",
diff --git a/runtime/python-executor/datamate/core/base_op.py b/runtime/python-executor/datamate/core/base_op.py
index fd5f661..f87457d 100644
--- a/runtime/python-executor/datamate/core/base_op.py
+++ b/runtime/python-executor/datamate/core/base_op.py
@@ -151,21 +151,31 @@ class BaseOp:
         if filetype in ["ppt", "pptx", "docx", "doc", "xlsx", "csv", "md", "pdf"]:
             elements = partition(filename=filepath)
             sample[self.text_key] = "\n\n".join([str(el) for el in elements])
+            sample[self.data_key] = b""
         elif filetype in ["txt", "md", "markdown", "xml", "html", "json", "jsonl"]:
             with open(filepath, 'rb') as f:
                 content = f.read()
             sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")
+            sample[self.data_key] = b""
         elif filetype in ['jpg', 'jpeg', 'png', 'bmp']:
             image_np = cv2.imdecode(np.fromfile(filepath, dtype=np.uint8), -1)
             if image_np.size:
                 data = cv2.imencode(f".{filetype}", image_np)[1]
                 image_bytes = data.tobytes()
                 sample[self.data_key] = image_bytes
+                sample[self.text_key] = ""
+
+        return sample

     def read_file_first(self, sample):
         if self.is_first_op:
             self.read_file(sample)

+    @staticmethod
+    def save_file_and_db(sample):
+        if FileExporter().execute(sample):
+            TaskInfoPersistence().persistence_task_info(sample)
+        return sample
+

 class Mapper(BaseOp):
     def __init__(self, *args, **kwargs):
@@ -195,8 +205,7 @@ class Mapper(BaseOp):
         sample["execute_status"] = execute_status
         # Persist the file's successful-execution info to the database
         if self.is_last_op:
-            if FileExporter().execute(sample):
-                TaskInfoPersistence().persistence_task_info(sample)
+            self.save_file_and_db(sample)
         return sample

     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -237,8 +246,7 @@ class Slicer(BaseOp):

         # Persist the file's successful-execution info to the database
         if self.is_last_op:
-            if FileExporter().execute(sample):
-                TaskInfoPersistence().persistence_task_info(sample)
+            self.save_file_and_db(sample)

         return [sample]

@@ -333,8 +341,7 @@ class Filter(BaseOp):

         # Persist the file's successful-execution info to the database
         if self.is_last_op:
-            if FileExporter().execute(sample):
-                TaskInfoPersistence().persistence_task_info(sample)
+            self.save_file_and_db(sample)
         return True

     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -490,7 +497,7 @@ class FileExporter(BaseOp):
             save_path = self.get_save_path(sample, target_type)
         # Otherwise save as a txt file (plain-text cleaning)
         else:
-            sample = self._get_from_text_or_data(sample)
+            sample = self._get_from_text(sample)
             save_path = self.get_save_path(sample, 'txt')
         return sample,
save_path @@ -552,7 +559,7 @@ class FileExporter(BaseOp): return sample def _get_from_text_or_data(self, sample: Dict[str, Any]) -> Dict[str, Any]: - if sample[self.data_key] is not None and sample[self.data_key] != b'': + if sample[self.data_key] is not None and sample[self.data_key] != b'' and sample[self.data_key] != "": return self._get_from_data(sample) else: return self._get_from_text(sample) diff --git a/runtime/python-executor/datamate/wrappers/__init__.py b/runtime/python-executor/datamate/wrappers/__init__.py index 599f7d3..6c813a2 100644 --- a/runtime/python-executor/datamate/wrappers/__init__.py +++ b/runtime/python-executor/datamate/wrappers/__init__.py @@ -1,6 +1,7 @@ from . import data_juicer_wrapper, datamate_wrapper WRAPPERS = { - "data_juicer": data_juicer_wrapper, + "ray": data_juicer_wrapper, + "default": data_juicer_wrapper, "datamate": datamate_wrapper } diff --git a/runtime/python-executor/datamate/wrappers/data_juicer_executor.py b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py new file mode 100644 index 0000000..d1a5712 --- /dev/null +++ b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py @@ -0,0 +1,150 @@ +import base64 +import time +from json import dumps as jdumps +from json import loads as jloads +from typing import Dict, Optional +from urllib.parse import urljoin + +import ray +import requests +import yaml +from jsonargparse import ArgumentParser +from loguru import logger + +from datamate.core.base_op import FileExporter, SUCCESS_STATUS +from datamate.core.constant import Fields +from datamate.wrappers.executor import RayExecutor + +DJ_OUTPUT = "outputs" + + +class DataJuicerClient: + def __init__(self, base_url): + self.base_url = base_url + + def call_data_juicer_api(self, path: str, params: Optional[Dict] = None, json: Optional[Dict] = None): + url = urljoin(self.base_url, path) + + if json is not None: + response = requests.post(url, params=params, json=json) + else: + response = requests.get(url, params=params) + + return jloads(response.text) + + + def init_config(self, dataset_path: str, export_path, process): + """ + Initialize Data-Juicer config. + + Args: + :param dataset_path: The input dataset path. + :param process: The ops + :param export_path: The export path. + """ + dj_config = { + "dataset_path": dataset_path, + "export_path": export_path, + "process": process, + "executor_type": "default", + } + url_path = "/data_juicer/config/get_init_configs" + try: + res = self.call_data_juicer_api(url_path, params={"cfg": jdumps(dj_config)}) + except Exception as e: + error_msg = f"An unexpected error occurred in calling {url_path}:\n{e}" + raise RuntimeError(error_msg) + return res["result"] + + def execute_config(self, dj_config: Dict): + """ + Execute data-juicer data process. 
+
+        Args:
+            dj_config: configs of data-juicer
+        """
+
+        url_path = "/data_juicer/core/DefaultExecutor/run"
+        try:
+            res = self.call_data_juicer_api(url_path, params={"skip_return": True}, json={"cfg": jdumps(dj_config)})
+            if res.get("status") != "success":
+                raise RuntimeError(f"An error occurred in calling {url_path}:\n{res}")
+            return dj_config["export_path"]
+        except Exception as e:
+            error_msg = f"An unexpected error occurred in calling {url_path}:\n{e}"
+            raise RuntimeError(error_msg)
+
+
+class DataJuicerExecutor(RayExecutor):
+    def __init__(self, cfg=None, meta=None):
+        super().__init__(cfg, meta)
+        self.client = DataJuicerClient(base_url="http://datamate-data-juicer:8000")
+        self.dataset_path = f"/flow/{self.cfg.instance_id}/dataset_on_dj.jsonl"
+        self.export_path = f"/flow/{self.cfg.instance_id}/processed_dataset.jsonl"
+
+    def add_column(self, batch):
+        batch_size = len(batch["filePath"])
+        batch["execute_status"] = [SUCCESS_STATUS] * batch_size
+        batch[Fields.instance_id] = [self.cfg.instance_id] * batch_size
+        batch[Fields.export_path] = [self.cfg.export_path] * batch_size
+        return batch
+
+    def run(self):
+        # 1. Load the dataset
+        logger.info('Loading dataset with Ray...')
+
+        if self.meta:
+            file_content = base64.b64decode(self.meta)
+            lines = file_content.splitlines()
+            dataset = ray.data.from_items([jloads(line) for line in lines])
+        else:
+            dataset = self.load_dataset()
+
+        logger.info('Read data...')
+        dataset = dataset.map(FileExporter().read_file, num_cpus=0.05)
+
+        with open(self.dataset_path, "w", encoding="utf-8") as f:
+            for batch_df in dataset.iter_batches(batch_format="pandas", batch_size=2048):
+                batch_df.to_json(f, orient="records", lines=True, force_ascii=False)
+
+        logger.info('Processing data...')
+        tstart = time.time()
+        try:
+            dj_config = self.client.init_config(self.dataset_path, self.export_path, self.cfg.process)
+            result_path = self.client.execute_config(dj_config)
+
+            processed_dataset = self.load_dataset(result_path)
+            processed_dataset = processed_dataset.map_batches(self.add_column, num_cpus=0.05)
+            processed_dataset = processed_dataset.map(FileExporter().save_file_and_db, num_cpus=0.05)
+            for _ in processed_dataset.iter_batches():
+                pass
+        except Exception as e:
+            logger.error(f"An unexpected error occurred: {e}")
+            raise e
+        tend = time.time()
+        logger.info(f'All Ops are done in {tend - tstart:.3f}s.')
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description="Create API for Submitting Job to Data-juicer")
+    parser.add_argument("--config_path", type=str, required=False, default="../configs/demo.yaml")
+    parser.add_argument("--flow_config", type=str, required=False, default=None)
+
+    args = parser.parse_args()
+
+    config_path = args.config_path
+    flow_config = args.flow_config
+
+    if flow_config:
+        m_cfg = yaml.safe_load(base64.b64decode(flow_config))
+    else:
+        with open(config_path, "r", encoding='utf-8') as f:
+            m_cfg = yaml.safe_load(f)
+
+    executor = DataJuicerExecutor(m_cfg)
+    try:
+        executor.run()
+    except Exception as e:
+        executor.update_db("FAILED")
+        raise e
+    executor.update_db("COMPLETED")
\ No newline at end of file
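A minimal usage sketch of the client above, assuming the datamate-data-juicer service from the compose/k8s manifests is reachable under its service name (the task id and paths are illustrative):

    # Minimal sketch of driving the Data-Juicer service through DataJuicerClient
    # (assumes http://datamate-data-juicer:8000 resolves; paths are illustrative).
    client = DataJuicerClient(base_url="http://datamate-data-juicer:8000")
    process = [{"clean_email_mapper": {}}]  # ops in data-juicer process-list form

    cfg = client.init_config(
        dataset_path="/flow/<task-id>/dataset_on_dj.jsonl",   # written by the executor
        export_path="/flow/<task-id>/processed_dataset.jsonl",
        process=process,
    )
    result_path = client.execute_config(cfg)  # raises RuntimeError on failure
    print(result_path)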
diff --git a/runtime/python-executor/datamate/wrappers/data_juicer_wrapper.py b/runtime/python-executor/datamate/wrappers/data_juicer_wrapper.py
index e1d0544..c40fdf5 100644
--- a/runtime/python-executor/datamate/wrappers/data_juicer_wrapper.py
+++ b/runtime/python-executor/datamate/wrappers/data_juicer_wrapper.py
@@ -1,6 +1,11 @@
 # -*- coding: utf-8 -*-
+import os
+
 from datamate.scheduler import cmd_scheduler


 async def submit(task_id, config_path):
-    await cmd_scheduler.submit(task_id, f"dj-process --config {config_path}")
\ No newline at end of file
+    current_dir = os.path.dirname(__file__)
+
+    await cmd_scheduler.submit(task_id, f"python {os.path.join(current_dir, 'data_juicer_executor.py')} "
+                                        f"--config_path={config_path}")
\ No newline at end of file
diff --git a/runtime/python-executor/datamate/wrappers/datamate_executor.py b/runtime/python-executor/datamate/wrappers/datamate_executor.py
index dedf423..ff818af 100644
--- a/runtime/python-executor/datamate/wrappers/datamate_executor.py
+++ b/runtime/python-executor/datamate/wrappers/datamate_executor.py
@@ -3,21 +3,18 @@
 import base64
 import json
 import time
-from typing import Dict

 import ray
 import yaml
-from jsonargparse import dict_to_namespace, ArgumentParser
+from jsonargparse import ArgumentParser
 from loguru import logger

-from datamate.common.utils import check_valid_path
 from datamate.core.dataset import RayDataset
-from datamate.sql_manager.persistence_atction import TaskInfoPersistence
+from datamate.wrappers.executor import RayExecutor
 import datamate.ops


-class RayExecutor:
+class DataMateExecutor(RayExecutor):
     """
     Ray-based executor.
@@ -25,38 +22,8 @@
     2. Currently only JSON-file datasets are loaded.
     """

-    def __init__(self, cfg=None, meta=None):
-        if isinstance(cfg, Dict):
-            self.cfg = dict_to_namespace(cfg)
-        else:
-            logger.error(f"Please set param: cfg as type Dict, but given cfg as type {type(cfg).__name__}")
-            raise TypeError(f"To params cfg, Dict type is required, but type {type(cfg).__name__} is given!")
-
-        self.cfg.process = cfg['process']
-        self.meta = meta
-
-        # init ray
-        logger.info('Initing Ray ...')
-        ray.init()
-
-    def load_meta(self, line):
-        meta = json.loads(line)
-        if meta.get("fileId"):
-            meta["sourceFileId"] = meta.get("fileId")
-        if meta.get("fileName"):
-            meta["sourceFileName"] = meta.get("fileName")
-        if meta.get("fileType"):
-            meta["sourceFileType"] = meta.get("fileType")
-        if meta.get("fileSize"):
-            meta["sourceFileSize"] = meta.get("fileSize")
-        if not meta.get("totalPageNum"):
-            meta["totalPageNum"] = 0
-        if not meta.get("extraFilePath"):
-            meta["extraFilePath"] = None
-        if not meta.get("extraFileType"):
-            meta["extraFileType"] = None
-        meta["dataset_id"] = self.cfg.dataset_id
-        return meta
+    def __init__(self, cfg=None, meta=None):
+        super().__init__(cfg, meta)

     def run(self):
         # 1. Load the dataset
@@ -77,36 +44,13 @@ class RayExecutor:
         tend = time.time()
         logger.info(f'All Ops are done in {tend - tstart:.3f}s.')

-        dataset.data.materialize()
-
-    def load_dataset(self):
-        retry = 0
-        dataset = None
-        jsonl_file_path = self.cfg.dataset_path
-        while True:
-            if check_valid_path(jsonl_file_path):
-                with open(jsonl_file_path, "r", encoding='utf-8') as meta:
-                    lines = meta.readlines()
-                    dataset = ray.data.from_items([self.load_meta(line) for line in lines])
-                break
-            if retry < 5:
-                retry += 1
-                time.sleep(retry)
-                continue
-            else:
-                logger.error(f"can not load dataset from dataset_path")
-                raise RuntimeError(f"Load dataset Failed!, dataset_path: {self.cfg.dataset_path}.")
-
-        return dataset
-
-    def update_db(self, status):
-        task_info = TaskInfoPersistence()
-        task_info.update_result(self.cfg.dataset_id, self.cfg.instance_id, status)
+        for _ in dataset.data.iter_batches():
+            pass


 if __name__ == '__main__':
-    parser = ArgumentParser(description="Create API for Submitting Job to Data-juicer")
+    parser = ArgumentParser(description="Create API for Submitting Job to Ray")
     parser.add_argument("--config_path", type=str, required=False, default="../configs/demo.yaml")
     parser.add_argument("--flow_config", type=str, required=False, default=None)

@@ -119,10 +63,10 @@
     if flow_config:
         m_cfg = yaml.safe_load(base64.b64decode(flow_config))
     else:
-        with open(config_path, "r", encoding='utf-8') as cfg:
-            m_cfg = yaml.safe_load(cfg)
+        with open(config_path, "r", encoding='utf-8') as f:
+            m_cfg = yaml.safe_load(f)

-    executor = RayExecutor(m_cfg)
+    executor = DataMateExecutor(m_cfg)
     try:
         executor.run()
     except Exception as e:
diff --git a/runtime/python-executor/datamate/wrappers/executor.py b/runtime/python-executor/datamate/wrappers/executor.py
new file mode 100644
index 0000000..9d1a59d
--- /dev/null
+++ b/runtime/python-executor/datamate/wrappers/executor.py
@@ -0,0 +1,80 @@
+import json
+import time
+from typing import Dict
+
+import ray
+from jsonargparse import dict_to_namespace
+from loguru import logger
+
+from datamate.common.utils import check_valid_path
+from datamate.sql_manager.persistence_atction import TaskInfoPersistence
+
+
+class RayExecutor:
+    """
+    Ray-based executor.
+
+    1. Currently only Mapper and Filter operators are supported.
+    2. Currently only JSON-file datasets are loaded.
+    """
+
+    def __init__(self, cfg=None, meta=None):
+        if isinstance(cfg, Dict):
+            self.cfg = dict_to_namespace(cfg)
+        else:
+            logger.error(f"Please set param: cfg as type Dict, but given cfg as type {type(cfg).__name__}")
+            raise TypeError(f"To params cfg, Dict type is required, but type {type(cfg).__name__} is given!")
+
+        self.cfg.process = cfg['process']
+        self.meta = meta
+
+        # init ray
+        logger.info('Initializing Ray ...')
+        ray.init()
+
+    def load_meta(self, line):
+        meta = json.loads(line)
+        if meta.get("fileId"):
+            meta["sourceFileId"] = meta.get("fileId")
+        if meta.get("fileName"):
+            meta["sourceFileName"] = meta.get("fileName")
+        if meta.get("fileType"):
+            meta["sourceFileType"] = meta.get("fileType")
+        if meta.get("fileSize"):
+            meta["sourceFileSize"] = meta.get("fileSize")
+        if not meta.get("totalPageNum"):
+            meta["totalPageNum"] = 0
+        if not meta.get("extraFilePath"):
+            meta["extraFilePath"] = None
+        if not meta.get("extraFileType"):
+            meta["extraFileType"] = None
+        meta["dataset_id"] = self.cfg.dataset_id
+        return meta
+
+    def run(self):
+        pass
+
+    def load_dataset(self, jsonl_file_path=None):
+        retry = 0
+        dataset = None
+        if jsonl_file_path is None:
+            jsonl_file_path = self.cfg.dataset_path
+        while True:
+            if check_valid_path(jsonl_file_path):
+                with open(jsonl_file_path, "r", encoding='utf-8') as meta:
+                    lines = meta.readlines()
+                    dataset = ray.data.from_items([self.load_meta(line) for line in lines])
+                break
+            if retry < 5:
+                retry += 1
+                time.sleep(retry)
+                continue
+            else:
+                logger.error("cannot load dataset from dataset_path")
+                raise RuntimeError(f"Load dataset failed! dataset_path: {self.cfg.dataset_path}.")
+
+        return dataset
+
+    def update_db(self, status):
+        task_info = TaskInfoPersistence()
+        task_info.update_result(self.cfg.dataset_id, self.cfg.instance_id, status)
\ No newline at end of file
diff --git a/runtime/python-executor/pyproject.toml b/runtime/python-executor/pyproject.toml
index d84229d..670b74e 100644
--- a/runtime/python-executor/pyproject.toml
+++ b/runtime/python-executor/pyproject.toml
@@ -6,7 +6,7 @@ authors = [
     { name = "Huawei datamate team" }
 ]
 license = { text = "Apache-2.0" }
-requires-python = ">=3.10"
+requires-python = ">=3.10, <=3.12"
 urls = { repository = "https://github.com/ModelEngine-Group/datamate" }
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
@@ -20,7 +20,7 @@ dependencies = [
     "jsonargparse>=4.44.0",
     "loguru>=0.7.3",
     "opencv-python-headless>=4.12.0.88",
-    "ray[data,default]==2.52.1",
+    "ray[data,default]>=2.52.1",
     "unstructured[csv,docx,pptx,xlsx,pdf,md]==0.18.15",
     "uvicorn[standard]>=0.38.0",
 ]
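RayExecutor.load_meta normalizes each dataset.jsonl line before it enters Ray; a standalone mirror of that mapping (input values hypothetical; dataset_id normally comes from cfg.dataset_id):

    # Standalone mirror of RayExecutor.load_meta for a single dataset.jsonl line
    # (hypothetical input; dataset_id would come from the task config).
    import json

    def load_meta(line: str, dataset_id: str) -> dict:
        meta = json.loads(line)
        for src, dst in [("fileId", "sourceFileId"), ("fileName", "sourceFileName"),
                         ("fileType", "sourceFileType"), ("fileSize", "sourceFileSize")]:
            if meta.get(src):
                meta[dst] = meta[src]
        # like the original, falsy values are also replaced by these defaults
        for key, default in [("totalPageNum", 0), ("extraFilePath", None), ("extraFileType", None)]:
            if not meta.get(key):
                meta[key] = default
        meta["dataset_id"] = dataset_id
        return meta

    print(load_meta('{"fileId": "f-1", "fileName": "a.txt", "fileType": "txt"}', "ds-1"))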
diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql
index 33b4926..195ed89 100644
--- a/scripts/db/data-operator-init.sql
+++ b/scripts/db/data-operator-init.sql
@@ -63,7 +63,11 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'
     ('b5bfc548-8ef6-417c-b8a6-a4197c078249', 'Java', 'java', 'predefined', '873000a2-65b3-474b-8ccc-4813c08c76fb'),
     ('16e2d99e-eafb-44fc-acd0-f35a2bad28f8', '来源', 'origin', 'predefined', '0'),
     ('96a3b07a-3439-4557-a835-525faad60ca3', '系统预置', 'predefined', 'predefined', '16e2d99e-eafb-44fc-acd0-f35a2bad28f8'),
-    ('ec2cdd17-8b93-4a81-88c4-ac9e98d10757', '用户上传', 'customized', 'predefined', '16e2d99e-eafb-44fc-acd0-f35a2bad28f8');
+    ('ec2cdd17-8b93-4a81-88c4-ac9e98d10757', '用户上传', 'customized', 'predefined', '16e2d99e-eafb-44fc-acd0-f35a2bad28f8'),
+    ('0ed75eea-e20b-11f0-88e6-00155d5c9528', '归属', 'vendor', 'predefined', '0'),
+    ('431e7798-5426-4e1a-aae6-b9905a836b34', 'DataMate', 'datamate', 'predefined', '0ed75eea-e20b-11f0-88e6-00155d5c9528'),
+    ('79b385b4-fde8-4617-bcba-02a176938996', 'DataJuicer', 'data-juicer', 'predefined', '0ed75eea-e20b-11f0-88e6-00155d5c9528'),
+    ('f00eaa3e-96c1-4de4-96cd-9848ef5429ec', '其他', 'others', 'predefined', '0ed75eea-e20b-11f0-88e6-00155d5c9528');

 INSERT IGNORE INTO t_operator
 (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
@@ -113,7 +117,7 @@ INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
 SELECT c.id, o.id
 FROM t_operator_category c
 CROSS JOIN t_operator o
-WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
+WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3', '431e7798-5426-4e1a-aae6-b9905a836b34')
   AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
                'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter',
                'FileWithManySensitiveWordsFilter', 'DuplicateFilesFilter', 'DuplicateSentencesFilter',
                'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
@@ -127,7 +131,269 @@ INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
 SELECT c.id, o.id
 FROM t_operator_category c
 CROSS JOIN t_operator o
-WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
+WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3', '431e7798-5426-4e1a-aae6-b9905a836b34')
   AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise', 'ImgDuplicatedImagesCleaner',
                'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation', 'ImgShadowRemove', 'ImgSharpness',
                'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect');
+
+
+INSERT IGNORE INTO t_operator
+(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
+VALUES
+    ('entity_attribute_aggregator', '实体属性聚合器', 'Summarizes a given attribute of an entity from a set of documents. 汇总一组文档中实体的给定属性。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
+    ('meta_tags_aggregator', '元标签聚合器', 'Merge similar meta tags into a single, unified tag. 将类似的元标记合并到一个统一的标记中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
+    ('most_relevant_entities_aggregator', '最相关实体聚合器', 'Extracts and ranks entities closely related to a given entity from provided texts. 从提供的文本中提取与给定实体密切相关的实体并对其进行排名。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
+    ('nested_aggregator', '嵌套聚合器', 'Aggregates nested content from multiple samples into a single summary. 将多个示例中的嵌套内容聚合到单个摘要中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
+    ('document_deduplicator', '文档去重器', 'Deduplicates samples at the document level using exact matching. 使用完全匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
+    ('document_minhash_deduplicator', '文档MinHash去重器', 'Deduplicates samples at the document level using MinHash LSH. 使用MinHash LSH在文档级别删除重复样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
+    ('document_simhash_deduplicator', '文档SimHash去重器', 'Deduplicates samples at the document level using SimHash. 使用SimHash在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
+    ('image_deduplicator', '图像去重器', 'Deduplicates samples at the document level by exact matching of images.
通过图像的精确匹配在文档级别删除重复的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('ray_basic_deduplicator', 'Ray基础去重器', 'Backend for deduplicator. deduplicator的后端。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('ray_bts_minhash_deduplicator', 'Ray BTS MinHash去重器', 'A distributed implementation of Union-Find with load balancing. 具有负载平衡的Union-Find的分布式实现。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('ray_document_deduplicator', 'Ray文档去重器', 'Deduplicates samples at the document level using exact matching in Ray distributed mode. 在Ray分布式模式下使用精确匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('ray_image_deduplicator', 'Ray图像去重器', 'Deduplicates samples at the document level using exact matching of images in Ray distributed mode. 在光线分布模式下使用图像的精确匹配在文档级别删除重复样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('ray_video_deduplicator', 'Ray视频去重器', 'Deduplicates samples at document-level using exact matching of videos in Ray distributed mode. 在Ray分布式模式下使用视频的精确匹配在文档级删除重复样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_deduplicator', '视频去重器', 'Deduplicates samples at the document level using exact matching of videos. 使用视频的精确匹配在文档级别删除重复的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('alphanumeric_filter', '字母数字过滤器', 'Filter to keep samples with an alphabet/numeric ratio within a specific range. 过滤器,以保持具有特定范围内的字母/数字比率的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('audio_duration_filter', '音频时长过滤器', 'Keep data samples whose audio durations are within a specified range. 保留音频持续时间在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), + ('audio_nmf_snr_filter', '音频NMF信噪比过滤器', 'Keep data samples whose audio Signal-to-Noise Ratios (SNRs) are within a specified range. 保留音频信噪比 (snr) 在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), + ('audio_size_filter', '音频大小过滤器', 'Keep data samples based on the size of their audio files. 根据音频文件的大小保留数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), + ('average_line_length_filter', '平均行长过滤器', 'Filter to keep samples with average line length within a specific range. 过滤器,以保持平均线长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('character_repetition_filter', '字符重复过滤器', 'Filter to keep samples with character-level n-gram repetition ratio within a specific range. 过滤器将具有字符级n-gram重复比的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('flagged_words_filter', '标记词过滤器', 'Filter to keep samples with flagged-word ratio in a specified range. 过滤器将标记词比率的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('general_field_filter', '通用字段过滤器', 'Filter to keep samples based on a general field filter condition. 根据常规字段筛选条件保留样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('image_aesthetics_filter', '图像美学过滤器', 'Filter to keep samples with aesthetics scores within a specific range. 过滤以保持美学分数在特定范围内的样品。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_aspect_ratio_filter', '图像长宽比过滤器', 'Filter to keep samples with image aspect ratio within a specific range. 过滤器,以保持样本的图像纵横比在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_face_count_filter', '图像人脸计数过滤器', 'Filter to keep samples with the number of faces within a specific range. 过滤以保持样本的面数在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_face_ratio_filter', '图像人脸占比过滤器', 'Filter to keep samples with face area ratios within a specific range. 
过滤以保持面面积比在特定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_nsfw_filter', '图像NSFW过滤器', 'Filter to keep samples whose images have nsfw scores in a specified range. 过滤器保留其图像的nsfw分数在指定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_pair_similarity_filter', '图像对相似度过滤器', 'Filter to keep image pairs with similarities between images within a specific range. 过滤器将图像之间具有相似性的图像对保持在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_shape_filter', '图像形状过滤器', 'Filter to keep samples with image shape (width, height) within specific ranges. 过滤器,以保持样本的图像形状 (宽度,高度) 在特定的范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_size_filter', '图像大小过滤器', 'Keep data samples whose image size (in Bytes/KB/MB/...) is within a specific range. 保留图像大小 (以字节/KB/MB/... 为单位) 在特定范围内的数据样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_text_matching_filter', '图文匹配过滤器', 'Filter to keep samples with image-text matching scores within a specific range. 过滤器将图像文本匹配分数的样本保持在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('image_text_similarity_filter', '图文相似度过滤器', 'Filter to keep samples with image-text similarity within a specified range. 过滤器将具有图像-文本相似性的样本保持在指定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('image_watermark_filter', '图像水印过滤器', 'Filter to keep samples whose images have no watermark with high probability. 过滤器以保持其图像没有水印的样本具有高概率。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('in_context_influence_filter', '上下文影响过滤器', 'Filter to keep texts based on their in-context influence on a validation set. 过滤以根据文本在上下文中对验证集的影响来保留文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('instruction_following_difficulty_filter', '指令跟随难度过滤器', 'Filter to keep texts based on their instruction following difficulty (IFD, https://arxiv.org/abs/2308.12032) score. 过滤以保持文本基于他们的指令跟随难度 (IFD, https://arxiv.org/abs/ 2308.12032) 分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('language_id_score_filter', '语种识别得分过滤器', 'Filter to keep samples in a specific language with a confidence score above a threshold. 过滤器以保留置信度高于阈值的特定语言的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('llm_analysis_filter', 'LLM分析过滤器', 'Base filter class for leveraging LLMs to analyze and filter data samples. 用于利用LLMs分析和过滤数据样本的基本筛选器类。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('llm_difficulty_score_filter', 'LLM难度得分过滤器', 'Filter to keep samples with high difficulty scores estimated by an LLM. 过滤器以保留由LLM估计的高难度分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('llm_perplexity_filter', 'LLM困惑度过滤器', 'Filter to keep samples with perplexity scores within a specified range, computed using a specified LLM. 过滤器将困惑分数的样本保留在指定范围内,使用指定的LLM计算。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('llm_quality_score_filter', 'LLM质量得分过滤器', 'Filter to keep samples with a high quality score estimated by a language model. 过滤器,以保留具有语言模型估计的高质量分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('llm_task_relevance_filter', 'LLM任务相关性过滤器', 'Filter to keep samples with high relevance scores to validation tasks estimated by an LLM. 过滤器以保留与LLM估计的验证任务具有高相关性分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('maximum_line_length_filter', '最大行长过滤器', 'Filter to keep samples with a maximum line length within a specified range. 筛选器将最大行长度的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('perplexity_filter', '困惑度过滤器', 'Filter to keep samples with perplexity score in a specified range. 
过滤以保持困惑分数在指定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('phrase_grounding_recall_filter', '短语定位召回过滤器', 'Filter to keep samples based on the phrase grounding recall of phrases extracted from text in images. 根据从图像中的文本中提取的短语接地召回来过滤以保留样本。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('special_characters_filter', '特殊字符过滤器', 'Filter to keep samples with special-character ratio within a specific range. 过滤器,以将具有特殊字符比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('specified_field_filter', '指定字段过滤器', 'Filter samples based on the specified field information. 根据指定的字段信息筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('specified_numeric_field_filter', '指定数值字段过滤器', 'Filter samples based on a specified numeric field value. 根据指定的数值字段值筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('stopwords_filter', '停用词过滤器', 'Filter to keep samples with stopword ratio within a specified range. 过滤器将停止词比率的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('suffix_filter', '后缀过滤器', 'Filter to keep samples with specified suffix. 过滤器以保留具有指定后缀的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('text_action_filter', '文本动作过滤器', 'Filter to keep texts that contain a minimum number of actions. 过滤以保留包含最少数量操作的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('text_embd_similarity_filter', '文本嵌入相似度过滤器', 'Filter to keep texts whose average embedding similarity to a set of given validation texts falls within a specific range. 过滤器,以保留与一组给定验证文本的平均嵌入相似度在特定范围内的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('text_entity_dependency_filter', '文本实体依赖过滤器', 'Identify and filter text samples based on entity dependencies. 根据实体依赖关系识别和过滤文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('text_length_filter', '文本长度过滤器', 'Filter to keep samples with total text length within a specific range. 过滤以保持文本总长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('text_pair_similarity_filter', '文本对相似度过滤器', 'Filter to keep text pairs with similarities within a specific range. 过滤以将具有相似性的文本对保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('token_num_filter', 'Token数量过滤器', 'Filter to keep samples with a total token number within a specified range. 筛选器将总令牌数的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('video_aesthetics_filter', '视频美学过滤器', 'Filter to keep data samples with aesthetics scores for specified frames in the videos within a specific range. 过滤器将视频中指定帧的美学得分数据样本保留在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_aspect_ratio_filter', '视频长宽比过滤器', 'Filter to keep samples with video aspect ratio within a specific range. 过滤器将视频纵横比的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_duration_filter', '视频时长过滤器', 'Keep data samples whose videos\' durations are within a specified range. 保留视频持续时间在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_frames_text_similarity_filter', '视频帧文本相似度过滤器', 'Filter to keep samples based on the similarity between video frame images and text within a specific range. 根据视频帧图像和文本之间的相似性进行过滤,以保持样本在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_motion_score_filter', '视频运动得分过滤器', 'Filter to keep samples with video motion scores within a specific range. 过滤器将视频运动分数的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_motion_score_raft_filter', '视频RAFT运动得分过滤器', 'Filter to keep samples with video motion scores within a specified range. 
过滤器将视频运动分数的样本保持在指定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_nsfw_filter', '视频NSFW过滤器', 'Filter to keep samples whose videos have nsfw scores in a specified range. 过滤器以保留其视频的nsfw分数在指定范围内的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_ocr_area_ratio_filter', '视频OCR面积占比过滤器', 'Keep data samples whose detected text area ratios for specified frames in the video are within a specified range. 保留检测到的视频中指定帧的文本面积比率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_resolution_filter', '视频分辨率过滤器', 'Keep data samples whose videos\' resolutions are within a specified range. 保留视频分辨率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_tagging_from_frames_filter', '视频帧标签过滤器', 'Filter to keep samples whose videos contain specified tags. 过滤器以保留其视频包含指定标签的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_watermark_filter', '视频水印过滤器', 'Filter to keep samples whose videos have no watermark with high probability. 过滤器以保持其视频具有高概率没有水印的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('word_repetition_filter', '单词重复过滤器', 'Filter to keep samples with word-level n-gram repetition ratio within a specific range. 过滤器将单词级n-gram重复比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('words_num_filter', '词数过滤器', 'Filter to keep samples with a total word count within a specified range. 过滤器将样本的总字数保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('key_value_grouper', '键值分组器', 'Groups samples into batches based on values in specified keys. 根据指定键中的值将样本分组为批处理。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('naive_grouper', '朴素分组器', 'Group all samples in a dataset into a single batched sample. 将数据集中的所有样本分组为单个批处理样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('naive_reverse_grouper', '朴素反向分组器', 'Split batched samples into individual samples. 将批处理的样品分成单个样品。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('audio_add_gaussian_noise_mapper', '音频高斯噪声添加映射器', 'Mapper to add Gaussian noise to audio samples. 映射器将高斯噪声添加到音频样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), + ('audio_ffmpeg_wrapped_mapper', '音频FFmpeg封装映射器', 'Wraps FFmpeg audio filters for processing audio files in a dataset. 包装FFmpeg音频过滤器,用于处理数据集中的音频文件。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false), + ('calibrate_qa_mapper', 'QA校准映射器', 'Calibrates question-answer pairs based on reference text using an API model. 使用API模型根据参考文本校准问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('calibrate_query_mapper', '查询校准映射器', 'Calibrate query in question-answer pairs based on reference text. 基于参考文本校准问答对中的查询。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('calibrate_response_mapper', '回复校准映射器', 'Calibrate response in question-answer pairs based on reference text. 根据参考文本校准问答对中的回答。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('chinese_convert_mapper', '中文简繁转换映射器', 'Mapper to convert Chinese text between Traditional, Simplified, and Japanese Kanji. 映射器在繁体、简体和日文汉字之间转换中文文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('clean_copyright_mapper', '版权清洗映射器', 'Cleans copyright comments at the beginning of text samples. 清除文本示例开头的版权注释。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('clean_email_mapper', '邮箱清洗映射器', 'Cleans email addresses from text samples using a regular expression. 使用正则表达式从文本示例中清除电子邮件地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('clean_html_mapper', 'HTML清洗映射器', 'Cleans HTML code from text samples, converting HTML to plain text. 
从文本示例中清除HTML代码,将HTML转换为纯文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('clean_ip_mapper', 'IP清洗映射器', 'Cleans IPv4 and IPv6 addresses from text samples. 从文本示例中清除IPv4和IPv6地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('clean_links_mapper', '链接清洗映射器', 'Mapper to clean links like http/https/ftp in text samples. 映射器来清理链接,如文本示例中的http/https/ftp。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('detect_character_attributes_mapper', '角色属性检测映射器', 'Takes an image, a caption, and main character names as input to extract the characters\' attributes. 根据给定的图像、图像描述信息和(多个)角色名称,提取图像中主要角色的属性。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('detect_character_locations_mapper', '角色位置检测映射器', 'Given an image and a list of main character names, extract the bounding boxes for each present character. 给定一张图像和主要角色的名称列表,提取每个在场角色的边界框。(YOLOE + MLLM)', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('detect_main_character_mapper', '主要角色检测映射器', 'Extract all main character names based on the given image and its caption. 根据给定的图像及其图像描述,提取所有主要角色的名字。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('dialog_intent_detection_mapper', '对话意图检测映射器', 'Generates user\'s intent labels in a dialog by analyzing the history, query, and response. 通过分析历史记录、查询和响应,在对话框中生成用户的意图标签。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('dialog_sentiment_detection_mapper', '对话情感检测映射器', 'Generates sentiment labels and analysis for user queries in a dialog. 在对话框中为用户查询生成情绪标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('dialog_sentiment_intensity_mapper', '对话情感强度映射器', 'Mapper to predict user\'s sentiment intensity in a dialog, ranging from -5 to 5. Mapper预测用户在对话框中的情绪强度,范围从-5到5。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('dialog_topic_detection_mapper', '对话主题检测映射器', 'Generates user\'s topic labels and analysis in a dialog. 在对话框中生成用户的主题标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('download_file_mapper', '文件下载映射器', 'Mapper to download URL files to local files or load them into memory. 映射器将URL文件下载到本地文件或将其加载到内存中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('expand_macro_mapper', '宏展开映射器', 'Expands macro definitions in the document body of LaTeX samples. 展开LaTeX示例文档主体中的宏定义。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('extract_entity_attribute_mapper', '实体属性提取映射器', 'Extracts attributes for given entities from the text and stores them in the sample\'s metadata. 从文本中提取给定实体的属性,并将其存储在示例的元数据中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('extract_entity_relation_mapper', '实体关系提取映射器', 'Extracts entities and relations from text to build a knowledge graph. 从文本中提取实体和关系以构建知识图谱。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('extract_event_mapper', '事件提取映射器', 'Extracts events and relevant characters from the text. 从文本中提取事件和相关字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('extract_keyword_mapper', '关键词提取映射器', 'Generate keywords for the text. 为文本生成关键字。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('extract_nickname_mapper', '昵称提取映射器', 'Extracts nickname relationships in the text using a language model. 使用语言模型提取文本中的昵称关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('extract_support_text_mapper', '支撑文本提取映射器', 'Extracts a supporting sub-text from the original text based on a given summary. 根据给定的摘要从原始文本中提取支持子文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('extract_tables_from_html_mapper', 'HTML表格提取映射器', 'Extracts tables from HTML content and stores them in a specified field. 
从HTML内容中提取表格并将其存储在指定字段中。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('fix_unicode_mapper', 'Unicode修复映射器', 'Fixes unicode errors in text samples. 修复文本样本中的unicode错误。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('generate_qa_from_examples_mapper', '示例生成QA映射器', 'Generates question and answer pairs from examples using a Hugging Face model. 使用Hugging Face模型从示例生成问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('generate_qa_from_text_mapper', '文本生成QA映射器', 'Generates question and answer pairs from text using a specified model. 使用指定的模型从文本生成问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('image_blur_mapper', '图像模糊映射器', 'Blurs images in the dataset with a specified probability and blur type. 使用指定的概率和模糊类型对数据集中的图像进行模糊处理。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_captioning_from_gpt4v_mapper', 'GPT4V图像描述映射器', 'Generates text captions for images using the GPT-4 Vision model. 使用GPT-4 Vision模型为图像生成文本描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('image_captioning_mapper', '图像描述映射器', 'Generates image captions using a Hugging Face model and appends them to samples. 使用Hugging Face模型生成图像描述,并将其附加到样本中。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('image_detection_yolo_mapper', 'YOLO图像检测映射器', 'Perform object detection using YOLO on images and return bounding boxes and class labels. 使用YOLO对图像执行目标检测,并返回边界框和类别标签。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_diffusion_mapper', '图像扩散生成映射器', 'Generate images using a diffusion model based on provided captions. 根据提供的图像描述,使用扩散模型生成图像。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('image_face_blur_mapper', '图像人脸模糊映射器', 'Mapper to blur faces detected in images. 映射器用于模糊图像中检测到的人脸。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_remove_background_mapper', '图像去背景映射器', 'Mapper to remove the background of images. 映射器用于移除图像的背景。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_segment_mapper', '图像分割映射器', 'Perform segment-anything on images and return the bounding boxes. 对图像执行segment-anything分割并返回边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('image_tagging_mapper', '图像打标映射器', 'Generates image tags for each image in the sample. 为样本中的每个图像生成图像标签。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('imgdiff_difference_area_generator_mapper', 'ImgDiff差异区域生成映射器', 'Generates and filters bounding boxes for image pairs based on similarity, segmentation, and text matching. 根据相似性、分割和文本匹配生成和过滤图像对的边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false), + ('imgdiff_difference_caption_generator_mapper', 'ImgDiff差异描述生成映射器', 'Generates difference captions for bounding box regions in two images. 为两个图像中的边界框区域生成差异描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('mllm_mapper', 'MLLM视觉问答映射器', 'Mapper to use MLLMs for visual question answering tasks. 映射器使用MLLM执行视觉问答任务。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('nlpaug_en_mapper', 'NLPAug英语增强映射器', 'Augments English text samples using various methods from the nlpaug library. 使用nlpaug库中的各种方法增强英语文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('nlpcda_zh_mapper', 'NLPCDA中文增强映射器', 'Augments Chinese text samples using the nlpcda library. 使用nlpcda库扩充中文文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('optimize_prompt_mapper', 'Prompt优化映射器', 'Optimize prompts based on existing ones in the same batch. 
根据同一批次中的现有提示优化提示。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('optimize_qa_mapper', 'QA优化映射器', 'Mapper to optimize question-answer pairs. 映射器用于优化问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('optimize_query_mapper', '查询优化映射器', 'Optimize queries in question-answer pairs to make them more specific and detailed. 优化问答对中的查询,使其更加具体和详细。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('optimize_response_mapper', '回复优化映射器', 'Optimize response in question-answer pairs to be more detailed and specific. 优化问答对中的回复,使其更加详细和具体。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('pair_preference_mapper', '配对偏好映射器', 'Mapper to construct paired preference samples by generating a rejected response and its reason. 映射器通过生成被拒绝的回复及其原因来构造成对的偏好样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('punctuation_normalization_mapper', '标点归一化映射器', 'Normalizes unicode punctuations to their English equivalents in text samples. 将文本样本中的unicode标点规范化为对应的英文标点。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('python_file_mapper', 'Python文件映射器', 'Executes a Python function defined in a file on input data. 对输入数据执行文件中定义的Python函数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('python_lambda_mapper', 'Python Lambda映射器', 'Mapper for applying a Python lambda function to data samples. 映射器用于将Python lambda函数应用于数据样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('query_intent_detection_mapper', '查询意图检测映射器', 'Predicts the user\'s intent label and corresponding score for a given query. 为给定查询预测用户的意图标签和相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('query_sentiment_detection_mapper', '查询情感检测映射器', 'Predicts user\'s sentiment label (\'negative\', \'neutral\', \'positive\') in a query. 预测查询中用户的情感标签(“负面”、“中性”、“正面”)。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('query_topic_detection_mapper', '查询主题检测映射器', 'Predicts the topic label and its corresponding score for a given query. 预测给定查询的主题标签及其相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('relation_identity_mapper', '关系识别映射器', 'Identify the relation between two entities in a given text. 确定给定文本中两个实体之间的关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_bibliography_mapper', '参考书目移除映射器', 'Removes bibliography sections at the end of LaTeX documents. 删除LaTeX文档末尾的参考书目部分。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_comments_mapper', '注释移除映射器', 'Removes comments from documents, currently supporting only \'tex\' format. 从文档中删除注释,当前仅支持 \'tex\' 格式。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_header_mapper', '页眉移除映射器', 'Removes headers at the beginning of documents in LaTeX samples. 删除LaTeX样本中文档开头的页眉。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_long_words_mapper', '长词移除映射器', 'Mapper to remove long words within a specific range. 映射器用于删除特定范围内的长词。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_non_chinese_character_mapper', '非中文字符移除映射器', 'Removes non-Chinese characters from text samples. 从文本样本中删除非中文字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_repeat_sentences_mapper', '重复句移除映射器', 'Mapper to remove repeat sentences in text samples. 映射器用于删除文本样本中的重复句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_specific_chars_mapper', '指定字符移除映射器', 'Removes specific characters from text samples. 从文本样本中删除指定字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_table_text_mapper', '表格文本移除映射器', 'Mapper to remove table texts from text samples. 
映射器用于从文本样本中删除表格文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('remove_words_with_incorrect_substrings_mapper', '错误子串单词移除映射器', 'Mapper to remove words containing specified incorrect substrings. 映射器用于删除包含指定错误子字符串的单词。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('replace_content_mapper', '内容替换映射器', 'Replaces content in the text that matches a specific regular expression pattern with a designated replacement string. 用指定的替换字符串替换文本中与特定正则表达式模式匹配的内容。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('sdxl_prompt2prompt_mapper', 'SDXL Prompt2Prompt映射器', 'Generates pairs of similar images using the SDXL model. 使用SDXL模型生成成对的相似图像。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('sentence_augmentation_mapper', '句子增强映射器', 'Augments sentences by generating enhanced versions using a Hugging Face model. 通过使用Hugging Face模型生成增强版本来增强句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('sentence_split_mapper', '句子切分映射器', 'Splits text samples into individual sentences based on the specified language. 根据指定的语言将文本样本拆分为单个句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('text_chunk_mapper', '文本分块映射器', 'Split input text into chunks based on specified criteria. 根据指定的条件将输入文本拆分为块。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('text_tagging_by_prompt_mapper', 'Prompt文本打标映射器', 'Mapper to generate text tags using prompt with LLM. 映射器使用prompt配合LLM生成文本标签。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('vggt_mapper', 'VGGT视频提取映射器', 'Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 输入单个场景的视频,并使用VGGT提取包括相机姿态、深度图、点图和3D点轨迹的信息。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_captioning_from_audio_mapper', '音频生成视频描述映射器', 'Mapper to caption a video according to its audio streams based on Qwen-Audio model. 映射器基于Qwen-Audio模型,根据视频的音频流为视频生成描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_captioning_from_frames_mapper', '帧生成视频描述映射器', 'Generates video captions from sampled frames using an image-to-text model. 使用图像到文本模型从采样帧生成视频描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_captioning_from_summarizer_mapper', '摘要生成视频描述映射器', 'Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...). 映射器通过总结几种生成的文本(来自视频/音频/帧的描述,来自音频/帧的标签,...)来生成视频描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_captioning_from_video_mapper', '视频生成视频描述映射器', 'Generates video captions using a Hugging Face video-to-text model and sampled video frames. 使用Hugging Face视频到文本模型和采样视频帧生成视频描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_captioning_from_vlm_mapper', 'VLM视频描述映射器', 'Generates video captions using a VLM that accepts videos as inputs. 使用接受视频作为输入的VLM生成视频描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_depth_estimation_mapper', '视频深度估计映射器', 'Perform depth estimation on the video. 对视频进行深度估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_extract_frames_mapper', '视频抽帧映射器', 'Mapper to extract frames from video files according to specified methods. 映射器根据指定的方法从视频文件中提取帧。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_face_blur_mapper', '视频人脸模糊映射器', 'Mapper to blur faces detected in videos. 
映射器用于模糊视频中检测到的人脸。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_ffmpeg_wrapped_mapper', '视频FFmpeg封装映射器', 'Wraps FFmpeg video filters for processing video files in a dataset. 封装FFmpeg视频滤镜,用于处理数据集中的视频文件。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_hand_reconstruction_mapper', '视频手部重建映射器', 'Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_object_segmenting_mapper', '视频对象分割映射器', 'Text-guided semantic segmentation of valid objects throughout the video (YOLOE + SAM2). 在整个视频中对有效对象进行文本引导的语义分割(YOLOE + SAM2)。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_remove_watermark_mapper', '视频去水印映射器', 'Remove watermarks from videos based on specified regions. 根据指定区域从视频中删除水印。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_resize_aspect_ratio_mapper', '视频宽高比调整映射器', 'Resizes videos to fit within a specified aspect ratio range. 调整视频大小以适应指定的宽高比范围。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_resize_resolution_mapper', '视频分辨率调整映射器', 'Resizes video resolution based on specified width and height constraints. 根据指定的宽度和高度限制调整视频分辨率。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_split_by_duration_mapper', '视频按时长切分映射器', 'Splits videos into segments based on a specified duration. 根据指定的持续时间将视频拆分为多个片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_split_by_key_frame_mapper', '视频关键帧切分映射器', 'Splits a video into segments based on key frames. 根据关键帧将视频分割为多个片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_split_by_scene_mapper', '视频场景切分映射器', 'Splits videos into scene clips based on detected scene changes. 根据检测到的场景变化将视频拆分为场景剪辑。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false), + ('video_tagging_from_audio_mapper', '音频视频打标映射器', 'Generates video tags from audio streams using the Audio Spectrogram Transformer. 使用音频频谱图转换器从音频流生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_tagging_from_frames_mapper', '帧视频打标映射器', 'Generates video tags from frames extracted from videos. 根据从视频中提取的帧生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('video_whole_body_pose_estimation_mapper', '视频全身姿态估计映射器', 'Input a video containing people, and use the DWPose model to extract the body, hand, feet, and face keypoints of the human subjects in the video, i.e., 2D Whole-body Pose Estimation. 输入包含人的视频,并使用DWPose模型来提取视频中人类主体的身体、手、脚和面部关键点,即2D全身姿态估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false), + ('whitespace_normalization_mapper', '空白字符归一化映射器', 'Normalizes various types of whitespace characters to standard spaces in text samples. 将文本样本中各种类型的空白字符规范化为标准空格。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('frequency_specified_field_selector', '频率指定字段选择器', 'Selector to filter samples based on the frequency of a specified field. 选择器根据指定字段的频率过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('random_selector', '随机选择器', 'Randomly selects a subset of samples from the dataset. 从数据集中随机选择样本子集。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('range_specified_field_selector', '范围指定字段选择器', 'Selects a range of samples based on the sorted values of a specified field. 根据指定字段的排序值选择一个范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('tags_specified_field_selector', '标签指定字段选择器', 'Selector to filter samples based on the tags of a specified field. 
选择器根据指定字段的标签过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false), + ('topk_specified_field_selector', 'TopK指定字段选择器', 'Selects top samples based on the sorted values of a specified field. 根据指定字段的排序值选择排名靠前的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false); + +INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id) +SELECT c.id, o.id +FROM t_operator_category c + CROSS JOIN t_operator o +WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', + '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996') + AND o.id IN + ('entity_attribute_aggregator', 'meta_tags_aggregator', 'most_relevant_entities_aggregator', 'nested_aggregator', + 'document_deduplicator', 'document_minhash_deduplicator', 'document_simhash_deduplicator', + 'ray_basic_deduplicator', 'ray_bts_minhash_deduplicator', 'ray_document_deduplicator', 'alphanumeric_filter', + 'average_line_length_filter', 'character_repetition_filter', 'flagged_words_filter', 'general_field_filter', + 'in_context_influence_filter', 'instruction_following_difficulty_filter', 'language_id_score_filter', + 'llm_analysis_filter', 'llm_difficulty_score_filter', 'llm_perplexity_filter', 'llm_quality_score_filter', + 'llm_task_relevance_filter', 'maximum_line_length_filter', 'perplexity_filter', 'special_characters_filter', + 'specified_field_filter', 'specified_numeric_field_filter', 'stopwords_filter', 'suffix_filter', + 'text_action_filter', 'text_embd_similarity_filter', 'text_entity_dependency_filter', 'text_length_filter', + 'text_pair_similarity_filter', 'token_num_filter', 'word_repetition_filter', 'words_num_filter', + 'key_value_grouper', 'naive_grouper', 'naive_reverse_grouper', 'calibrate_qa_mapper', 'calibrate_query_mapper', + 'calibrate_response_mapper', 'chinese_convert_mapper', 'clean_copyright_mapper', 'clean_email_mapper', + 'clean_html_mapper', 'clean_ip_mapper', 'clean_links_mapper', 'dialog_intent_detection_mapper', + 'dialog_sentiment_detection_mapper', 'dialog_sentiment_intensity_mapper', 'dialog_topic_detection_mapper', + 'download_file_mapper', 'expand_macro_mapper', 'extract_entity_attribute_mapper', + 'extract_entity_relation_mapper', 'extract_event_mapper', 'extract_keyword_mapper', 'extract_nickname_mapper', + 'extract_support_text_mapper', 'extract_tables_from_html_mapper', 'fix_unicode_mapper', + 'generate_qa_from_examples_mapper', 'generate_qa_from_text_mapper', 'nlpaug_en_mapper', 'nlpcda_zh_mapper', + 'optimize_prompt_mapper', 'optimize_qa_mapper', 'optimize_query_mapper', 'optimize_response_mapper', + 'pair_preference_mapper', 'punctuation_normalization_mapper', 'python_file_mapper', 'python_lambda_mapper', + 'query_intent_detection_mapper', 'query_sentiment_detection_mapper', 'query_topic_detection_mapper', + 'relation_identity_mapper', 'remove_bibliography_mapper', 'remove_comments_mapper', 'remove_header_mapper', + 'remove_long_words_mapper', 'remove_non_chinese_character_mapper', 'remove_repeat_sentences_mapper', + 'remove_specific_chars_mapper', 'remove_table_text_mapper', 'remove_words_with_incorrect_substrings_mapper', + 'replace_content_mapper', 'sdxl_prompt2prompt_mapper', 'sentence_augmentation_mapper', 'sentence_split_mapper', + 'text_chunk_mapper', 'text_tagging_by_prompt_mapper', 'whitespace_normalization_mapper', + 'frequency_specified_field_selector', 'random_selector', 'range_specified_field_selector', + 'tags_specified_field_selector', 'topk_specified_field_selector'); + +INSERT IGNORE INTO 
t_operator_category_relation(category_id, operator_id) +SELECT c.id, o.id +FROM t_operator_category c + CROSS JOIN t_operator o +WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', + '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996') + AND o.id IN ('image_deduplicator', 'ray_image_deduplicator', 'image_aesthetics_filter', 'image_aspect_ratio_filter', + 'image_face_count_filter', 'image_face_ratio_filter', 'image_nsfw_filter', + 'image_pair_similarity_filter', 'image_shape_filter', 'image_size_filter', 'image_watermark_filter', + 'image_blur_mapper', 'image_detection_yolo_mapper', 'image_face_blur_mapper', + 'image_remove_background_mapper', 'image_segment_mapper', 'image_tagging_mapper', + 'imgdiff_difference_area_generator_mapper'); + +INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id) +SELECT c.id, o.id +FROM t_operator_category c + CROSS JOIN t_operator o +WHERE c.id IN ('42dd9392-73e4-458c-81ff-41751ada47b5', '9eda9d5d-072b-499b-916c-797a0a8750e1', + '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996') + AND o.id IN ('audio_duration_filter', 'audio_nmf_snr_filter', 'audio_size_filter', 'audio_add_gaussian_noise_mapper', + 'audio_ffmpeg_wrapped_mapper'); + +INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id) +SELECT c.id, o.id +FROM t_operator_category c + CROSS JOIN t_operator o +WHERE c.id IN ('a233d584-73c8-4188-ad5d-8f7c8dda9c27', '9eda9d5d-072b-499b-916c-797a0a8750e1', + '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996') + AND o.id IN ('ray_video_deduplicator', 'video_deduplicator', 'video_aesthetics_filter', 'video_aspect_ratio_filter', + 'video_duration_filter', 'video_motion_score_filter', 'video_motion_score_raft_filter', + 'video_nsfw_filter', 'video_ocr_area_ratio_filter', 'video_resolution_filter', + 'video_tagging_from_frames_filter', 'video_watermark_filter', 'vggt_mapper', + 'video_depth_estimation_mapper', 'video_face_blur_mapper', 'video_ffmpeg_wrapped_mapper', + 'video_hand_reconstruction_mapper', 'video_object_segmenting_mapper', 'video_remove_watermark_mapper', + 'video_resize_aspect_ratio_mapper', 'video_resize_resolution_mapper', 'video_tagging_from_audio_mapper', + 'video_tagging_from_frames_mapper', 'video_whole_body_pose_estimation_mapper'); + +INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id) +SELECT c.id, o.id +FROM t_operator_category c + CROSS JOIN t_operator o +WHERE c.id IN ('4d7dbd77-0a92-44f3-9056-2cd62d4a71e4', '9eda9d5d-072b-499b-916c-797a0a8750e1', + '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996') + AND o.id IN ('image_text_matching_filter', 'image_text_similarity_filter', 'phrase_grounding_recall_filter', + 'video_frames_text_similarity_filter', 'detect_character_attributes_mapper', + 'detect_character_locations_mapper', 'detect_main_character_mapper', + 'image_captioning_from_gpt4v_mapper', 'image_captioning_mapper', 'image_diffusion_mapper', + 'imgdiff_difference_caption_generator_mapper', 'mllm_mapper', 'video_captioning_from_audio_mapper', + 'video_captioning_from_frames_mapper', 'video_captioning_from_summarizer_mapper', + 'video_captioning_from_video_mapper', 'video_captioning_from_vlm_mapper', 'video_extract_frames_mapper', + 'video_split_by_duration_mapper', 'video_split_by_key_frame_mapper', 'video_split_by_scene_mapper'); diff --git a/scripts/db/setting-management-init.sql b/scripts/db/setting-management-init.sql 
index 4cde511..82b5b4a 100644 --- a/scripts/db/setting-management-init.sql +++ b/scripts/db/setting-management-init.sql @@ -44,4 +44,4 @@ values ('sys.knowledge.base.count', '200', 'number', '10,200,500', '知识库最 ('BRAVE_SEARCH_API_KEY', 'api-xxx', 'string', '', 'deer-flow使用的搜索引擎所需的apiKey', 1, 1, 1, 'system', 'system'), ('JINA_API_KEY', '', 'string', '', 'deer-flow使用的JINA搜索引擎所需的apiKey', 1, 1, 1, 'system', 'system'), ('sys.management.dataset.pvc.name', 'datamate-dataset-pvc', 'string', '', '数据集所在pvc名称', 1, 0, 1, 'system', 'system'), - ('test_bool', 'true', 'boolean', '', '测试布尔值', 1, 1, 1, 'system', 'system'); + ('DATA_JUICER_EXECUTOR', 'default', 'string', 'default,ray', 'data-juicer使用的执行器', 1, 1, 1, 'system', 'system'); diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile index 1af18ab..5685d0f 100644 --- a/scripts/images/runtime/Dockerfile +++ b/scripts/images/runtime/Dockerfile @@ -16,12 +16,14 @@ COPY runtime/ops/user /opt/runtime/user COPY scripts/images/runtime/start.sh /opt/runtime/start.sh ENV PYTHONPATH=/opt/runtime/datamate/ +ENV UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV UV_INDEX_STRATEGY=unsafe-best-match WORKDIR /opt/runtime RUN --mount=type=cache,target=/root/.cache/uv \ - UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -e . --system --index-strategy unsafe-best-match \ - && UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \ + uv pip install -e .[all] --system \ + && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \ && uv pip uninstall torch torchvision --system \ && python -m spacy download zh_core_web_sm \ && echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth
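Note on verifying the seed data: once the migrations above have run, the category links created by the INSERT IGNORE ... SELECT statements in scripts/db/data-operator-init.sql can be sanity-checked with a query like the following. This is a minimal sketch, not part of the patch; it uses only the t_operator_category_relation columns named above, and the alias and ORDER BY clause are illustrative.

-- Count how many operators were linked to each category; the category ids
-- are the UUID literals used in the WHERE clauses of the inserts above.
SELECT category_id, COUNT(*) AS operator_count
FROM t_operator_category_relation
GROUP BY category_id
ORDER BY operator_count DESC;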