feat: 支持运行data-juicer算子 (#215)

* feat: 增加data-juicer算子

* feat: 支持运行data-juicer算子

* feat: 支持data-juicer任务下发

* feat: 支持data-juicer结果数据集归档

* feat: 支持data-juicer结果数据集归档
This commit is contained in:
hhhhsc701
2025-12-31 09:20:41 +08:00
committed by GitHub
parent 63f4e3e447
commit 6a1eb85e8e
26 changed files with 709 additions and 120 deletions

View File

@@ -238,7 +238,7 @@ endif
# ========== Docker Install/Uninstall Targets ==========
# Valid service targets for docker install/uninstall
VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" milvus "label-studio"
VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" milvus "label-studio" "data-juicer" dj
# Generic docker service install target
.PHONY: %-docker-install
@@ -263,6 +263,8 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" mi
REGISTRY=$(REGISTRY) docker compose -f deployment/docker/deer-flow/docker-compose.yml up -d; \
elif [ "$*" = "milvus" ]; then \
docker compose -f deployment/docker/milvus/docker-compose.yml up -d; \
elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
REGISTRY=$(REGISTRY) && docker compose -f deployment/docker/datamate/docker-compose.yml up -d datamate-data-juicer; \
else \
$(call docker-compose-service,$*,up -d,deployment/docker/datamate); \
fi
@@ -300,6 +302,8 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" mi
else \
docker compose -f deployment/docker/milvus/docker-compose.yml down; \
fi; \
elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
$(call docker-compose-service,datamate-data-juicer,down,deployment/docker/datamate); \
else \
$(call docker-compose-service,$*,down,deployment/docker/datamate); \
fi
@@ -307,7 +311,7 @@ VALID_SERVICE_TARGETS := datamate backend frontend runtime mineru "deer-flow" mi
# ========== Kubernetes Install/Uninstall Targets ==========
# Valid k8s targets
VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio
VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio data-juicer dj
# Generic k8s install target
.PHONY: %-k8s-install
@@ -334,6 +338,8 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio
helm upgrade milvus deployment/helm/milvus -n $(NAMESPACE) --install; \
elif [ "$*" = "label-studio" ]; then \
helm upgrade label-studio deployment/helm/label-studio -n $(NAMESPACE) --install; \
elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
kubectl apply -f deployment/kubernetes/data-juicer/deploy.yaml -n $(NAMESPACE); \
fi
# Generic k8s uninstall target
@@ -357,6 +363,8 @@ VALID_K8S_TARGETS := mineru datamate deer-flow milvus label-studio
helm uninstall milvus -n $(NAMESPACE) --ignore-not-found; \
elif [ "$*" = "label-studio" ]; then \
helm uninstall label-studio -n $(NAMESPACE) --ignore-not-found; \
elif [ "$*" = "data-juicer" ] || [ "$*" = "dj" ]; then \
kubectl delete -f deployment/kubernetes/data-juicer/deploy.yaml -n $(NAMESPACE); \
fi
# ========== Upgrade Targets ==========

View File

@@ -107,6 +107,8 @@ public class CleaningTaskService {
cleanTaskValidator.checkNameDuplication(request.getName());
cleanTaskValidator.checkInputAndOutput(request.getInstance());
ExecutorType executorType = cleanTaskValidator.checkAndGetExecutorType(request.getInstance());
CreateDatasetRequest createDatasetRequest = new CreateDatasetRequest();
createDatasetRequest.setName(request.getDestDatasetName());
createDatasetRequest.setDatasetType(DatasetType.valueOf(request.getDestDatasetType()));
@@ -131,7 +133,7 @@ public class CleaningTaskService {
operatorInstanceRepo.insertInstance(taskId, request.getInstance());
prepareTask(task, request.getInstance());
prepareTask(task, request.getInstance(), executorType);
scanDataset(taskId, request.getSrcDatasetId());
taskScheduler.executeTask(taskId);
return task;
@@ -209,20 +211,20 @@ public class CleaningTaskService {
taskScheduler.executeTask(taskId);
}
private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances) {
private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances, ExecutorType executorType) {
List<OperatorDto> allOperators = operatorRepo.findAllOperators();
Map<String, OperatorDto> defaultSettings = allOperators.stream()
Map<String, OperatorDto> operatorDtoMap = allOperators.stream()
.collect(Collectors.toMap(OperatorDto::getId, Function.identity()));
TaskProcess process = new TaskProcess();
process.setInstanceId(task.getId());
process.setDatasetId(task.getDestDatasetId());
process.setExecutorType(executorType.getValue());
process.setDatasetPath(FLOW_PATH + "/" + task.getId() + "/dataset.jsonl");
process.setExportPath(DATASET_PATH + "/" + task.getDestDatasetId());
process.setExecutorType(ExecutorType.DATAMATE.getValue());
process.setProcess(instances.stream()
.map(instance -> {
OperatorDto operatorDto = defaultSettings.get(instance.getId());
OperatorDto operatorDto = operatorDtoMap.get(instance.getId());
Map<String, Object> stringObjectMap = getDefaultValue(operatorDto);
stringObjectMap.putAll(instance.getOverrides());
Map<String, Object> runtime = getRuntime(operatorDto);
@@ -240,7 +242,7 @@ public class CleaningTaskService {
options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK);
Yaml yaml = new Yaml(options);
File file = new File(FLOW_PATH + "/" + process.getInstanceId() + "/process.yaml");
File file = new File(FLOW_PATH + "/" + task.getId() + "/process.yaml");
file.getParentFile().mkdirs();
try (FileWriter writer = new FileWriter(file)) {

View File

@@ -77,6 +77,7 @@ public class CleaningTemplateService {
@Transactional
public CleaningTemplateDto createTemplate(CreateCleaningTemplateRequest request) {
cleanTaskValidator.checkInputAndOutput(request.getInstance());
cleanTaskValidator.checkAndGetExecutorType(request.getInstance());
CleaningTemplateDto template = new CleaningTemplateDto();
String templateId = UUID.randomUUID().toString();
template.setId(templateId);

View File

@@ -12,7 +12,11 @@ public enum CleanErrorCode implements ErrorCode {
*/
DUPLICATE_TASK_NAME("clean.0001", "清洗任务名称重复"),
IN_AND_OUT_NOT_MATCH("clean.0002", "算子输入输出不匹配");
OPERATOR_LIST_EMPTY("clean.0002", "任务列表为空"),
IN_AND_OUT_NOT_MATCH("clean.0003", "算子输入输出不匹配"),
EXECUTOR_NOT_MATCH("clean.0004", "算子执行器不匹配");
private final String code;
private final String message;

View File

@@ -1,11 +1,15 @@
package com.datamate.cleaning.infrastructure.validator;
import com.datamate.cleaning.common.enums.ExecutorType;
import com.datamate.cleaning.common.exception.CleanErrorCode;
import com.datamate.cleaning.domain.repository.CleaningTaskRepository;
import com.datamate.cleaning.interfaces.dto.OperatorInstanceDto;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.common.setting.application.SysParamApplicationService;
import com.datamate.operator.domain.contants.OperatorConstant;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
@@ -19,6 +23,8 @@ import java.util.regex.Pattern;
public class CleanTaskValidator {
private final CleaningTaskRepository cleaningTaskRepo;
private final SysParamApplicationService sysParamApplicationService;
private final Pattern UUID_PATTERN = Pattern.compile(
"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
);
@@ -51,4 +57,28 @@ public class CleanTaskValidator {
throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER);
}
}
public ExecutorType checkAndGetExecutorType(List<OperatorInstanceDto> operators) {
if (operators == null || operators.isEmpty()) {
throw BusinessException.of(CleanErrorCode.OPERATOR_LIST_EMPTY);
}
for (int i = 1; i < operators.size(); i++) {
OperatorInstanceDto front = operators.get(i - 1);
OperatorInstanceDto back = operators.get(i);
boolean frontHas = CollectionUtils.isNotEmpty(front.getCategories())
&& front.getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID);
boolean backHas = CollectionUtils.isNotEmpty(back.getCategories())
&& back.getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID);
if (frontHas == backHas) {
continue;
}
throw BusinessException.of(CleanErrorCode.EXECUTOR_NOT_MATCH,
String.format(Locale.ROOT, "ops(name: [%s, %s]) executor does not match",
front.getName(), back.getName()));
}
if (operators.getFirst().getCategories().contains(OperatorConstant.CATEGORY_DATA_JUICER_ID)) {
return ExecutorType.fromValue(sysParamApplicationService.getParamByKey("DATA_JUICER_EXECUTOR"));
}
return ExecutorType.DATAMATE;
}
}

View File

@@ -24,7 +24,7 @@ public class OperatorInstanceDto {
private String outputs;
private List<Integer> categories;
private List<String> categories;
private Map<String, Object> overrides = new HashMap<>();
}

View File

@@ -32,6 +32,8 @@ public class OperatorConstant {
public static String CATEGORY_PREDEFINED_ID = "96a3b07a-3439-4557-a835-525faad60ca3";
public static String CATEGORY_DATA_JUICER_ID = "79b385b4-fde8-4617-bcba-02a176938996";
public static Map<String, String> CATEGORY_MAP = new HashMap<>();
static {

View File

@@ -139,6 +139,21 @@ services:
- "6379:6379"
networks: [ datamate ]
datamate-data-juicer:
container_name: datamate-data-juicer
image: datajuicer/data-juicer:v1.4.4
restart: on-failure
command:
- uvicorn
- service:app
- --host
- "0.0.0.0"
volumes:
- dataset_volume:/dataset
- flow_volume:/flow
networks: [ datamate ]
profiles: [ data-juicer ]
volumes:
dataset_volume:
name: datamate-dataset-volume

View File

@@ -66,15 +66,3 @@ Name of image
{{- $name }}:{{ $tag }}
{{- end }}
{{- end }}
{{/*
Name of sidecar image
*/}}
{{- define "ray-cluster-sidecar.image" -}}
{{- $name := default (printf "%s:%s" .Values.image.repository .Values.image.tag) .Values.head.sidecarContainers.image }}
{{- if .Values.global.image.repository }}
{{- .Values.global.image.repository | trimSuffix "/" }}/{{ $name }}
{{- else }}
{{- $name }}
{{- end }}
{{- end }}

View File

@@ -96,7 +96,7 @@ spec:
securityContext:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- $defult := printf "%s:%s" .Values.image.repository .Values.image.tag }}
{{- $defult := include "ray-cluster.image" . -}}
{{- with .Values.head.sidecarContainers }}
{{- range $index, $container := . }}
{{- $image := default $defult $container.image -}}
@@ -313,10 +313,14 @@ spec:
- name: ray-worker
{{- if $values.image }}
image: {{ $values.image.repository }}:{{ $values.image.tag }}
{{- if $values.image.pullPolicy }}
imagePullPolicy: {{ $values.image.pullPolicy }}
{{- else }}
image: {{ $.Values.image.repository }}:{{ $.Values.image.tag }}
imagePullPolicy: {{ $.Values.image.pullPolicy }}
imagePullPolicy: {{ default $.Values.image.pullPolicy $.Values.global.image.pullPolicy }}
{{- end }}
{{- else }}
image: {{ include "ray-cluster.image" $ }}
imagePullPolicy: {{ default $.Values.image.pullPolicy $.Values.global.image.pullPolicy }}
{{- end }}
{{- with $values.command }}
command:

View File

@@ -248,7 +248,6 @@ ray-cluster:
subPath: site-packages
sidecarContainers:
- name: runtime
image: datamate-runtime
imagePullPolicy: IfNotPresent
args: *runtimeArgs
env: *runtimeEnv
@@ -338,6 +337,9 @@ ray-cluster:
- *flowVolume
- *logVolume
- *operatorVolume
- name: ascend
hostPath:
path: /usr/local/Ascend
volumeMounts:
- mountPath: /tmp/ray
name: log-volume
@@ -352,3 +354,5 @@ ray-cluster:
- mountPath: /usr/local/lib/ops/site-packages
name: operator-volume
subPath: site-packages
- mountPath: /usr/local/Ascend
name: ascend

View File

@@ -0,0 +1,74 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: datamate-data-juicer
labels:
app: datamate
tier: data-juicer
spec:
replicas: 1
selector:
matchLabels:
app: datamate
tier: data-juicer
template:
metadata:
labels:
app: datamate
tier: data-juicer
spec:
containers:
- name: data-juicer
image: datajuicer/data-juicer:v1.4.4
imagePullPolicy: IfNotPresent
command:
- uvicorn
args:
- service:app
- --host
- "0.0.0.0"
ports:
- containerPort: 8000
resources:
limits:
cpu: 8
memory: 32Gi
requests:
cpu: 100m
memory: 100Mi
volumeMounts:
- name: dataset-volume
mountPath: /dataset
- name: log-volume
mountPath: /var/log/datamate/data-juicer
subPath: data-juicer
- name: flow-volume
mountPath: /flow
volumes:
- name: dataset-volume
persistentVolumeClaim:
claimName: datamate-dataset-pvc
- name: flow-volume
persistentVolumeClaim:
claimName: datamate-flow-pvc
- name: log-volume
persistentVolumeClaim:
claimName: datamate-log-pvc
---
apiVersion: v1
kind: Service
metadata:
name: datamate-data-juicer
labels:
app: datamate
tier: data-juicer
spec:
type: ClusterIP
ports:
- port: 8000
targetPort: 8000
protocol: TCP
selector:
app: datamate
tier: data-juicer

View File

@@ -38,6 +38,7 @@ export default function CleansingTaskCreate() {
...item.defaultParams,
...item.overrides,
},
categories: item.categories,
inputs: item.inputs,
outputs: item.outputs,
})),

View File

@@ -44,6 +44,7 @@ export default function CleansingTemplateCreate() {
...item.defaultParams,
...item.overrides,
},
categories: item.categories,
inputs: item.inputs,
outputs: item.outputs,
})),

View File

@@ -195,7 +195,7 @@ const OperatorLibrary: React.FC<OperatorLibraryProps> = ({
<div className="pb-4 border-b border-gray-200">
<span className="flex items-center font-semibold text-base">
<Layers className="w-4 h-4 mr-2" />
({filteredOperators.length})
</span>
</div>
<div className="flex flex-col h-full pt-4 pr-4 overflow-hidden">

View File

@@ -3,7 +3,7 @@ name = "ops"
version = "0.0.1"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4>=4.14.3",
"datasketch>=1.8.0",
@@ -12,14 +12,14 @@ dependencies = [
"jieba>=0.42.1",
"loguru>=0.7.3",
"mineru>=2.6.5",
"numpy==1.24.3",
"numpy>=2.2.6",
"python-multipart>=0.0.20",
"opencv-contrib-python-headless==4.7.0.72",
"opencv-python-headless==4.7.0.72",
"opencv-contrib-python-headless>=4.12.0.88",
"opencv-python-headless>=4.12.0.88",
"openslide-python>=1.4.3",
"paddleocr==2.8.1",
"paddlepaddle==2.6.2",
"pandas==1.5.3",
"paddleocr==3.3.0",
"paddlepaddle==3.2.2",
"pandas>=2.2.3",
"presidio-analyzer==2.2.25",
"presidio-anonymizer==2.2.25",
"pycryptodome>=3.23.0",
@@ -27,7 +27,7 @@ dependencies = [
"python-docx>=1.2.0",
"pytz>=2025.2",
"six>=1.17.0",
"spacy==3.7.0",
"spacy>=3.7.0",
"sqlalchemy>=2.0.44",
"xmltodict>=1.0.2",
"zhconv>=1.4.3",

View File

@@ -151,21 +151,31 @@ class BaseOp:
if filetype in ["ppt", "pptx", "docx", "doc", "xlsx", "csv", "md", "pdf"]:
elements = partition(filename=filepath)
sample[self.text_key] = "\n\n".join([str(el) for el in elements])
sample[self.data_key] = b""
elif filetype in ["txt", "md", "markdown", "xml", "html", "json", "jsonl"]:
with open(filepath, 'rb') as f:
content = f.read()
sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")
sample[self.data_key] = b""
elif filetype in ['jpg', 'jpeg', 'png', 'bmp']:
image_np = cv2.imdecode(np.fromfile(filepath, dtype=np.uint8), -1)
if image_np.size:
data = cv2.imencode(f".{filetype}", image_np)[1]
image_bytes = data.tobytes()
sample[self.data_key] = image_bytes
sample[self.text_key] = ""
return sample
def read_file_first(self, sample):
if self.is_first_op:
self.read_file(sample)
@staticmethod
def save_file_and_db(sample):
if FileExporter().execute(sample):
TaskInfoPersistence().persistence_task_info(sample)
return sample
class Mapper(BaseOp):
def __init__(self, *args, **kwargs):
@@ -195,8 +205,7 @@ class Mapper(BaseOp):
sample["execute_status"] = execute_status
# 加载文件成功执行信息到数据库
if self.is_last_op:
if FileExporter().execute(sample):
TaskInfoPersistence().persistence_task_info(sample)
self.save_file_and_db(sample)
return sample
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -237,8 +246,7 @@ class Slicer(BaseOp):
# 加载文件成功执行信息到数据库
if self.is_last_op:
if FileExporter().execute(sample):
TaskInfoPersistence().persistence_task_info(sample)
self.save_file_and_db(sample)
return [sample]
@@ -333,8 +341,7 @@ class Filter(BaseOp):
# 加载文件成功执行信息到数据库
if self.is_last_op:
if FileExporter().execute(sample):
TaskInfoPersistence().persistence_task_info(sample)
self.save_file_and_db(sample)
return True
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -490,7 +497,7 @@ class FileExporter(BaseOp):
save_path = self.get_save_path(sample, target_type)
# 不存在则保存为txt文件,正常文本清洗
else:
sample = self._get_from_text_or_data(sample)
sample = self._get_from_text(sample)
save_path = self.get_save_path(sample, 'txt')
return sample, save_path
@@ -552,7 +559,7 @@ class FileExporter(BaseOp):
return sample
def _get_from_text_or_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
if sample[self.data_key] is not None and sample[self.data_key] != b'':
if sample[self.data_key] is not None and sample[self.data_key] != b'' and sample[self.data_key] != "":
return self._get_from_data(sample)
else:
return self._get_from_text(sample)

View File

@@ -1,6 +1,7 @@
from . import data_juicer_wrapper, datamate_wrapper
WRAPPERS = {
"data_juicer": data_juicer_wrapper,
"ray": data_juicer_wrapper,
"default": data_juicer_wrapper,
"datamate": datamate_wrapper
}

View File

@@ -0,0 +1,150 @@
import base64
import time
from json import dumps as jdumps
from json import loads as jloads
from typing import Dict, Optional
from urllib.parse import urljoin
import ray
import requests
import yaml
from jsonargparse import ArgumentParser
from loguru import logger
from datamate.core.base_op import FileExporter, SUCCESS_STATUS
from datamate.core.constant import Fields
from datamate.wrappers.executor import RayExecutor
DJ_OUTPUT = "outputs"
class DataJuicerClient:
def __init__(self, base_url):
self.base_url = base_url
def call_data_juicer_api(self, path: str, params: Optional[Dict] = None, json: Optional[Dict] = None):
url = urljoin(self.base_url, path)
if json is not None:
response = requests.post(url, params=params, json=json)
else:
response = requests.get(url, params=params)
return jloads(response.text)
def init_config(self, dataset_path: str, export_path, process):
"""
Initialize Data-Juicer config.
Args:
:param dataset_path: The input dataset path.
:param process: The ops
:param export_path: The export path.
"""
dj_config = {
"dataset_path": dataset_path,
"export_path": export_path,
"process": process,
"executor_type": "default",
}
url_path = "/data_juicer/config/get_init_configs"
try:
res = self.call_data_juicer_api(url_path, params={"cfg": jdumps(dj_config)})
except Exception as e:
error_msg = f"An unexpected error occurred in calling {url_path}:\n{e}"
raise RuntimeError(error_msg)
return res["result"]
def execute_config(self, dj_config: Dict):
"""
Execute data-juicer data process.
Args:
dj_config: configs of data-juicer
"""
url_path = "/data_juicer/core/DefaultExecutor/run"
try:
res = self.call_data_juicer_api(url_path, params={"skip_return": True}, json={"cfg": jdumps(dj_config)})
if res.get("status") != "success":
raise RuntimeError(f"An error occurred in calling {url_path}:\n{res}")
return dj_config["export_path"]
except Exception as e:
error_msg = f"An unexpected error occurred in calling {url_path}:\n{e}"
raise RuntimeError(error_msg)
class DataJuicerExecutor(RayExecutor):
def __init__(self, cfg = None, meta = None):
super().__init__(cfg, meta)
self.client = DataJuicerClient(base_url="http://datamate-data-juicer:8000")
self.dataset_path = f"/flow/{self.cfg.instance_id}/dataset_on_dj.jsonl"
self.export_path = f"/flow/{self.cfg.instance_id}/processed_dataset.jsonl"
def add_column(self, batch):
batch_size = len(batch["filePath"])
batch["execute_status"] = [SUCCESS_STATUS] * batch_size
batch[Fields.instance_id] = [self.cfg.instance_id] * batch_size
batch[Fields.export_path] = [self.cfg.export_path] * batch_size
return batch
def run(self):
# 1. 加载数据集
logger.info('Loading dataset with Ray...')
if self.meta:
file_content = base64.b64decode(self.meta)
lines = file_content.splitlines()
dataset = ray.data.from_items([jloads(line) for line in lines])
else:
dataset = self.load_dataset()
logger.info('Read data...')
dataset = dataset.map(FileExporter().read_file, num_cpus=0.05)
with open(self.dataset_path, "w", encoding="utf-8") as f:
for batch_df in dataset.iter_batches(batch_format="pandas", batch_size=2048):
batch_df.to_json(f, orient="records", lines=True, force_ascii=False)
logger.info('Processing data...')
tstart = time.time()
try:
dj_config = self.client.init_config(self.dataset_path, self.export_path, self.cfg.process)
result_path = self.client.execute_config(dj_config)
processed_dataset = self.load_dataset(result_path)
processed_dataset = processed_dataset.map_batches(self.add_column, num_cpus=0.05)
processed_dataset = processed_dataset.map(FileExporter().save_file_and_db, num_cpus=0.05)
for _ in processed_dataset.iter_batches():
pass
except Exception as e:
logger.error(f"An unexpected error occurred.", e)
raise e
tend = time.time()
logger.info(f'All Ops are done in {tend - tstart:.3f}s.')
if __name__ == '__main__':
parser = ArgumentParser(description="Create API for Submitting Job to Data-juicer")
parser.add_argument("--config_path", type=str, required=False, default="../configs/demo.yaml")
parser.add_argument("--flow_config", type=str, required=False, default=None)
args = parser.parse_args()
config_path = args.config_path
flow_config = args.flow_config
if flow_config:
m_cfg = yaml.safe_load(base64.b64decode(flow_config))
else:
with open(config_path, "r", encoding='utf-8') as f:
m_cfg = yaml.safe_load(f)
executor = DataJuicerExecutor(m_cfg)
try:
executor.run()
except Exception as e:
executor.update_db("FAILED")
raise e
executor.update_db("COMPLETED")

View File

@@ -1,6 +1,11 @@
# -*- coding: utf-8 -*-
import os
from datamate.scheduler import cmd_scheduler
async def submit(task_id, config_path):
await cmd_scheduler.submit(task_id, f"dj-process --config {config_path}")
current_dir = os.path.dirname(__file__)
await cmd_scheduler.submit(task_id, f"python {os.path.join(current_dir, 'data_juicer_executor.py')} "
f"--config_path={config_path}")

View File

@@ -3,21 +3,18 @@
import base64
import json
import time
from typing import Dict
import ray
import yaml
from jsonargparse import dict_to_namespace, ArgumentParser
from jsonargparse import ArgumentParser
from loguru import logger
from datamate.common.utils import check_valid_path
from datamate.core.dataset import RayDataset
from datamate.sql_manager.persistence_atction import TaskInfoPersistence
from datamate.wrappers.executor import RayExecutor
import datamate.ops
class RayExecutor:
class DataMateExecutor(RayExecutor):
"""
基于Ray的执行器.
@@ -25,38 +22,8 @@ class RayExecutor:
2. 当前仅加载json文件类型的数据集。
"""
def __init__(self, cfg=None, meta=None):
if isinstance(cfg, Dict):
self.cfg = dict_to_namespace(cfg)
else:
logger.error(f"Please set param: cfg as type Dict, but given cfg as type {type(cfg).__name__}")
raise TypeError(f"To params cfg, Dict type is required, but type {type(cfg).__name__} is given!")
self.cfg.process = cfg['process']
self.meta = meta
# init ray
logger.info('Initing Ray ...')
ray.init()
def load_meta(self, line):
meta = json.loads(line)
if meta.get("fileId"):
meta["sourceFileId"] = meta.get("fileId")
if meta.get("fileName"):
meta["sourceFileName"] = meta.get("fileName")
if meta.get("fileType"):
meta["sourceFileType"] = meta.get("fileType")
if meta.get("fileSize"):
meta["sourceFileSize"] = meta.get("fileSize")
if not meta.get("totalPageNum"):
meta["totalPageNum"] = 0
if not meta.get("extraFilePath"):
meta["extraFilePath"] = None
if not meta.get("extraFileType"):
meta["extraFileType"] = None
meta["dataset_id"] = self.cfg.dataset_id
return meta
def __init__(self, cfg = None, meta = None):
super().__init__(cfg, meta)
def run(self):
# 1. 加载数据集
@@ -77,36 +44,13 @@ class RayExecutor:
tend = time.time()
logger.info(f'All Ops are done in {tend - tstart:.3f}s.')
dataset.data.materialize()
def load_dataset(self):
retry = 0
dataset = None
jsonl_file_path = self.cfg.dataset_path
while True:
if check_valid_path(jsonl_file_path):
with open(jsonl_file_path, "r", encoding='utf-8') as meta:
lines = meta.readlines()
dataset = ray.data.from_items([self.load_meta(line) for line in lines])
break
if retry < 5:
retry += 1
time.sleep(retry)
continue
else:
logger.error(f"can not load dataset from dataset_path")
raise RuntimeError(f"Load dataset Failed!, dataset_path: {self.cfg.dataset_path}.")
return dataset
def update_db(self, status):
task_info = TaskInfoPersistence()
task_info.update_result(self.cfg.dataset_id, self.cfg.instance_id, status)
for _ in dataset.data.iter_batches():
pass
if __name__ == '__main__':
parser = ArgumentParser(description="Create API for Submitting Job to Data-juicer")
parser = ArgumentParser(description="Create API for Submitting Job to ray")
parser.add_argument("--config_path", type=str, required=False, default="../configs/demo.yaml")
parser.add_argument("--flow_config", type=str, required=False, default=None)
@@ -119,10 +63,10 @@ if __name__ == '__main__':
if flow_config:
m_cfg = yaml.safe_load(base64.b64decode(flow_config))
else:
with open(config_path, "r", encoding='utf-8') as cfg:
m_cfg = yaml.safe_load(cfg)
with open(config_path, "r", encoding='utf-8') as f:
m_cfg = yaml.safe_load(f)
executor = RayExecutor(m_cfg)
executor = DataMateExecutor(m_cfg)
try:
executor.run()
except Exception as e:

View File

@@ -0,0 +1,80 @@
import json
import time
from typing import Dict
import ray
from jsonargparse import dict_to_namespace
from loguru import logger
from datamate.common.utils import check_valid_path
from datamate.sql_manager.persistence_atction import TaskInfoPersistence
class RayExecutor:
"""
基于Ray的执行器.
1. 当前仅支持Mapper,Filter类型的算子。
2. 当前仅加载json文件类型的数据集。
"""
def __init__(self, cfg=None, meta=None):
if isinstance(cfg, Dict):
self.cfg = dict_to_namespace(cfg)
else:
logger.error(f"Please set param: cfg as type Dict, but given cfg as type {type(cfg).__name__}")
raise TypeError(f"To params cfg, Dict type is required, but type {type(cfg).__name__} is given!")
self.cfg.process = cfg['process']
self.meta = meta
# init ray
logger.info('Initing Ray ...')
ray.init()
def load_meta(self, line):
meta = json.loads(line)
if meta.get("fileId"):
meta["sourceFileId"] = meta.get("fileId")
if meta.get("fileName"):
meta["sourceFileName"] = meta.get("fileName")
if meta.get("fileType"):
meta["sourceFileType"] = meta.get("fileType")
if meta.get("fileSize"):
meta["sourceFileSize"] = meta.get("fileSize")
if not meta.get("totalPageNum"):
meta["totalPageNum"] = 0
if not meta.get("extraFilePath"):
meta["extraFilePath"] = None
if not meta.get("extraFileType"):
meta["extraFileType"] = None
meta["dataset_id"] = self.cfg.dataset_id
return meta
def run(self):
pass
def load_dataset(self, jsonl_file_path = None):
retry = 0
dataset = None
if jsonl_file_path is None:
jsonl_file_path = self.cfg.dataset_path
while True:
if check_valid_path(jsonl_file_path):
with open(jsonl_file_path, "r", encoding='utf-8') as meta:
lines = meta.readlines()
dataset = ray.data.from_items([self.load_meta(line) for line in lines])
break
if retry < 5:
retry += 1
time.sleep(retry)
continue
else:
logger.error(f"can not load dataset from dataset_path")
raise RuntimeError(f"Load dataset Failed!, dataset_path: {self.cfg.dataset_path}.")
return dataset
def update_db(self, status):
task_info = TaskInfoPersistence()
task_info.update_result(self.cfg.dataset_id, self.cfg.instance_id, status)

View File

@@ -6,7 +6,7 @@ authors = [
{ name = "Huawei datamate team" }
]
license = { text = "Apache-2.0" }
requires-python = ">=3.10"
requires-python = ">=3.10, <=3.12"
urls = { repository = "https://github.com/ModelEngine-Group/datamate" }
classifiers = [
"License :: OSI Approved :: Apache Software License",
@@ -20,7 +20,7 @@ dependencies = [
"jsonargparse>=4.44.0",
"loguru>=0.7.3",
"opencv-python-headless>=4.12.0.88",
"ray[data,default]==2.52.1",
"ray[data,default]>=2.52.1",
"unstructured[csv,docx,pptx,xlsx,pdf,md]==0.18.15",
"uvicorn[standard]>=0.38.0",
]

View File

@@ -63,7 +63,11 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'
('b5bfc548-8ef6-417c-b8a6-a4197c078249', 'Java', 'java', 'predefined', '873000a2-65b3-474b-8ccc-4813c08c76fb'),
('16e2d99e-eafb-44fc-acd0-f35a2bad28f8', '来源', 'origin', 'predefined', '0'),
('96a3b07a-3439-4557-a835-525faad60ca3', '系统预置', 'predefined', 'predefined', '16e2d99e-eafb-44fc-acd0-f35a2bad28f8'),
('ec2cdd17-8b93-4a81-88c4-ac9e98d10757', '用户上传', 'customized', 'predefined', '16e2d99e-eafb-44fc-acd0-f35a2bad28f8');
('ec2cdd17-8b93-4a81-88c4-ac9e98d10757', '用户上传', 'customized', 'predefined', '16e2d99e-eafb-44fc-acd0-f35a2bad28f8'),
('0ed75eea-e20b-11f0-88e6-00155d5c9528', '归属', 'vendor', 'predefined', '0'),
('431e7798-5426-4e1a-aae6-b9905a836b34', 'DataMate', 'datamate', 'predefined', '0ed75eea-e20b-11f0-88e6-00155d5c9528'),
('79b385b4-fde8-4617-bcba-02a176938996', 'DataJuicer', 'data-juicer', 'predefined', '0ed75eea-e20b-11f0-88e6-00155d5c9528'),
('f00eaa3e-96c1-4de4-96cd-9848ef5429ec', '其他', 'others', 'predefined', '0ed75eea-e20b-11f0-88e6-00155d5c9528');
INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
@@ -113,7 +117,7 @@ INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3', '431e7798-5426-4e1a-aae6-b9905a836b34')
AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
@@ -127,7 +131,269 @@ INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3', '431e7798-5426-4e1a-aae6-b9905a836b34')
AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect');
INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES
('entity_attribute_aggregator', '实体属性聚合器', 'Summarizes a given attribute of an entity from a set of documents. 汇总一组文档中实体的给定属性。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('meta_tags_aggregator', '元标签聚合器', 'Merge similar meta tags into a single, unified tag. 将类似的元标记合并到一个统一的标记中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('most_relevant_entities_aggregator', '最相关实体聚合器', 'Extracts and ranks entities closely related to a given entity from provided texts. 从提供的文本中提取与给定实体密切相关的实体并对其进行排名。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('nested_aggregator', '嵌套聚合器', 'Aggregates nested content from multiple samples into a single summary. 将多个示例中的嵌套内容聚合到单个摘要中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('document_deduplicator', '文档去重器', 'Deduplicates samples at the document level using exact matching. 使用完全匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('document_minhash_deduplicator', '文档MinHash去重器', 'Deduplicates samples at the document level using MinHash LSH. 使用MinHash LSH在文档级别删除重复样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('document_simhash_deduplicator', '文档SimHash去重器', 'Deduplicates samples at the document level using SimHash. 使用SimHash在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('image_deduplicator', '图像去重器', 'Deduplicates samples at the document level by exact matching of images. 通过图像的精确匹配在文档级别删除重复的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('ray_basic_deduplicator', 'Ray基础去重器', 'Backend for deduplicator. deduplicator的后端。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('ray_bts_minhash_deduplicator', 'Ray BTS MinHash去重器', 'A distributed implementation of Union-Find with load balancing. 具有负载平衡的Union-Find的分布式实现。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('ray_document_deduplicator', 'Ray文档去重器', 'Deduplicates samples at the document level using exact matching in Ray distributed mode. 在Ray分布式模式下使用精确匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('ray_image_deduplicator', 'Ray图像去重器', 'Deduplicates samples at the document level using exact matching of images in Ray distributed mode. 在光线分布模式下使用图像的精确匹配在文档级别删除重复样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('ray_video_deduplicator', 'Ray视频去重器', 'Deduplicates samples at document-level using exact matching of videos in Ray distributed mode. 在Ray分布式模式下使用视频的精确匹配在文档级删除重复样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_deduplicator', '视频去重器', 'Deduplicates samples at the document level using exact matching of videos. 使用视频的精确匹配在文档级别删除重复的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('alphanumeric_filter', '字母数字过滤器', 'Filter to keep samples with an alphabet/numeric ratio within a specific range. 过滤器,以保持具有特定范围内的字母/数字比率的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('audio_duration_filter', '音频时长过滤器', 'Keep data samples whose audio durations are within a specified range. 保留音频持续时间在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('audio_nmf_snr_filter', '音频NMF信噪比过滤器', 'Keep data samples whose audio Signal-to-Noise Ratios (SNRs) are within a specified range. 保留音频信噪比 (snr) 在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('audio_size_filter', '音频大小过滤器', 'Keep data samples based on the size of their audio files. 根据音频文件的大小保留数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('average_line_length_filter', '平均行长过滤器', 'Filter to keep samples with average line length within a specific range. 过滤器,以保持平均线长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('character_repetition_filter', '字符重复过滤器', 'Filter to keep samples with character-level n-gram repetition ratio within a specific range. 过滤器将具有字符级n-gram重复比的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('flagged_words_filter', '标记词过滤器', 'Filter to keep samples with flagged-word ratio in a specified range. 过滤器将标记词比率的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('general_field_filter', '通用字段过滤器', 'Filter to keep samples based on a general field filter condition. 根据常规字段筛选条件保留样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('image_aesthetics_filter', '图像美学过滤器', 'Filter to keep samples with aesthetics scores within a specific range. 过滤以保持美学分数在特定范围内的样品。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_aspect_ratio_filter', '图像长宽比过滤器', 'Filter to keep samples with image aspect ratio within a specific range. 过滤器,以保持样本的图像纵横比在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_face_count_filter', '图像人脸计数过滤器', 'Filter to keep samples with the number of faces within a specific range. 过滤以保持样本的面数在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_face_ratio_filter', '图像人脸占比过滤器', 'Filter to keep samples with face area ratios within a specific range. 过滤以保持面面积比在特定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_nsfw_filter', '图像NSFW过滤器', 'Filter to keep samples whose images have nsfw scores in a specified range. 过滤器保留其图像的nsfw分数在指定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_pair_similarity_filter', '图像对相似度过滤器', 'Filter to keep image pairs with similarities between images within a specific range. 过滤器将图像之间具有相似性的图像对保持在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_shape_filter', '图像形状过滤器', 'Filter to keep samples with image shape (width, height) within specific ranges. 过滤器,以保持样本的图像形状 (宽度,高度) 在特定的范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_size_filter', '图像大小过滤器', 'Keep data samples whose image size (in Bytes/KB/MB/...) is within a specific range. 保留图像大小 (以字节/KB/MB/... 为单位) 在特定范围内的数据样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_text_matching_filter', '图文匹配过滤器', 'Filter to keep samples with image-text matching scores within a specific range. 过滤器将图像文本匹配分数的样本保持在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_text_similarity_filter', '图文相似度过滤器', 'Filter to keep samples with image-text similarity within a specified range. 过滤器将具有图像-文本相似性的样本保持在指定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_watermark_filter', '图像水印过滤器', 'Filter to keep samples whose images have no watermark with high probability. 过滤器以保持其图像没有水印的样本具有高概率。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('in_context_influence_filter', '上下文影响过滤器', 'Filter to keep texts based on their in-context influence on a validation set. 过滤以根据文本在上下文中对验证集的影响来保留文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('instruction_following_difficulty_filter', '指令跟随难度过滤器', 'Filter to keep texts based on their instruction following difficulty (IFD, https://arxiv.org/abs/2308.12032) score. 过滤以保持文本基于他们的指令跟随难度 (IFD, https://arxiv.org/abs/ 2308.12032) 分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('language_id_score_filter', '语种识别得分过滤器', 'Filter to keep samples in a specific language with a confidence score above a threshold. 过滤器以保留置信度高于阈值的特定语言的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_analysis_filter', 'LLM分析过滤器', 'Base filter class for leveraging LLMs to analyze and filter data samples. 用于利用LLMs分析和过滤数据样本的基本筛选器类。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_difficulty_score_filter', 'LLM难度得分过滤器', 'Filter to keep samples with high difficulty scores estimated by an LLM. 过滤器以保留由LLM估计的高难度分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_perplexity_filter', 'LLM困惑度过滤器', 'Filter to keep samples with perplexity scores within a specified range, computed using a specified LLM. 过滤器将困惑分数的样本保留在指定范围内,使用指定的LLM计算。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_quality_score_filter', 'LLM质量得分过滤器', 'Filter to keep samples with a high quality score estimated by a language model. 过滤器,以保留具有语言模型估计的高质量分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_task_relevance_filter', 'LLM任务相关性过滤器', 'Filter to keep samples with high relevance scores to validation tasks estimated by an LLM. 过滤器以保留与LLM估计的验证任务具有高相关性分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('maximum_line_length_filter', '最大行长过滤器', 'Filter to keep samples with a maximum line length within a specified range. 筛选器将最大行长度的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('perplexity_filter', '困惑度过滤器', 'Filter to keep samples with perplexity score in a specified range. 过滤以保持困惑分数在指定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('phrase_grounding_recall_filter', '短语定位召回过滤器', 'Filter to keep samples based on the phrase grounding recall of phrases extracted from text in images. 根据从图像中的文本中提取的短语接地召回来过滤以保留样本。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('special_characters_filter', '特殊字符过滤器', 'Filter to keep samples with special-character ratio within a specific range. 过滤器,以将具有特殊字符比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('specified_field_filter', '指定字段过滤器', 'Filter samples based on the specified field information. 根据指定的字段信息筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('specified_numeric_field_filter', '指定数值字段过滤器', 'Filter samples based on a specified numeric field value. 根据指定的数值字段值筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('stopwords_filter', '停用词过滤器', 'Filter to keep samples with stopword ratio within a specified range. 过滤器将停止词比率的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('suffix_filter', '后缀过滤器', 'Filter to keep samples with specified suffix. 过滤器以保留具有指定后缀的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_action_filter', '文本动作过滤器', 'Filter to keep texts that contain a minimum number of actions. 过滤以保留包含最少数量操作的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_embd_similarity_filter', '文本嵌入相似度过滤器', 'Filter to keep texts whose average embedding similarity to a set of given validation texts falls within a specific range. 过滤器,以保留与一组给定验证文本的平均嵌入相似度在特定范围内的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_entity_dependency_filter', '文本实体依赖过滤器', 'Identify and filter text samples based on entity dependencies. 根据实体依赖关系识别和过滤文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_length_filter', '文本长度过滤器', 'Filter to keep samples with total text length within a specific range. 过滤以保持文本总长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_pair_similarity_filter', '文本对相似度过滤器', 'Filter to keep text pairs with similarities within a specific range. 过滤以将具有相似性的文本对保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('token_num_filter', 'Token数量过滤器', 'Filter to keep samples with a total token number within a specified range. 筛选器将总令牌数的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('video_aesthetics_filter', '视频美学过滤器', 'Filter to keep data samples with aesthetics scores for specified frames in the videos within a specific range. 过滤器将视频中指定帧的美学得分数据样本保留在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_aspect_ratio_filter', '视频长宽比过滤器', 'Filter to keep samples with video aspect ratio within a specific range. 过滤器将视频纵横比的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_duration_filter', '视频时长过滤器', 'Keep data samples whose videos\' durations are within a specified range. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_frames_text_similarity_filter', '视频帧文本相似度过滤器', 'Filter to keep samples based on the similarity between video frame images and text within a specific range. 根据视频帧图像与文本之间在特定范围内的相似度保留样本。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_motion_score_filter', '视频运动得分过滤器', 'Filter to keep samples with video motion scores within a specific range. 过滤以保留视频运动得分在特定范围内的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_motion_score_raft_filter', 'RAFT运动得分过滤器', 'Filter to keep samples with video motion scores within a specified range. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_nsfw_filter', '视频NSFW过滤器', 'Filter to keep samples whose videos have nsfw scores in a specified range. 过滤以保留视频nsfw分数在指定范围内的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_ocr_area_ratio_filter', 'OCR面积占比过滤器', 'Keep data samples whose detected text area ratios for specified frames in the video are within a specified range. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_resolution_filter', '视频分辨率过滤器', 'Keep data samples whose videos\' resolutions are within a specified range. 保留视频分辨率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_tagging_from_frames_filter', '视频帧标签过滤器', 'Filter to keep samples whose videos contain specified tags. 过滤器以保留其视频包含指定标签的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_watermark_filter', '视频水印过滤器', 'Filter to keep samples whose videos have no watermark with high probability. 过滤器以保持其视频具有高概率没有水印的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('word_repetition_filter', '单词重复过滤器', 'Filter to keep samples with word-level n-gram repetition ratio within a specific range. 过滤器将单词级n-gram重复比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('words_num_filter', '词数过滤器', 'Filter to keep samples with a total word count within a specified range. 过滤器将样本的总字数保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('key_value_grouper', '键值分组器', 'Groups samples into batches based on values in specified keys. 根据指定键中的值将样本分组为批处理。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('naive_grouper', '朴素分组器', 'Group all samples in a dataset into a single batched sample. 将数据集中的所有样本分组为单个批处理样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('naive_reverse_grouper', '朴素反向分组器', 'Split batched samples into individual samples. 将批处理的样品分成单个样品。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('audio_add_gaussian_noise_mapper', '音频高斯噪声添加映射器', 'Mapper to add Gaussian noise to audio samples. 映射器将高斯噪声添加到音频样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('audio_ffmpeg_wrapped_mapper', '音频FFmpeg封装映射器', 'Wraps FFmpeg audio filters for processing audio files in a dataset. 包装FFmpeg音频过滤器,用于处理数据集中的音频文件。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('calibrate_qa_mapper', 'QA校准映射器', 'Calibrates question-answer pairs based on reference text using an API model. 使用API模型根据参考文本校准问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('calibrate_query_mapper', '查询校准映射器', 'Calibrate query in question-answer pairs based on reference text. 基于参考文本校准问答对中的查询。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('calibrate_response_mapper', '回复校准映射器', 'Calibrate response in question-answer pairs based on reference text. 根据参考文本校准问答对中的回答。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('chinese_convert_mapper', '中文简繁转换映射器', 'Mapper to convert Chinese text between Traditional, Simplified, and Japanese Kanji. 映射器在繁体、简体和日文汉字之间转换中文文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_copyright_mapper', '版权清洗映射器', 'Cleans copyright comments at the beginning of text samples. 清除文本示例开头的版权注释。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_email_mapper', '邮箱清洗映射器', 'Cleans email addresses from text samples using a regular expression. 使用正则表达式从文本示例中清除电子邮件地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_html_mapper', 'HTML清洗映射器', 'Cleans HTML code from text samples, converting HTML to plain text. 从文本示例中清除HTML代码,将HTML转换为纯文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_ip_mapper', 'IP清洗映射器', 'Cleans IPv4 and IPv6 addresses from text samples. 从文本示例中清除IPv4和IPv6地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_links_mapper', '链接清洗映射器', 'Mapper to clean links like http/https/ftp in text samples. 映射器来清理链接,如文本示例中的http/https/ftp。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('detect_character_attributes_mapper', '角色属性检测映射器', 'Takes an image, a caption, and main character names as input to extract the characters\' attributes. ', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('detect_character_locations_mapper', '', 'Given an image and a list of main character names, extract the bounding boxes for each present character. (YOLOE + MLLM)', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('detect_main_character_mapper', '', 'Extract all main character names based on the given image and its caption. ', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('dialog_intent_detection_mapper', '对话意图检测映射器', 'Generates user\'s intent labels in a dialog by analyzing the history, query, and response. 通过分析历史记录、查询和响应,在对话框中生成用户的意图标签。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('dialog_sentiment_detection_mapper', '对话情感检测映射器', 'Generates sentiment labels and analysis for user queries in a dialog. 在对话框中为用户查询生成情绪标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('dialog_sentiment_intensity_mapper', '对话情感强度映射器', 'Mapper to predict user\'s sentiment intensity in a dialog, ranging from -5 to 5. 预测用户在对话框中的情绪强度,范围为-5到5。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('dialog_topic_detection_mapper', '对话主题检测映射器', 'Generates user\'s topic labels and analysis in a dialog. 在对话框中生成用户的主题标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('download_file_mapper', '文件下载映射器', 'Mapper to download URL files to local files or load them into memory. 映射器将URL文件下载到本地文件或将其加载到内存中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('expand_macro_mapper', '宏展开映射器', 'Expands macro definitions in the document body of LaTeX samples. 展开LaTeX示例文档主体中的宏定义。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_entity_attribute_mapper', '实体属性提取映射器', 'Extracts attributes for given entities from the text and stores them in the sample\'s metadata. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_entity_relation_mapper', '', 'Extracts entities and relations from text to build a knowledge graph. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_event_mapper', '', 'Extracts events and relevant characters from the text. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_keyword_mapper', '', 'Generate keywords for the text. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_nickname_mapper', '昵称提取映射器', 'Extracts nickname relationships in the text using a language model. 使用语言模型提取文本中的昵称关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_support_text_mapper', '', 'Extracts a supporting sub-text from the original text based on a given summary. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_tables_from_html_mapper', 'HTML表格提取映射器', 'Extracts tables from HTML content and stores them in a specified field. 从HTML内容中提取表格并将其存储在指定字段中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('fix_unicode_mapper', 'Unicode修复映射器', 'Fixes unicode errors in text samples. 修复文本样本中的unicode错误。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('generate_qa_from_examples_mapper', '示例QA生成映射器', 'Generates question and answer pairs from examples using a Hugging Face model. 使用Hugging Face模型从示例生成问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('generate_qa_from_text_mapper', '文本QA生成映射器', 'Generates question and answer pairs from text using a specified model. 使用指定模型从文本生成问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('image_blur_mapper', '图像模糊映射器', 'Blurs images in the dataset with a specified probability and blur type. 以指定的概率和模糊类型对数据集中的图像进行模糊处理。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_captioning_from_gpt4v_mapper', 'GPT4V图像描述映射器', 'Generates text captions for images using the GPT-4 Vision model. 使用GPT-4 Vision模型为图像生成文本描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_captioning_mapper', '', 'Generates image captions using a Hugging Face model and appends them to samples. 使', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_detection_yolo_mapper', 'YOLO图像检测映射器', 'Perform object detection using YOLO on images and return bounding boxes and class labels. 使用YOLO对图像执行目标检测并返回边界框和类别标签。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_diffusion_mapper', '', 'Generate images using a diffusion model based on provided captions. 使', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_face_blur_mapper', '', 'Mapper to blur faces detected in images. ', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_remove_background_mapper', '', 'Mapper to remove the background of images. ', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_segment_mapper', '', 'Perform segment-anything on images and return the bounding boxes. segment-', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_tagging_mapper', '', 'Generates image tags for each image in the sample. ', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('imgdiff_difference_area_generator_mapper', 'ImgDiff差异区域生成映射器', 'Generates and filters bounding boxes for image pairs based on similarity, segmentation, and text matching. ', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('imgdiff_difference_caption_generator_mapper', 'ImgDiff差异描述生成映射器', 'Generates difference captions for bounding box regions in two images. ', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('mllm_mapper', 'MLLM视觉问答映射器', 'Mapper to use MLLMs for visual question answering tasks. Mapper使用MLLMs进行视觉问答任务', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('nlpaug_en_mapper', 'NLPAug英语增强映射器', 'Augments English text samples using various methods from the nlpaug library. 使用nlpaug库中的各种方法增强英语文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('nlpcda_zh_mapper', 'NLPCDA中文增强映射器', 'Augments Chinese text samples using the nlpcda library. 使用nlpcda库扩充中文文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_prompt_mapper', 'Prompt优化映射器', 'Optimize prompts based on existing ones in the same batch. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_qa_mapper', 'QA优化映射器', 'Mapper to optimize question-answer pairs. 优化问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_query_mapper', '查询优化映射器', 'Optimize queries in question-answer pairs to make them more specific and detailed. 优化问答对中的查询,使其更加具体和详细。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_response_mapper', '回复优化映射器', 'Optimize response in question-answer pairs to be more detailed and specific. 优化问答对中的回答,使其更加详细和具体。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('pair_preference_mapper', '', 'Mapper to construct paired preference samples by generating a rejected response and its reason. Mapper通过生成拒绝响应及其原因来构造成对的偏好样本', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('punctuation_normalization_mapper', '标点规范化映射器', 'Normalizes unicode punctuations to their English equivalents in text samples. 将unicode标点规范化为文本样本中的英语等效标点。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('python_file_mapper', 'Python文件映射器', 'Executes a Python function defined in a file on input data. 对输入数据执行文件中定义的Python函数。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('python_lambda_mapper', 'Python Lambda映射器', 'Mapper for applying a Python lambda function to data samples. 将Python lambda函数应用于数据样本的映射器。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('query_intent_detection_mapper', '', 'Predicts the user\'s intent label and corresponding score for a given query. 为给定查询预测用户的意图标签和相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('query_sentiment_detection_mapper', '查询情感检测映射器', 'Predicts user\'s sentiment label (\'negative\', \'neutral\', \'positive\') in a query. 预测查询中用户的情感标签(负面、中性、正面)。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('query_topic_detection_mapper', '', 'Predicts the topic label and its corresponding score for a given query. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('relation_identity_mapper', '', 'Identify the relation between two entities in a given text. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_bibliography_mapper', '', 'Removes bibliography sections at the end of LaTeX documents. LaTeX文档末尾的参考书目部分', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_comments_mapper', '', 'Removes comments from documents, currently supporting only \'tex\' format. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_header_mapper', '', 'Removes headers at the beginning of documents in LaTeX samples. LaTeX示例中文档开头的标题', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_long_words_mapper', '', 'Mapper to remove long words within a specific range. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_non_chinese_character_mapper', '', 'Removes non-Chinese characters from text samples. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_repeat_sentences_mapper', '', 'Mapper to remove repeat sentences in text samples. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_specific_chars_mapper', '', 'Removes specific characters from text samples. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_table_text_mapper', '', 'Mapper to remove table texts from text samples. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_words_with_incorrect_substrings_mapper', '', 'Mapper to remove words containing specified incorrect substrings. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('replace_content_mapper', '', 'Replaces content in the text that matches a specific regular expression pattern with a designated replacement string. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('sdxl_prompt2prompt_mapper', 'SDXL Prompt2Prompt映射器', 'Generates pairs of similar images using the SDXL model. 使用SDXL模型生成成对的相似图像。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('sentence_augmentation_mapper', '', 'Augments sentences by generating enhanced versions using a Hugging Face model. 使', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('sentence_split_mapper', '', 'Splits text samples into individual sentences based on the specified language. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_chunk_mapper', '', 'Split input text into chunks based on specified criteria. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_tagging_by_prompt_mapper', 'Prompt文本打标映射器', 'Mapper to generate text tags using prompt with LLM. 使用带有LLM的prompt生成文本标签。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('vggt_mapper', 'VGGT视频提取映射器', 'Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 输入单一场景的视频,使用VGGT提取相机姿态、深度图、点图和3D点轨迹等信息。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_captioning_from_audio_mapper', '音频视频描述映射器', 'Mapper to caption a video according to its audio streams based on Qwen-Audio model. 基于Qwen-Audio模型根据音频流为视频生成字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_frames_mapper', '', 'Generates video captions from sampled frames using an image-to-text model. 使', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_summarizer_mapper', '摘要视频描述映射器', 'Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...). 通过汇总多种生成文本(来自视频/音频/帧的描述、来自音频/帧的标签等)生成视频描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_video_mapper', '', 'Generates video captions using a Hugging Face video-to-text model and sampled video frames. 使', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_vlm_mapper', 'VLM视频描述映射器', 'Generates video captions using a VLM that accepts videos as inputs. 使VLM生成视频字幕', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_depth_estimation_mapper', '', 'Perform depth estimation on the video. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_extract_frames_mapper', '', 'Mapper to extract frames from video files according to specified methods. ', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_face_blur_mapper', '', 'Mapper to blur faces detected in videos. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_ffmpeg_wrapped_mapper', 'FFmpeg封装映射器', 'Wraps FFmpeg video filters for processing video files in a dataset. FFmpeg视频过滤器', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_hand_reconstruction_mapper', '手部重建映射器', 'Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_object_segmenting_mapper', '', 'Text-guided semantic segmentation of valid objects throughout the video (YOLOE + SAM2). (YOLOE SAM2)', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_remove_watermark_mapper', '', 'Remove watermarks from videos based on specified regions. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_resize_aspect_ratio_mapper', '', 'Resizes videos to fit within a specified aspect ratio range. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_resize_resolution_mapper', '', 'Resizes video resolution based on specified width and height constraints. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_split_by_duration_mapper', '', 'Splits videos into segments based on a specified duration. ', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_split_by_key_frame_mapper', '', 'Splits a video into segments based on key frames. ', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_split_by_scene_mapper', '', 'Splits videos into scene clips based on detected scene changes. ', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_tagging_from_audio_mapper', '', 'Generates video tags from audio streams using the Audio Spectrogram Transformer. 使', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_tagging_from_frames_mapper', '', 'Generates video tags from frames extracted from videos. ', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_whole_body_pose_estimation_mapper', '全身姿态估计映射器', 'Input a video containing people, and use the DWPose model to extract the body, hand, feet, and face keypoints of the human subjects in the video, i.e., 2D Whole-body Pose Estimation. 输入包含人物的视频,使用DWPose模型提取视频中人体的身体、手、脚和面部关键点,即2D全身姿态估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('whitespace_normalization_mapper', '', 'Normalizes various types of whitespace characters to standard spaces in text samples. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('frequency_specified_field_selector', '', 'Selector to filter samples based on the frequency of a specified field. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('random_selector', '', 'Randomly selects a subset of samples from the dataset. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('range_specified_field_selector', '', 'Selects a range of samples based on the sorted values of a specified field. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('tags_specified_field_selector', '', 'Selector to filter samples based on the tags of a specified field. ', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('topk_specified_field_selector', 'TopK指定字段选择器', 'Selects top samples based on the sorted values of a specified field. ', '1.4.4', 'text', 'text', NULL, NULL, '', false);
-- Link the text-modality data-juicer operators (aggregators, deduplicators,
-- filters, groupers, mappers and selectors) to their operator categories.
-- The CROSS JOIN plus the two IN() filters produces every (category, operator)
-- pair, i.e. EVERY listed operator is related to EVERY listed category:
-- one modality-specific category ('d8a5df7a-...') plus three categories shared
-- by all five relation statements below ('9eda9d5d-...', '96a3b07a-...',
-- '79b385b4-...') -- presumably modality-agnostic groupings such as the
-- data-juicer source itself; TODO confirm against the t_operator_category seed.
-- INSERT IGNORE keeps the script re-runnable: duplicate-key rows are skipped.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1',
               '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996')
  AND o.id IN
      ('entity_attribute_aggregator', 'meta_tags_aggregator', 'most_relevant_entities_aggregator', 'nested_aggregator',
       'document_deduplicator', 'document_minhash_deduplicator', 'document_simhash_deduplicator',
       'ray_basic_deduplicator', 'ray_bts_minhash_deduplicator', 'ray_document_deduplicator', 'alphanumeric_filter',
       'average_line_length_filter', 'character_repetition_filter', 'flagged_words_filter', 'general_field_filter',
       'in_context_influence_filter', 'instruction_following_difficulty_filter', 'language_id_score_filter',
       'llm_analysis_filter', 'llm_difficulty_score_filter', 'llm_perplexity_filter', 'llm_quality_score_filter',
       'llm_task_relevance_filter', 'maximum_line_length_filter', 'perplexity_filter', 'special_characters_filter',
       'specified_field_filter', 'specified_numeric_field_filter', 'stopwords_filter', 'suffix_filter',
       'text_action_filter', 'text_embd_similarity_filter', 'text_entity_dependency_filter', 'text_length_filter',
       'text_pair_similarity_filter', 'token_num_filter', 'word_repetition_filter', 'words_num_filter',
       'key_value_grouper', 'naive_grouper', 'naive_reverse_grouper', 'calibrate_qa_mapper', 'calibrate_query_mapper',
       'calibrate_response_mapper', 'chinese_convert_mapper', 'clean_copyright_mapper', 'clean_email_mapper',
       'clean_html_mapper', 'clean_ip_mapper', 'clean_links_mapper', 'dialog_intent_detection_mapper',
       'dialog_sentiment_detection_mapper', 'dialog_sentiment_intensity_mapper', 'dialog_topic_detection_mapper',
       'download_file_mapper', 'expand_macro_mapper', 'extract_entity_attribute_mapper',
       'extract_entity_relation_mapper', 'extract_event_mapper', 'extract_keyword_mapper', 'extract_nickname_mapper',
       'extract_support_text_mapper', 'extract_tables_from_html_mapper', 'fix_unicode_mapper',
       'generate_qa_from_examples_mapper', 'generate_qa_from_text_mapper', 'nlpaug_en_mapper', 'nlpcda_zh_mapper',
       'optimize_prompt_mapper', 'optimize_qa_mapper', 'optimize_query_mapper', 'optimize_response_mapper',
       'pair_preference_mapper', 'punctuation_normalization_mapper', 'python_file_mapper', 'python_lambda_mapper',
       'query_intent_detection_mapper', 'query_sentiment_detection_mapper', 'query_topic_detection_mapper',
       'relation_identity_mapper', 'remove_bibliography_mapper', 'remove_comments_mapper', 'remove_header_mapper',
       'remove_long_words_mapper', 'remove_non_chinese_character_mapper', 'remove_repeat_sentences_mapper',
       'remove_specific_chars_mapper', 'remove_table_text_mapper', 'remove_words_with_incorrect_substrings_mapper',
       'replace_content_mapper', 'sdxl_prompt2prompt_mapper', 'sentence_augmentation_mapper', 'sentence_split_mapper',
       'text_chunk_mapper', 'text_tagging_by_prompt_mapper', 'whitespace_normalization_mapper',
       'frequency_specified_field_selector', 'random_selector', 'range_specified_field_selector',
       'tags_specified_field_selector', 'topk_specified_field_selector');
-- Link the image-modality data-juicer operators (deduplicators, filters,
-- mappers) to their categories. Same CROSS JOIN pattern as the text statement:
-- each listed operator is related to the image-specific category
-- ('de36b61c-...') AND the three shared categories. INSERT IGNORE makes the
-- statement idempotent on re-run (duplicate keys are silently skipped).
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1',
               '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996')
  AND o.id IN ('image_deduplicator', 'ray_image_deduplicator', 'image_aesthetics_filter', 'image_aspect_ratio_filter',
               'image_face_count_filter', 'image_face_ratio_filter', 'image_nsfw_filter',
               'image_pair_similarity_filter', 'image_shape_filter', 'image_size_filter', 'image_watermark_filter',
               'image_blur_mapper', 'image_detection_yolo_mapper', 'image_face_blur_mapper',
               'image_remove_background_mapper', 'image_segment_mapper', 'image_tagging_mapper',
               'imgdiff_difference_area_generator_mapper');
-- Link the audio-modality data-juicer operators to their categories:
-- the audio-specific category ('42dd9392-...') plus the three shared
-- categories, via the same CROSS JOIN / IN-filter pattern.
-- INSERT IGNORE keeps the seed script safely re-runnable.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('42dd9392-73e4-458c-81ff-41751ada47b5', '9eda9d5d-072b-499b-916c-797a0a8750e1',
               '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996')
  AND o.id IN ('audio_duration_filter', 'audio_nmf_snr_filter', 'audio_size_filter', 'audio_add_gaussian_noise_mapper',
               'audio_ffmpeg_wrapped_mapper');
-- Link the video-modality data-juicer operators (deduplicators, filters,
-- mappers) to their categories: the video-specific category ('a233d584-...')
-- plus the three shared categories, via the same CROSS JOIN / IN-filter
-- pattern. INSERT IGNORE keeps the statement idempotent on re-run.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('a233d584-73c8-4188-ad5d-8f7c8dda9c27', '9eda9d5d-072b-499b-916c-797a0a8750e1',
               '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996')
  AND o.id IN ('ray_video_deduplicator', 'video_deduplicator', 'video_aesthetics_filter', 'video_aspect_ratio_filter',
               'video_duration_filter', 'video_motion_score_filter', 'video_motion_score_raft_filter',
               'video_nsfw_filter', 'video_ocr_area_ratio_filter', 'video_resolution_filter',
               'video_tagging_from_frames_filter', 'video_watermark_filter', 'vggt_mapper',
               'video_depth_estimation_mapper', 'video_face_blur_mapper', 'video_ffmpeg_wrapped_mapper',
               'video_hand_reconstruction_mapper', 'video_object_segmenting_mapper', 'video_remove_watermark_mapper',
               'video_resize_aspect_ratio_mapper', 'video_resize_resolution_mapper', 'video_tagging_from_audio_mapper',
               'video_tagging_from_frames_mapper', 'video_whole_body_pose_estimation_mapper');
-- Link the cross-modal data-juicer operators (image-text and video-text
-- filters/mappers, captioning and diffusion mappers) to their categories:
-- the multimodal-specific category ('4d7dbd77-...') plus the three shared
-- categories, via the same CROSS JOIN / IN-filter pattern.
-- INSERT IGNORE keeps the statement idempotent on re-run.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('4d7dbd77-0a92-44f3-9056-2cd62d4a71e4', '9eda9d5d-072b-499b-916c-797a0a8750e1',
               '96a3b07a-3439-4557-a835-525faad60ca3', '79b385b4-fde8-4617-bcba-02a176938996')
  AND o.id IN ('image_text_matching_filter', 'image_text_similarity_filter', 'phrase_grounding_recall_filter',
               'video_frames_text_similarity_filter', 'detect_character_attributes_mapper',
               'detect_character_locations_mapper', 'detect_main_character_mapper',
               'image_captioning_from_gpt4v_mapper', 'image_captioning_mapper', 'image_diffusion_mapper',
               'imgdiff_difference_caption_generator_mapper', 'mllm_mapper', 'video_captioning_from_audio_mapper',
               'video_captioning_from_frames_mapper', 'video_captioning_from_summarizer_mapper',
               'video_captioning_from_video_mapper', 'video_captioning_from_vlm_mapper', 'video_extract_frames_mapper',
               'video_split_by_duration_mapper', 'video_split_by_key_frame_mapper', 'video_split_by_scene_mapper');

View File

@@ -44,4 +44,4 @@ values ('sys.knowledge.base.count', '200', 'number', '10,200,500', '知识库最
('BRAVE_SEARCH_API_KEY', 'api-xxx', 'string', '', 'deer-flow使用的搜索引擎所需的apiKey', 1, 1, 1, 'system', 'system'),
('JINA_API_KEY', '', 'string', '', 'deer-flow使用的JINA搜索引擎所需的apiKey', 1, 1, 1, 'system', 'system'),
('sys.management.dataset.pvc.name', 'datamate-dataset-pvc', 'string', '', '数据集所在pvc名称', 1, 0, 1, 'system', 'system'),
('test_bool', 'true', 'boolean', '', '测试布尔值', 1, 1, 1, 'system', 'system');
('DATA_JUICER_EXECUTOR', 'default', 'string', 'default,ray', 'data-juicer使用的执行器', 1, 1, 1, 'system', 'system');

View File

@@ -16,12 +16,14 @@ COPY runtime/ops/user /opt/runtime/user
COPY scripts/images/runtime/start.sh /opt/runtime/start.sh
ENV PYTHONPATH=/opt/runtime/datamate/
ENV UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV UV_INDEX_STRATEGY=unsafe-best-match
WORKDIR /opt/runtime
RUN --mount=type=cache,target=/root/.cache/uv \
UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -e . --system --index-strategy unsafe-best-match \
&& UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
uv pip install -e .[all] --system \
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
&& uv pip uninstall torch torchvision --system \
&& python -m spacy download zh_core_web_sm \
&& echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth