feature: 增加data-juicer算子 (#157)

This commit is contained in:
hhhhsc701
2025-12-11 10:32:19 +08:00
committed by GitHub
parent cfa6301e9e
commit 786f13f9c3
2 changed files with 273 additions and 1 deletions

View File

@@ -19,12 +19,22 @@ dependencies = [
"fastapi>=0.123.9", "fastapi>=0.123.9",
"jsonargparse>=4.44.0", "jsonargparse>=4.44.0",
"loguru>=0.7.3", "loguru>=0.7.3",
"opencv-python-headless>=4.12.0.88", "opencv-python-headless==4.7.0.72",
"ray[data,default]==2.52.1", "ray[data,default]==2.52.1",
"unstructured[csv,docx,pptx,xlsx]==0.18.15", "unstructured[csv,docx,pptx,xlsx]==0.18.15",
"uvicorn[standard]>=0.38.0", "uvicorn[standard]>=0.38.0",
] ]
[project.optional-dependencies]
dj = [
"py-data-juicer~=1.4.4"
]
# All dependencies
all = [
"datamate[dj]"
]
[build-system] [build-system]
requires = ["hatchling"] requires = ["hatchling"]
build-backend = "hatchling.build" build-backend = "hatchling.build"

View File

@@ -131,3 +131,265 @@ WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-
AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise', AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation', 'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect'); 'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect');
-- Seed rows for the data-juicer operators (aggregators, deduplicators, filters,
-- groupers, mappers and selectors). Every row carries version '1.4.4', NULL
-- runtime/settings, an empty file_name and is_star = false.
--
-- Column conventions used by every row below:
--   name        : short Chinese display name, never empty.
--   description : English sentence followed by its Chinese translation.
--   inputs/outputs : one of 'text', 'image', 'audio', 'video', 'multimodal'.
--
-- INSERT IGNORE (MySQL) skips rows that would raise a duplicate-key error, so
-- the statement can be re-run; presumably `id` is the primary/unique key of
-- t_operator -- TODO confirm against the table DDL. Note that rows which
-- already exist are NOT updated by a re-run.
--
-- Apostrophes inside literals are written as '' (standard SQL) rather than \'
-- so the script also loads when sql_mode contains NO_BACKSLASH_ESCAPES.
INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES
-- Aggregators
('entity_attribute_aggregator', '实体属性聚合器', 'Summarizes a given attribute of an entity from a set of documents. 汇总一组文档中实体的给定属性。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('meta_tags_aggregator', '元标签聚合器', 'Merge similar meta tags into a single, unified tag. 将类似的元标记合并到一个统一的标记中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('most_relevant_entities_aggregator', '最相关实体聚合器', 'Extracts and ranks entities closely related to a given entity from provided texts. 从提供的文本中提取与给定实体密切相关的实体并对其进行排名。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('nested_aggregator', '嵌套聚合器', 'Aggregates nested content from multiple samples into a single summary. 将多个示例中的嵌套内容聚合到单个摘要中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
-- Deduplicators
('document_deduplicator', '文档去重器', 'Deduplicates samples at the document level using exact matching. 使用完全匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('document_minhash_deduplicator', '文档MinHash去重器', 'Deduplicates samples at the document level using MinHash LSH. 使用MinHash LSH在文档级别删除重复样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('document_simhash_deduplicator', '文档SimHash去重器', 'Deduplicates samples at the document level using SimHash. 使用SimHash在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('image_deduplicator', '图像去重器', 'Deduplicates samples at the document level by exact matching of images. 通过图像的精确匹配在文档级别删除重复的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('ray_basic_deduplicator', 'Ray基础去重器', 'Backend for deduplicator. deduplicator的后端。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('ray_bts_minhash_deduplicator', 'Ray BTS MinHash去重器', 'A distributed implementation of Union-Find with load balancing. 具有负载平衡的Union-Find的分布式实现。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('ray_document_deduplicator', 'Ray文档去重器', 'Deduplicates samples at the document level using exact matching in Ray distributed mode. 在Ray分布式模式下使用精确匹配在文档级别删除重复的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('ray_image_deduplicator', 'Ray图像去重器', 'Deduplicates samples at the document level using exact matching of images in Ray distributed mode. 在Ray分布式模式下使用图像的精确匹配在文档级别删除重复样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('ray_video_deduplicator', 'Ray视频去重器', 'Deduplicates samples at document-level using exact matching of videos in Ray distributed mode. 在Ray分布式模式下使用视频的精确匹配在文档级删除重复样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_deduplicator', '视频去重器', 'Deduplicates samples at the document level using exact matching of videos. 使用视频的精确匹配在文档级别删除重复的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
-- Filters
('alphanumeric_filter', '字母数字过滤器', 'Filter to keep samples with an alphabet/numeric ratio within a specific range. 过滤器,以保持具有特定范围内的字母/数字比率的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('audio_duration_filter', '音频时长过滤器', 'Keep data samples whose audio durations are within a specified range. 保留音频持续时间在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('audio_nmf_snr_filter', '音频NMF信噪比过滤器', 'Keep data samples whose audio Signal-to-Noise Ratios (SNRs) are within a specified range. 保留音频信噪比 (snr) 在指定范围内的数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('audio_size_filter', '音频大小过滤器', 'Keep data samples based on the size of their audio files. 根据音频文件的大小保留数据样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('average_line_length_filter', '平均行长过滤器', 'Filter to keep samples with average line length within a specific range. 过滤器,以保持平均线长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('character_repetition_filter', '字符重复过滤器', 'Filter to keep samples with character-level n-gram repetition ratio within a specific range. 过滤器将具有字符级n-gram重复比的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('flagged_words_filter', '标记词过滤器', 'Filter to keep samples with flagged-word ratio in a specified range. 过滤器将标记词比率的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('general_field_filter', '通用字段过滤器', 'Filter to keep samples based on a general field filter condition. 根据常规字段筛选条件保留样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('image_aesthetics_filter', '图像美学过滤器', 'Filter to keep samples with aesthetics scores within a specific range. 过滤以保持美学分数在特定范围内的样品。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_aspect_ratio_filter', '图像长宽比过滤器', 'Filter to keep samples with image aspect ratio within a specific range. 过滤器,以保持样本的图像纵横比在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_face_count_filter', '图像人脸计数过滤器', 'Filter to keep samples with the number of faces within a specific range. 过滤以保持样本的面数在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_face_ratio_filter', '图像人脸占比过滤器', 'Filter to keep samples with face area ratios within a specific range. 过滤以保持面面积比在特定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_nsfw_filter', '图像NSFW过滤器', 'Filter to keep samples whose images have nsfw scores in a specified range. 过滤器保留其图像的nsfw分数在指定范围内的样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_pair_similarity_filter', '图像对相似度过滤器', 'Filter to keep image pairs with similarities between images within a specific range. 过滤器将图像之间具有相似性的图像对保持在特定范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_shape_filter', '图像形状过滤器', 'Filter to keep samples with image shape (width, height) within specific ranges. 过滤器,以保持样本的图像形状 (宽度,高度) 在特定的范围内。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_size_filter', '图像大小过滤器', 'Keep data samples whose image size (in Bytes/KB/MB/...) is within a specific range. 保留图像大小 (以字节/KB/MB/... 为单位) 在特定范围内的数据样本。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_text_matching_filter', '图文匹配过滤器', 'Filter to keep samples with image-text matching scores within a specific range. 过滤器将图像文本匹配分数的样本保持在特定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_text_similarity_filter', '图文相似度过滤器', 'Filter to keep samples with image-text similarity within a specified range. 过滤器将具有图像-文本相似性的样本保持在指定范围内。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_watermark_filter', '图像水印过滤器', 'Filter to keep samples whose images have no watermark with high probability. 过滤器以保持其图像没有水印的样本具有高概率。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('in_context_influence_filter', '上下文影响过滤器', 'Filter to keep texts based on their in-context influence on a validation set. 过滤以根据文本在上下文中对验证集的影响来保留文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('instruction_following_difficulty_filter', '指令跟随难度过滤器', 'Filter to keep texts based on their instruction following difficulty (IFD, https://arxiv.org/abs/2308.12032) score. 过滤以保持文本基于他们的指令跟随难度 (IFD, https://arxiv.org/abs/2308.12032) 分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('language_id_score_filter', '语种识别得分过滤器', 'Filter to keep samples in a specific language with a confidence score above a threshold. 过滤器以保留置信度高于阈值的特定语言的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_analysis_filter', 'LLM分析过滤器', 'Base filter class for leveraging LLMs to analyze and filter data samples. 用于利用LLMs分析和过滤数据样本的基本筛选器类。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_difficulty_score_filter', 'LLM难度得分过滤器', 'Filter to keep samples with high difficulty scores estimated by an LLM. 过滤器以保留由LLM估计的高难度分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_perplexity_filter', 'LLM困惑度过滤器', 'Filter to keep samples with perplexity scores within a specified range, computed using a specified LLM. 过滤器将困惑分数的样本保留在指定范围内,使用指定的LLM计算。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_quality_score_filter', 'LLM质量得分过滤器', 'Filter to keep samples with a high quality score estimated by a language model. 过滤器,以保留具有语言模型估计的高质量分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('llm_task_relevance_filter', 'LLM任务相关性过滤器', 'Filter to keep samples with high relevance scores to validation tasks estimated by an LLM. 过滤器以保留与LLM估计的验证任务具有高相关性分数的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('maximum_line_length_filter', '最大行长过滤器', 'Filter to keep samples with a maximum line length within a specified range. 筛选器将最大行长度的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('perplexity_filter', '困惑度过滤器', 'Filter to keep samples with perplexity score in a specified range. 过滤以保持困惑分数在指定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('phrase_grounding_recall_filter', '短语定位召回过滤器', 'Filter to keep samples based on the phrase grounding recall of phrases extracted from text in images. 根据从图像中的文本中提取的短语接地召回来过滤以保留样本。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('special_characters_filter', '特殊字符过滤器', 'Filter to keep samples with special-character ratio within a specific range. 过滤器,以将具有特殊字符比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('specified_field_filter', '指定字段过滤器', 'Filter samples based on the specified field information. 根据指定的字段信息筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('specified_numeric_field_filter', '指定数值字段过滤器', 'Filter samples based on a specified numeric field value. 根据指定的数值字段值筛选样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('stopwords_filter', '停用词过滤器', 'Filter to keep samples with stopword ratio within a specified range. 过滤器将停止词比率的样本保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('suffix_filter', '后缀过滤器', 'Filter to keep samples with specified suffix. 过滤器以保留具有指定后缀的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_action_filter', '文本动作过滤器', 'Filter to keep texts that contain a minimum number of actions. 过滤以保留包含最少数量操作的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_embd_similarity_filter', '文本嵌入相似度过滤器', 'Filter to keep texts whose average embedding similarity to a set of given validation texts falls within a specific range. 过滤器,以保留与一组给定验证文本的平均嵌入相似度在特定范围内的文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_entity_dependency_filter', '文本实体依赖过滤器', 'Identify and filter text samples based on entity dependencies. 根据实体依赖关系识别和过滤文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_length_filter', '文本长度过滤器', 'Filter to keep samples with total text length within a specific range. 过滤以保持文本总长度在特定范围内的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_pair_similarity_filter', '文本对相似度过滤器', 'Filter to keep text pairs with similarities within a specific range. 过滤以将具有相似性的文本对保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('token_num_filter', 'Token数量过滤器', 'Filter to keep samples with a total token number within a specified range. 筛选器将总令牌数的样本保留在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('video_aesthetics_filter', '视频美学过滤器', 'Filter to keep data samples with aesthetics scores for specified frames in the videos within a specific range. 过滤器将视频中指定帧的美学得分数据样本保留在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_aspect_ratio_filter', '视频长宽比过滤器', 'Filter to keep samples with video aspect ratio within a specific range. 过滤器将视频纵横比的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_duration_filter', '视频时长过滤器', 'Keep data samples whose videos'' durations are within a specified range. 保留视频持续时间在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_frames_text_similarity_filter', '视频帧文本相似度过滤器', 'Filter to keep samples based on the similarity between video frame images and text within a specific range. 根据视频帧图像和文本之间的相似度进行过滤,以保留特定范围内的样本。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_motion_score_filter', '视频运动得分过滤器', 'Filter to keep samples with video motion scores within a specific range. 过滤器将视频运动分数的样本保持在特定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_motion_score_raft_filter', '视频RAFT运动得分过滤器', 'Filter to keep samples with video motion scores within a specified range. 过滤器将视频运动分数的样本保持在指定范围内。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_nsfw_filter', '视频NSFW过滤器', 'Filter to keep samples whose videos have nsfw scores in a specified range. 过滤器保留其视频的nsfw分数在指定范围内的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_ocr_area_ratio_filter', '视频OCR面积占比过滤器', 'Keep data samples whose detected text area ratios for specified frames in the video are within a specified range. 保留视频中指定帧的检测到的文本面积比率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_resolution_filter', '视频分辨率过滤器', 'Keep data samples whose videos'' resolutions are within a specified range. 保留视频分辨率在指定范围内的数据样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_tagging_from_frames_filter', '视频帧标签过滤器', 'Filter to keep samples whose videos contain specified tags. 过滤器以保留其视频包含指定标签的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_watermark_filter', '视频水印过滤器', 'Filter to keep samples whose videos have no watermark with high probability. 过滤器以保持其视频具有高概率没有水印的样本。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('word_repetition_filter', '单词重复过滤器', 'Filter to keep samples with word-level n-gram repetition ratio within a specific range. 过滤器将单词级n-gram重复比率的样本保持在特定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('words_num_filter', '词数过滤器', 'Filter to keep samples with a total word count within a specified range. 过滤器将样本的总字数保持在指定范围内。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
-- Groupers
('key_value_grouper', '键值分组器', 'Groups samples into batches based on values in specified keys. 根据指定键中的值将样本分组为批处理。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('naive_grouper', '朴素分组器', 'Group all samples in a dataset into a single batched sample. 将数据集中的所有样本分组为单个批处理样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('naive_reverse_grouper', '朴素反向分组器', 'Split batched samples into individual samples. 将批处理的样品分成单个样品。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
-- Mappers
('audio_add_gaussian_noise_mapper', '音频高斯噪声添加映射器', 'Mapper to add Gaussian noise to audio samples. 映射器将高斯噪声添加到音频样本。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('audio_ffmpeg_wrapped_mapper', '音频FFmpeg封装映射器', 'Wraps FFmpeg audio filters for processing audio files in a dataset. 包装FFmpeg音频过滤器,用于处理数据集中的音频文件。', '1.4.4', 'audio', 'audio', NULL, NULL, '', false),
('calibrate_qa_mapper', 'QA校准映射器', 'Calibrates question-answer pairs based on reference text using an API model. 使用API模型根据参考文本校准问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('calibrate_query_mapper', '查询校准映射器', 'Calibrate query in question-answer pairs based on reference text. 基于参考文本校准问答对中的查询。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('calibrate_response_mapper', '回复校准映射器', 'Calibrate response in question-answer pairs based on reference text. 根据参考文本校准问答对中的回答。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('chinese_convert_mapper', '中文简繁转换映射器', 'Mapper to convert Chinese text between Traditional, Simplified, and Japanese Kanji. 映射器在繁体、简体和日文汉字之间转换中文文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_copyright_mapper', '版权清洗映射器', 'Cleans copyright comments at the beginning of text samples. 清除文本示例开头的版权注释。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_email_mapper', '邮箱清洗映射器', 'Cleans email addresses from text samples using a regular expression. 使用正则表达式从文本示例中清除电子邮件地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_html_mapper', 'HTML清洗映射器', 'Cleans HTML code from text samples, converting HTML to plain text. 从文本示例中清除HTML代码,将HTML转换为纯文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_ip_mapper', 'IP清洗映射器', 'Cleans IPv4 and IPv6 addresses from text samples. 从文本示例中清除IPv4和IPv6地址。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('clean_links_mapper', '链接清洗映射器', 'Mapper to clean links like http/https/ftp in text samples. 映射器来清理链接,如文本示例中的http/https/ftp。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('detect_character_attributes_mapper', '角色属性检测映射器', 'Takes an image, a caption, and main character names as input to extract the characters'' attributes. 将图像、标题和主要角色名称作为输入,以提取角色的属性。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('detect_character_locations_mapper', '角色位置检测映射器', 'Given an image and a list of main character names, extract the bounding boxes for each present character. (YOLOE + MLLM) 给定图像和主要角色名称列表,提取每个出现角色的边界框。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('detect_main_character_mapper', '主要角色检测映射器', 'Extract all main character names based on the given image and its caption. 根据给定的图像及其标题提取所有主要角色名称。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('dialog_intent_detection_mapper', '对话意图检测映射器', 'Generates user''s intent labels in a dialog by analyzing the history, query, and response. 通过分析历史记录、查询和响应,在对话框中生成用户的意图标签。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('dialog_sentiment_detection_mapper', '对话情感检测映射器', 'Generates sentiment labels and analysis for user queries in a dialog. 在对话框中为用户查询生成情绪标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('dialog_sentiment_intensity_mapper', '对话情感强度映射器', 'Mapper to predict user''s sentiment intensity in a dialog, ranging from -5 to 5. Mapper预测用户在对话框中的情绪强度,范围从-5到5。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('dialog_topic_detection_mapper', '对话主题检测映射器', 'Generates user''s topic labels and analysis in a dialog. 在对话框中生成用户的主题标签和分析。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('download_file_mapper', '文件下载映射器', 'Mapper to download URL files to local files or load them into memory. 映射器将URL文件下载到本地文件或将其加载到内存中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('expand_macro_mapper', '宏展开映射器', 'Expands macro definitions in the document body of LaTeX samples. 展开LaTeX示例文档主体中的宏定义。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_entity_attribute_mapper', '实体属性提取映射器', 'Extracts attributes for given entities from the text and stores them in the sample''s metadata. 从文本中提取给定实体的属性,并将其存储在样本的元数据中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_entity_relation_mapper', '实体关系提取映射器', 'Extracts entities and relations from text to build a knowledge graph. 从文本中提取实体和关系以构建知识图谱。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_event_mapper', '事件提取映射器', 'Extracts events and relevant characters from the text. 从文本中提取事件和相关角色。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_keyword_mapper', '关键词提取映射器', 'Generate keywords for the text. 为文本生成关键词。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_nickname_mapper', '昵称提取映射器', 'Extracts nickname relationships in the text using a language model. 使用语言模型提取文本中的昵称关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_support_text_mapper', '支撑文本提取映射器', 'Extracts a supporting sub-text from the original text based on a given summary. 根据给定的摘要从原始文本中提取支撑性子文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('extract_tables_from_html_mapper', 'HTML表格提取映射器', 'Extracts tables from HTML content and stores them in a specified field. 从HTML内容中提取表并将其存储在指定字段中。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('fix_unicode_mapper', 'Unicode修复映射器', 'Fixes unicode errors in text samples. 修复文本示例中的unicode错误。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('generate_qa_from_examples_mapper', '示例生成QA映射器', 'Generates question and answer pairs from examples using a Hugging Face model. 使用Hugging Face模型从示例生成问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('generate_qa_from_text_mapper', '文本生成QA映射器', 'Generates question and answer pairs from text using a specified model. 使用指定的模型从文本生成问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('image_blur_mapper', '图像模糊映射器', 'Blurs images in the dataset with a specified probability and blur type. 使用指定的概率和模糊类型对数据集中的图像进行模糊处理。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_captioning_from_gpt4v_mapper', 'GPT4V图像描述映射器', 'Generates text captions for images using the GPT-4 Vision model. 使用GPT-4 Vision模型为图像生成文本描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_captioning_mapper', '图像描述映射器', 'Generates image captions using a Hugging Face model and appends them to samples. 使用Hugging Face模型生成图像描述并将其附加到样本中。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_detection_yolo_mapper', 'YOLO图像检测映射器', 'Perform object detection using YOLO on images and return bounding boxes and class labels. 使用YOLO对图像执行对象检测,并返回边界框和类标签。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_diffusion_mapper', '图像扩散生成映射器', 'Generate images using a diffusion model based on provided captions. 使用扩散模型根据提供的描述生成图像。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('image_face_blur_mapper', '图像人脸模糊映射器', 'Mapper to blur faces detected in images. 映射器对图像中检测到的人脸进行模糊处理。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_remove_background_mapper', '图像背景移除映射器', 'Mapper to remove the background of images. 映射器移除图像的背景。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_segment_mapper', '图像分割映射器', 'Perform segment-anything on images and return the bounding boxes. 对图像执行segment-anything并返回边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('image_tagging_mapper', '图像打标映射器', 'Generates image tags for each image in the sample. 为样本中的每个图像生成图像标签。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('imgdiff_difference_area_generator_mapper', 'ImgDiff差异区域生成映射器', 'Generates and filters bounding boxes for image pairs based on similarity, segmentation, and text matching. 根据相似性、分割和文本匹配为图像对生成并过滤边界框。', '1.4.4', 'image', 'image', NULL, NULL, '', false),
('imgdiff_difference_caption_generator_mapper', 'ImgDiff差异描述生成映射器', 'Generates difference captions for bounding box regions in two images. 为两个图像中的边界框区域生成差异描述。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('mllm_mapper', 'MLLM视觉问答映射器', 'Mapper to use MLLMs for visual question answering tasks. Mapper使用MLLMs进行视觉问答任务。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('nlpaug_en_mapper', 'NLPAug英语增强映射器', 'Augments English text samples using various methods from the nlpaug library. 使用nlpaug库中的各种方法增强英语文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('nlpcda_zh_mapper', 'NLPCDA中文增强映射器', 'Augments Chinese text samples using the nlpcda library. 使用nlpcda库扩充中文文本样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_prompt_mapper', 'Prompt优化映射器', 'Optimize prompts based on existing ones in the same batch. 根据同一批次中的现有提示优化提示。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_qa_mapper', 'QA优化映射器', 'Mapper to optimize question-answer pairs. 映射器优化问答对。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_query_mapper', '查询优化映射器', 'Optimize queries in question-answer pairs to make them more specific and detailed. 优化问答对中的查询,使其更加具体和详细。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('optimize_response_mapper', '回复优化映射器', 'Optimize response in question-answer pairs to be more detailed and specific. 优化问答对中的回复,使其更加详细和具体。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('pair_preference_mapper', '成对偏好映射器', 'Mapper to construct paired preference samples by generating a rejected response and its reason. Mapper通过生成拒绝响应及其原因来构造成对的偏好样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('punctuation_normalization_mapper', '标点规范化映射器', 'Normalizes unicode punctuations to their English equivalents in text samples. 将unicode标点规范化为文本示例中的英语等效项。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('python_file_mapper', 'Python文件映射器', 'Executes a Python function defined in a file on input data. 对输入数据执行文件中定义的Python函数。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('python_lambda_mapper', 'Python Lambda映射器', 'Mapper for applying a Python lambda function to data samples. Mapper将Python lambda函数应用于数据样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('query_intent_detection_mapper', '查询意图检测映射器', 'Predicts the user''s intent label and corresponding score for a given query. 为给定查询预测用户的意图标签和相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('query_sentiment_detection_mapper', '查询情感检测映射器', 'Predicts user''s sentiment label (''negative'', ''neutral'', ''positive'') in a query. 预测查询中用户的情绪标签 (负面、中性、正面)。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('query_topic_detection_mapper', '查询主题检测映射器', 'Predicts the topic label and its corresponding score for a given query. 预测给定查询的主题标签及其相应的分数。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('relation_identity_mapper', '关系识别映射器', 'Identify the relation between two entities in a given text. 识别给定文本中两个实体之间的关系。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_bibliography_mapper', '参考文献移除映射器', 'Removes bibliography sections at the end of LaTeX documents. 删除LaTeX文档末尾的参考书目部分。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_comments_mapper', '注释移除映射器', 'Removes comments from documents, currently supporting only ''tex'' format. 从文档中删除注释,目前仅支持''tex''格式。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_header_mapper', '页眉移除映射器', 'Removes headers at the beginning of documents in LaTeX samples. 删除LaTeX示例中文档开头的标题。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_long_words_mapper', '长词移除映射器', 'Mapper to remove long words within a specific range. 映射器删除特定范围内的长词。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_non_chinese_character_mapper', '非中文字符移除映射器', 'Removes non-Chinese characters from text samples. 从文本样本中删除非中文字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_repeat_sentences_mapper', '重复句子移除映射器', 'Mapper to remove repeat sentences in text samples. 映射器删除文本样本中的重复句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_specific_chars_mapper', '指定字符移除映射器', 'Removes specific characters from text samples. 从文本样本中删除特定字符。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_table_text_mapper', '表格文本移除映射器', 'Mapper to remove table texts from text samples. 映射器从文本样本中删除表格文本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('remove_words_with_incorrect_substrings_mapper', '错误子串词移除映射器', 'Mapper to remove words containing specified incorrect substrings. 映射器删除包含指定的不正确子字符串的单词。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('replace_content_mapper', '内容替换映射器', 'Replaces content in the text that matches a specific regular expression pattern with a designated replacement string. 用指定的替换字符串替换文本中与特定正则表达式模式匹配的内容。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('sdxl_prompt2prompt_mapper', 'SDXL Prompt2Prompt映射器', 'Generates pairs of similar images using the SDXL model. 使用SDXL模型生成成对的相似图像。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('sentence_augmentation_mapper', '句子增强映射器', 'Augments sentences by generating enhanced versions using a Hugging Face model. 使用Hugging Face模型生成增强版本来扩充句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('sentence_split_mapper', '句子切分映射器', 'Splits text samples into individual sentences based on the specified language. 根据指定的语言将文本样本拆分为单个句子。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_chunk_mapper', '文本分块映射器', 'Split input text into chunks based on specified criteria. 根据指定的条件将输入文本拆分为块。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('text_tagging_by_prompt_mapper', 'Prompt文本打标映射器', 'Mapper to generate text tags using prompt with LLM. Mapper使用带有LLM的prompt生成文本标记。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('vggt_mapper', 'VGGT视频提取映射器', 'Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 输入单个场景的视频,并使用VGGT提取包括相机姿态、深度图、点图和3D点轨迹的信息。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_captioning_from_audio_mapper', '音频生成视频描述映射器', 'Mapper to caption a video according to its audio streams based on Qwen-Audio model. 映射器基于qwen-audio模型,根据视频的音频流为视频添加字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_frames_mapper', '帧生成视频描述映射器', 'Generates video captions from sampled frames using an image-to-text model. 使用图像到文本模型从采样帧生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_summarizer_mapper', '摘要生成视频描述映射器', 'Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...). 映射器通过总结多种生成的文本 (来自视频/音频/帧的字幕,来自音频/帧的标签,...) 来生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_video_mapper', '视频生成视频描述映射器', 'Generates video captions using a Hugging Face video-to-text model and sampled video frames. 使用Hugging Face视频到文本模型和采样的视频帧生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_captioning_from_vlm_mapper', 'VLM视频描述映射器', 'Generates video captions using a VLM that accepts videos as inputs. 使用接受视频作为输入的VLM生成视频字幕。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_depth_estimation_mapper', '视频深度估计映射器', 'Perform depth estimation on the video. 对视频执行深度估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_extract_frames_mapper', '视频抽帧映射器', 'Mapper to extract frames from video files according to specified methods. 映射器根据指定的方法从视频文件中提取帧。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_face_blur_mapper', '视频人脸模糊映射器', 'Mapper to blur faces detected in videos. 映射器对视频中检测到的人脸进行模糊处理。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_ffmpeg_wrapped_mapper', '视频FFmpeg封装映射器', 'Wraps FFmpeg video filters for processing video files in a dataset. 包装FFmpeg视频过滤器,用于处理数据集中的视频文件。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_hand_reconstruction_mapper', '视频手部重建映射器', 'Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_object_segmenting_mapper', '视频对象分割映射器', 'Text-guided semantic segmentation of valid objects throughout the video (YOLOE + SAM2). 对整个视频中的有效对象进行文本引导的语义分割 (YOLOE + SAM2)。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_remove_watermark_mapper', '视频水印移除映射器', 'Remove watermarks from videos based on specified regions. 根据指定区域从视频中删除水印。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_resize_aspect_ratio_mapper', '视频长宽比调整映射器', 'Resizes videos to fit within a specified aspect ratio range. 调整视频大小以适应指定的纵横比范围。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_resize_resolution_mapper', '视频分辨率调整映射器', 'Resizes video resolution based on specified width and height constraints. 根据指定的宽度和高度约束调整视频分辨率。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_split_by_duration_mapper', '视频按时长切分映射器', 'Splits videos into segments based on a specified duration. 根据指定的持续时间将视频拆分为片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_split_by_key_frame_mapper', '视频按关键帧切分映射器', 'Splits a video into segments based on key frames. 根据关键帧将视频拆分为片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_split_by_scene_mapper', '视频按场景切分映射器', 'Splits videos into scene clips based on detected scene changes. 根据检测到的场景变化将视频拆分为场景片段。', '1.4.4', 'multimodal', 'multimodal', NULL, NULL, '', false),
('video_tagging_from_audio_mapper', '音频生成视频标签映射器', 'Generates video tags from audio streams using the Audio Spectrogram Transformer. 使用Audio Spectrogram Transformer从音频流生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_tagging_from_frames_mapper', '帧生成视频标签映射器', 'Generates video tags from frames extracted from videos. 从视频中提取的帧生成视频标签。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('video_whole_body_pose_estimation_mapper', '视频全身姿态估计映射器', 'Input a video containing people, and use the DWPose model to extract the body, hand, feet, and face keypoints of the human subjects in the video, i.e., 2D Whole-body Pose Estimation. 输入包含人物的视频,并使用DWPose模型来提取视频中人类主体的身体、手、脚和面部关键点,即2D全身姿态估计。', '1.4.4', 'video', 'video', NULL, NULL, '', false),
('whitespace_normalization_mapper', '空白字符规范化映射器', 'Normalizes various types of whitespace characters to standard spaces in text samples. 将文本样本中各种类型的空白字符规范化为标准空格。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
-- Selectors
('frequency_specified_field_selector', '指定字段频率选择器', 'Selector to filter samples based on the frequency of a specified field. 选择器根据指定字段的频率过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('random_selector', '随机选择器', 'Randomly selects a subset of samples from the dataset. 从数据集中随机选择样本子集。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('range_specified_field_selector', '指定字段范围选择器', 'Selects a range of samples based on the sorted values of a specified field. 根据指定字段的排序值选择一定范围的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('tags_specified_field_selector', '指定字段标签选择器', 'Selector to filter samples based on the tags of a specified field. 选择器根据指定字段的标签过滤样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false),
('topk_specified_field_selector', 'TopK指定字段选择器', 'Selects top samples based on the sorted values of a specified field. 根据指定字段的排序值选择排名靠前的样本。', '1.4.4', 'text', 'text', NULL, NULL, '', false);
-- Link the text-oriented data-juicer operators to three categories in one
-- statement: the CROSS JOIN pairs every listed category with every listed
-- operator that exists in t_operator.
-- NOTE(review): 'd8a5df7a-52a9-42c2-83c4-01062e60f597' is presumably the text
-- modality category (the operator rows visible nearby are registered with
-- 'text' inputs/outputs); the meaning of the other two category ids is not
-- visible here -- confirm against the t_operator_category seed data.
-- INSERT IGNORE downgrades duplicate-key errors to warnings, so a re-run is
-- harmless provided the relation table has a unique key on the pair (TODO confirm).
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    cat.id AS category_id,
    op.id AS operator_id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (
        'd8a5df7a-52a9-42c2-83c4-01062e60f597',
        '9eda9d5d-072b-499b-916c-797a0a8750e1',
        '96a3b07a-3439-4557-a835-525faad60ca3'
    )
    AND op.id IN (
        -- ids ending in _aggregator
        'entity_attribute_aggregator', 'meta_tags_aggregator',
        'most_relevant_entities_aggregator', 'nested_aggregator',
        -- ids ending in _deduplicator
        'document_deduplicator', 'document_minhash_deduplicator',
        'document_simhash_deduplicator', 'ray_basic_deduplicator',
        'ray_bts_minhash_deduplicator', 'ray_document_deduplicator',
        -- ids ending in _filter
        'alphanumeric_filter', 'average_line_length_filter',
        'character_repetition_filter', 'flagged_words_filter',
        'general_field_filter', 'in_context_influence_filter',
        'instruction_following_difficulty_filter', 'language_id_score_filter',
        'llm_analysis_filter', 'llm_difficulty_score_filter',
        'llm_perplexity_filter', 'llm_quality_score_filter',
        'llm_task_relevance_filter', 'maximum_line_length_filter',
        'perplexity_filter', 'special_characters_filter',
        'specified_field_filter', 'specified_numeric_field_filter',
        'stopwords_filter', 'suffix_filter',
        'text_action_filter', 'text_embd_similarity_filter',
        'text_entity_dependency_filter', 'text_length_filter',
        'text_pair_similarity_filter', 'token_num_filter',
        'word_repetition_filter', 'words_num_filter',
        -- ids ending in _grouper
        'key_value_grouper', 'naive_grouper', 'naive_reverse_grouper',
        -- ids ending in _mapper
        'calibrate_qa_mapper', 'calibrate_query_mapper',
        'calibrate_response_mapper', 'chinese_convert_mapper',
        'clean_copyright_mapper', 'clean_email_mapper',
        'clean_html_mapper', 'clean_ip_mapper', 'clean_links_mapper',
        'dialog_intent_detection_mapper', 'dialog_sentiment_detection_mapper',
        'dialog_sentiment_intensity_mapper', 'dialog_topic_detection_mapper',
        'download_file_mapper', 'expand_macro_mapper',
        'extract_entity_attribute_mapper', 'extract_entity_relation_mapper',
        'extract_event_mapper', 'extract_keyword_mapper',
        'extract_nickname_mapper', 'extract_support_text_mapper',
        'extract_tables_from_html_mapper', 'fix_unicode_mapper',
        'generate_qa_from_examples_mapper', 'generate_qa_from_text_mapper',
        'nlpaug_en_mapper', 'nlpcda_zh_mapper',
        'optimize_prompt_mapper', 'optimize_qa_mapper',
        'optimize_query_mapper', 'optimize_response_mapper',
        'pair_preference_mapper', 'punctuation_normalization_mapper',
        'python_file_mapper', 'python_lambda_mapper',
        'query_intent_detection_mapper', 'query_sentiment_detection_mapper',
        'query_topic_detection_mapper', 'relation_identity_mapper',
        'remove_bibliography_mapper', 'remove_comments_mapper',
        'remove_header_mapper', 'remove_long_words_mapper',
        'remove_non_chinese_character_mapper', 'remove_repeat_sentences_mapper',
        'remove_specific_chars_mapper', 'remove_table_text_mapper',
        'remove_words_with_incorrect_substrings_mapper',
        'replace_content_mapper', 'sdxl_prompt2prompt_mapper',
        'sentence_augmentation_mapper', 'sentence_split_mapper',
        'text_chunk_mapper', 'text_tagging_by_prompt_mapper',
        'whitespace_normalization_mapper',
        -- ids ending in _selector
        'frequency_specified_field_selector', 'random_selector',
        'range_specified_field_selector', 'tags_specified_field_selector',
        'topk_specified_field_selector'
    );
-- Link the image-oriented data-juicer operators to three categories: every
-- listed category is paired with every listed operator found in t_operator.
-- NOTE(review): 'de36b61c-9e8a-4422-8c31-d30585c7100f' is also used earlier in
-- this file for the Img* operators, so it is presumably the image modality
-- category; the meaning of the other two ids is not visible here -- confirm
-- against the t_operator_category seed data.
-- INSERT IGNORE downgrades duplicate-key errors to warnings, so a re-run is
-- harmless provided the relation table has a unique key on the pair (TODO confirm).
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    cat.id AS category_id,
    op.id AS operator_id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (
        'de36b61c-9e8a-4422-8c31-d30585c7100f',
        '9eda9d5d-072b-499b-916c-797a0a8750e1',
        '96a3b07a-3439-4557-a835-525faad60ca3'
    )
    AND op.id IN (
        -- ids ending in _deduplicator
        'image_deduplicator', 'ray_image_deduplicator',
        -- ids ending in _filter
        'image_aesthetics_filter', 'image_aspect_ratio_filter',
        'image_face_count_filter', 'image_face_ratio_filter',
        'image_nsfw_filter', 'image_pair_similarity_filter',
        'image_shape_filter', 'image_size_filter',
        'image_watermark_filter',
        -- ids ending in _mapper
        'image_blur_mapper', 'image_detection_yolo_mapper',
        'image_face_blur_mapper', 'image_remove_background_mapper',
        'image_segment_mapper', 'image_tagging_mapper',
        'imgdiff_difference_area_generator_mapper'
    );
-- Link the audio-oriented data-juicer operators to three categories: every
-- listed category is paired with every listed operator found in t_operator.
-- NOTE(review): '42dd9392-73e4-458c-81ff-41751ada47b5' is presumably the audio
-- modality category (all operator ids here start with audio_); the meaning of
-- the other two ids is not visible here -- confirm against the
-- t_operator_category seed data.
-- INSERT IGNORE downgrades duplicate-key errors to warnings, so a re-run is
-- harmless provided the relation table has a unique key on the pair (TODO confirm).
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    cat.id AS category_id,
    op.id AS operator_id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (
        '42dd9392-73e4-458c-81ff-41751ada47b5',
        '9eda9d5d-072b-499b-916c-797a0a8750e1',
        '96a3b07a-3439-4557-a835-525faad60ca3'
    )
    AND op.id IN (
        -- ids ending in _filter
        'audio_duration_filter', 'audio_nmf_snr_filter', 'audio_size_filter',
        -- ids ending in _mapper
        'audio_add_gaussian_noise_mapper', 'audio_ffmpeg_wrapped_mapper'
    );
-- Link the video-oriented data-juicer operators to three categories: every
-- listed category is paired with every listed operator found in t_operator.
-- NOTE(review): 'a233d584-73c8-4188-ad5d-8f7c8dda9c27' is presumably the video
-- modality category (the operator rows visible nearby are registered with
-- 'video' inputs/outputs); the meaning of the other two ids is not visible
-- here -- confirm against the t_operator_category seed data.
-- INSERT IGNORE downgrades duplicate-key errors to warnings, so a re-run is
-- harmless provided the relation table has a unique key on the pair (TODO confirm).
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    cat.id AS category_id,
    op.id AS operator_id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (
        'a233d584-73c8-4188-ad5d-8f7c8dda9c27',
        '9eda9d5d-072b-499b-916c-797a0a8750e1',
        '96a3b07a-3439-4557-a835-525faad60ca3'
    )
    AND op.id IN (
        -- ids ending in _deduplicator
        'ray_video_deduplicator', 'video_deduplicator',
        -- ids ending in _filter
        'video_aesthetics_filter', 'video_aspect_ratio_filter',
        'video_duration_filter', 'video_motion_score_filter',
        'video_motion_score_raft_filter', 'video_nsfw_filter',
        'video_ocr_area_ratio_filter', 'video_resolution_filter',
        'video_tagging_from_frames_filter', 'video_watermark_filter',
        -- ids ending in _mapper
        'vggt_mapper', 'video_depth_estimation_mapper',
        'video_face_blur_mapper', 'video_ffmpeg_wrapped_mapper',
        'video_hand_reconstruction_mapper', 'video_object_segmenting_mapper',
        'video_remove_watermark_mapper', 'video_resize_aspect_ratio_mapper',
        'video_resize_resolution_mapper', 'video_tagging_from_audio_mapper',
        'video_tagging_from_frames_mapper',
        'video_whole_body_pose_estimation_mapper'
    );
-- Link the cross-modality data-juicer operators to three categories: every
-- listed category is paired with every listed operator found in t_operator.
-- NOTE(review): '4d7dbd77-0a92-44f3-9056-2cd62d4a71e4' is presumably the
-- multimodal category (the video_split_by_* rows visible nearby are registered
-- with 'multimodal' inputs/outputs); the meaning of the other two ids is not
-- visible here -- confirm against the t_operator_category seed data.
-- INSERT IGNORE downgrades duplicate-key errors to warnings, so a re-run is
-- harmless provided the relation table has a unique key on the pair (TODO confirm).
INSERT IGNORE INTO t_operator_category_relation (category_id, operator_id)
SELECT
    cat.id AS category_id,
    op.id AS operator_id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (
        '4d7dbd77-0a92-44f3-9056-2cd62d4a71e4',
        '9eda9d5d-072b-499b-916c-797a0a8750e1',
        '96a3b07a-3439-4557-a835-525faad60ca3'
    )
    AND op.id IN (
        -- ids ending in _filter
        'image_text_matching_filter', 'image_text_similarity_filter',
        'phrase_grounding_recall_filter', 'video_frames_text_similarity_filter',
        -- ids ending in _mapper
        'detect_character_attributes_mapper', 'detect_character_locations_mapper',
        'detect_main_character_mapper', 'image_captioning_from_gpt4v_mapper',
        'image_captioning_mapper', 'image_diffusion_mapper',
        'imgdiff_difference_caption_generator_mapper', 'mllm_mapper',
        'video_captioning_from_audio_mapper', 'video_captioning_from_frames_mapper',
        'video_captioning_from_summarizer_mapper', 'video_captioning_from_video_mapper',
        'video_captioning_from_vlm_mapper', 'video_extract_frames_mapper',
        'video_split_by_duration_mapper', 'video_split_by_key_frame_mapper',
        'video_split_by_scene_mapper'
    );