init datamate

2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions
@@ -0,0 +1,49 @@
+{
+  "name": "text_length_filter",
+  "displayName": "文本长度过滤器",
+  "version": "1.0.0",
+  "author": "DataMate Team",
+  "description": "根据文本长度过滤数据，支持最小和最大长度限制",
+  "category": "数据清洗",
+  "type": "CUSTOM",
+  "inputs": [
+    {
+      "name": "input_data",
+      "type": "array",
+      "description": "输入文本数组",
+      "required": true
+    }
+  ],
+  "outputs": [
+    {
+      "name": "filtered_data",
+      "type": "array",
+      "description": "过滤后的文本数组"
+    }
+  ],
+  "parameters": [
+    {
+      "name": "min_length",
+      "type": "integer",
+      "description": "最小文本长度",
+      "default": 10,
+      "min": 0
+    },
+    {
+      "name": "max_length",
+      "type": "integer",
+      "description": "最大文本长度",
+      "default": 1000,
+      "min": 1
+    },
+    {
+      "name": "text_field",
+      "type": "string",
+      "description": "文本字段名称（如果输入是对象数组）",
+      "default": "text"
+    }
+  ],
+  "tags": ["文本处理", "数据过滤", "长度检查"],
+  "documentation": "https://docs.datamate.com/operators/text-length-filter",
+  "repository": "https://github.com/datamate/operators/tree/main/text-length-filter"
+}
@@ -0,0 +1,135 @@
+"""
+文本长度过滤器算子
+根据设定的最小和最大长度过滤文本数据
+"""
+
+import json
+import logging
+from typing import Dict, Any, List, Union
+
+logger = logging.getLogger(__name__)
+
+class TextLengthFilter:
+    """Operator that keeps only the records whose text length is within bounds.
+
+    A record is either a plain string or a dict that carries its text under a
+    configurable field.  A record is kept when
+    ``min_length <= len(text) <= max_length``; every other record is dropped.
+    """
+
+    # Fallbacks used when a parameter is missing or explicitly set to None.
+    DEFAULT_MIN_LENGTH = 10
+    DEFAULT_MAX_LENGTH = 1000
+    DEFAULT_TEXT_FIELD = 'text'
+
+    def __init__(self):
+        self.name = "text_length_filter"
+        self.version = "1.0.0"
+
+    @staticmethod
+    def _parameter(parameters: Dict[str, Any], key: str, default: Any) -> Any:
+        """Return ``parameters[key]``, or ``default`` when missing or ``None``."""
+        value = parameters.get(key)
+        return default if value is None else value
+
+    @staticmethod
+    def _is_less(left: Any, right: Any) -> bool:
+        """Return ``left < right``, or ``False`` when the two cannot be ordered."""
+        try:
+            return bool(left < right)
+        except TypeError:
+            return False
+
+    @staticmethod
+    def _extract_text(item: Any, text_field: Any) -> Union[str, None]:
+        """Return the text carried by ``item``, or ``None`` when it has none.
+
+        A string is its own text; a dict contributes ``str(item[text_field])``
+        when that field is present.  Anything else yields ``None``.
+        """
+        if isinstance(item, str):
+            return item
+        if isinstance(item, dict) and text_field in item:
+            return str(item[text_field])
+        return None
+
+    def execute(self, config: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
+        """Filter the JSON array stored at ``context['input_path']`` by text length.
+
+        Args:
+            config: Operator configuration.  ``config['parameters']`` may hold
+                ``min_length`` (default 10), ``max_length`` (default 1000) and
+                ``text_field`` (default ``'text'``).  A missing or ``None``
+                value falls back to its default.
+            context: Must provide ``input_path`` (a UTF-8 JSON file holding an
+                array) and ``output_path`` (where the kept records are written
+                as JSON).
+
+        Returns:
+            A dict with ``status`` (always ``'success'``), ``statistics``
+            (the counters ``total_input``, ``too_short``, ``too_long``,
+            ``filtered_out`` and ``kept``), ``filter_rate`` (percentage of
+            dropped records, ``0`` for empty input) and ``output_path``.
+
+        Raises:
+            ValueError: If ``min_length`` is negative, ``max_length`` is
+                smaller than ``min_length``, the bounds cannot be compared as
+                numbers, or the input JSON is not an array.
+            KeyError: If ``context`` lacks ``input_path`` or ``output_path``.
+        """
+        logger.info(f"开始执行算子: {self.name}")
+
+        # ``or {}`` also covers an explicit ``"parameters": null``.
+        parameters = config.get('parameters') or {}
+        min_length = self._parameter(parameters, 'min_length', self.DEFAULT_MIN_LENGTH)
+        max_length = self._parameter(parameters, 'max_length', self.DEFAULT_MAX_LENGTH)
+        text_field = self._parameter(parameters, 'text_field', self.DEFAULT_TEXT_FIELD)
+
+        logger.info(f"过滤参数: min_length={min_length}, max_length={max_length}, text_field={text_field}")
+
+        # Report un-orderable bounds (e.g. strings) as a configuration error
+        # instead of leaking the TypeError raised by the comparison itself.
+        try:
+            if min_length < 0:
+                raise ValueError("min_length must be >= 0")
+            if max_length < min_length:
+                raise ValueError("max_length must be >= min_length")
+        except TypeError as exc:
+            raise ValueError("min_length and max_length must be integers") from exc
+
+        input_path = context['input_path']
+        with open(input_path, 'r', encoding='utf-8') as f:
+            input_data = json.load(f)
+
+        if not isinstance(input_data, list):
+            raise ValueError("输入数据必须是数组格式")
+
+        total = len(input_data)
+        logger.info(f"输入数据条数: {total}")
+
+        filtered_data = []
+        stats = {
+            'total_input': total,
+            'too_short': 0,
+            'too_long': 0,
+            'filtered_out': 0,
+            'kept': 0
+        }
+
+        for i, item in enumerate(input_data):
+            try:
+                text = self._extract_text(item, text_field)
+                if text is None:
+                    # Neither a string nor a dict holding the text field.
+                    logger.warning(f"跳过无法处理的数据项 {i}: {type(item)}")
+                    stats['filtered_out'] += 1
+                elif len(text) < min_length:
+                    stats['too_short'] += 1
+                    stats['filtered_out'] += 1
+                elif len(text) > max_length:
+                    stats['too_long'] += 1
+                    stats['filtered_out'] += 1
+                else:
+                    # The original record is kept, not just its text.
+                    filtered_data.append(item)
+                    stats['kept'] += 1
+            except Exception as e:
+                # Best effort: one malformed record must not abort the batch,
+                # so it is logged and counted as dropped.
+                logger.warning(f"处理数据项 {i} 时出错: {e}")
+                stats['filtered_out'] += 1
+
+            # Progress is reported for every 1000th record, whatever its fate.
+            done = i + 1
+            if done % 1000 == 0:
+                progress = done / total * 100
+                logger.info(f"处理进度: {progress:.1f}% ({done}/{total})")
+
+        output_path = context['output_path']
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(filtered_data, f, ensure_ascii=False, indent=2)
+
+        result = {
+            'status': 'success',
+            'statistics': stats,
+            # Guard against division by zero on an empty input array.
+            'filter_rate': stats['filtered_out'] / stats['total_input'] * 100 if stats['total_input'] > 0 else 0,
+            'output_path': output_path
+        }
+
+        logger.info(f"过滤完成: {stats}")
+        logger.info(f"过滤率: {result['filter_rate']:.2f}%")
+
+        return result
+
+    def validate_config(self, config: Dict[str, Any]) -> List[str]:
+        """Check the length bounds in ``config`` without running the operator.
+
+        Only ``min_length`` and ``max_length`` are inspected; a missing or
+        ``None`` value counts as "not provided" and is accepted.
+
+        Args:
+            config: Operator configuration with an optional ``parameters`` dict.
+
+        Returns:
+            A list of error messages, empty when the bounds are acceptable.
+            Wrongly typed values are reported in the list rather than raised.
+        """
+        errors = []
+        parameters = config.get('parameters') or {}
+
+        min_length = parameters.get('min_length')
+        max_length = parameters.get('max_length')
+
+        if min_length is not None and not isinstance(min_length, int):
+            errors.append("min_length must be an integer")
+
+        if max_length is not None and not isinstance(max_length, int):
+            errors.append("max_length must be an integer")
+
+        # Integers always compare, so an un-orderable operand has already been
+        # reported by the type checks above; _is_less just skips it.
+        if min_length is not None and self._is_less(min_length, 0):
+            errors.append("min_length must be >= 0")
+
+        if min_length is not None and max_length is not None and self._is_less(max_length, min_length):
+            errors.append("max_length must be >= min_length")
+
+        return errors
+
+def create_operator():
+    """Factory hook: build and return a new ``TextLengthFilter`` instance."""
+    operator = TextLengthFilter()
+    return operator