init datamate

2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='FullWidthCharacterCleaner',
+                          module_path="ops.mapper.full_width_characters_cleaner.process")
@@ -0,0 +1,18 @@
+name: '全角转半角'
+name_en: 'Full-to-Half Width Character'
+description: '将文档中的所有全角字符转换成半角字符。'
+description_en: 'Converts all full-width characters in documents to half-width characters.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'FullWidthCharacterCleaner'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'text'
+effect:
+  before: 'Ｒｅｓｉｄｅｎｔｉａｌ ａｎｄ ｃｏｍｍｅｒｃｉａｌ ｄｅｓｉｇｎ, ｓｉｔｅ ｉｎｓｐｅｃｔｉｏｎｓ, ｗｏｒｋｉｎｇ ｄｒａｗｉｎｇｓ,
+    Ｍｉｎｉｃａｄ, ｒｅｎｄｅｒｉｎｇｓ．'
+  after: 'Residential and commercial design, site inspections, working drawings, MiniCad,
+    renderings.'
+inputs: 'text'
+outputs: 'text'
@@ -0,0 +1,46 @@
+#!/user/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Description: 全角转半角
+Create: 2025/01/13
+"""
+import time
+from typing import Dict, Any
+
+from loguru import logger
+
+from datamate.core.base_op import Mapper
+
+
+class FullWidthCharacterCleaner(Mapper):
+    """将文档中的所有全角字符转换成半角字符"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._full_to_half_dict = {
+            '＂': '"', '＃': '#', '＄': '$', '％': '%', '＆': '&', '＇': "'", '＊': '*', '＋': '+',
+            '－': '-', '．': '.', '／': '/', '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
+            '５': '5', '６': '6', '７': '7', '８': '8', '９': '9', '＜': '<', '＝': '=', '＞': '>',
+            '＠': '@', 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F', 'Ｇ': 'G',
+            'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L', 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O',
+            'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R', 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W',
+            'Ｘ': 'X', 'Ｙ': 'Y', 'Ｚ': 'Z', '［': '[', '＼': '\\', '］': ']', '＾': '^', '＿': '_',
+            '｀': '`', 'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f', 'ｇ': 'g',
+            'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l', 'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o',
+            'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r', 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w',
+            'ｘ': 'x', 'ｙ': 'y', 'ｚ': 'z', '｛': '{', '｜': '|', '｝': '}', '～': '~'
+        }
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        start = time.time()
+        sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
+        logger.info(f"fileName: {sample[self.filename_key]}, "
+                    f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
+        return sample
+
+    def _full_width_character_filter(self, input_data: str):
+        res = []
+        for input_str in input_data.split('\n'):
+            res.append("".join(self._full_to_half_dict.get(char, char) for char in input_str))
+        return '\n'.join(res)