init datamate
runtime/ops/mapper/extra_space_cleaner/__init__.py (new file)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ExtraSpaceCleaner',
                          module_path="ops.mapper.extra_space_cleaner.process")
runtime/ops/mapper/extra_space_cleaner/metadata.yml (new file)
@@ -0,0 +1,17 @@
name: '多余空格去除'
name_en: 'Redundant Space Removal'
description: '移除文档首尾、句中或标点符号附近多余空格和 tab 等。'
description_en: 'Removes redundant spaces and tabs at the beginning and end of documents,
  in sentences, or near punctuation.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExtraSpaceCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
  after: '人工智能的研究历史有着一条从以“推理”为重点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。'
inputs: 'text'
outputs: 'text'
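
For a quick sanity check of the descriptor above, a small script can load it and confirm the fields are present. This is an illustrative sketch only, assuming PyYAML is installed; it is not part of the commit, and the path is the one introduced here.

# Illustrative sketch (not part of this commit): load metadata.yml and
# verify the fields shown above are present. Assumes PyYAML is installed.
import yaml

with open('runtime/ops/mapper/extra_space_cleaner/metadata.yml', encoding='utf-8') as f:
    meta = yaml.safe_load(f)

# Check only the keys that appear in the file above.
for key in ('name', 'name_en', 'description', 'description_en', 'language',
            'vendor', 'raw_id', 'version', 'types', 'modal', 'effect',
            'inputs', 'outputs'):
    assert key in meta, f'metadata.yml is missing the field: {key}'

print(meta['raw_id'], meta['version'], meta['types'], meta['modal'])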
runtime/ops/mapper/extra_space_cleaner/process.py (new file)
@@ -0,0 +1,69 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: remove extra spaces
Create: 2025/01/13
"""
import re
import time
from pathlib import Path
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class ExtraSpaceCleaner(Mapper):
    """Remove extra spaces and blank lines, including leading and trailing spaces and tabs.

    Note: before extra spaces are removed, every space in the document is normalized to \u0020.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Match uncommon Unicode space characters in the document
        self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029'
                                    ' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]')
        self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
        self.escaped_special_chars = self._get_escaped_special_chars()  # load the punctuation list
        # Match runs of two or more spaces
        extra_space_pattern = r" {2,}"
        # Match mixed runs of spaces and newlines
        extra_line_pattern = r"( |\n){2,}"
        # Match extra spaces between Chinese characters or punctuation
        extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \
                                         + self.escaped_special_chars + r"])"
        self.extra_space_re_compile = re.compile(extra_space_pattern)
        self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern)
        self.extra_line_re_compile = re.compile(extra_line_pattern)
        self.white_space_pattern_compile = re.compile(self.white_space_pattern)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:.6f} s")
        return sample

    def _get_escaped_special_chars(self) -> str:
        with open(self._file_path, 'r', encoding='utf-8') as f:
            self._special_token = f.read().splitlines()
        res = ''.join([re.escape(char) for char in self._special_token])  # escape each special character and join them
        return res

    def _clean_extra_space(self, input_data: str) -> str:
        # Normalize uncommon Unicode spaces, e.g. \u2008, to a regular half-width space
        input_data = self.white_space_pattern_compile.sub('\u0020', input_data)
        # Remove extra spaces and tabs at the start and end of the document, within sentences, and near punctuation
        input_data = input_data.strip()
        # Strip leading and trailing whitespace from every line
        text = "\n".join([line.strip() for line in input_data.split("\n")])
        # Wrap the text in sentinel brackets so the look-around pattern also matches at the boundaries
        text = ''.join(['【', text, '】'])
        # Collapse consecutive spaces into a single regular space
        remove_extra_space = self.extra_space_re_compile.sub("\u0020", text)
        # Remove spaces between Chinese characters and punctuation
        remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space)
        # Collapse consecutive newlines
        remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese)
        # Strip the sentinel brackets before returning
        return remove_duplicate_line[1:-1]
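
As a quick reference for how the steps above combine, the following standalone sketch reproduces the same cleaning pipeline with plain re, using a shortened punctuation set hard-coded in place of resources/special_token.txt. It is only an illustration, not the operator itself; run against the 'before' example from metadata.yml, it produces the 'after' string.

# Illustrative sketch (not part of this commit): the same cleaning steps as
# ExtraSpaceCleaner._clean_extra_space, with a shortened, hard-coded
# punctuation subset instead of resources/special_token.txt.
import re

WHITE_SPACE = re.compile('[\u00A0\u1680\u2000-\u200D\u2028\u2029\u202F\u205F\u3000\u180E\u2060\uFEFF]')
PUNCT = re.escape('“”‘’,。、:;!?《》【】')  # subset of special_token.txt
EXTRA_SPACE = re.compile(r' {2,}')
EXTRA_LINE = re.compile(r'( |\n){2,}')
SPACE_IN_CJK = re.compile(r'(?<=[\u4e00-\u9fa5' + PUNCT + r']) +(?=[\u4e00-\u9fa5' + PUNCT + r'])')


def clean_extra_space(text: str) -> str:
    text = WHITE_SPACE.sub('\u0020', text)                  # normalize uncommon Unicode spaces
    text = '\n'.join(line.strip() for line in text.strip().split('\n'))
    text = '【' + text + '】'                                # sentinel brackets for the look-arounds
    text = EXTRA_SPACE.sub('\u0020', text)                  # collapse runs of spaces
    text = SPACE_IN_CJK.sub('', text)                       # drop spaces between CJK chars / punctuation
    text = EXTRA_LINE.sub('\n', text)                       # collapse mixed space/newline runs
    return text[1:-1]                                       # strip the sentinels


before = ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
print(clean_extra_space(before))  # matches the 'after' string in metadata.yml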
runtime/ops/mapper/extra_space_cleaner/resources/special_token.txt (new file)
@@ -0,0 +1,53 @@
~
·
!
@
#
¥
%
…
&
*
(
)
—
+
-
=
{
}
|
【
】
、
:
“
”
‘
’
;
《
》
?
,
。
`
!
$
^
(
)
_
[
]
\
:
"
;
'
<
>
?
,
/
.