#!/user/bin/python # -*- coding: utf-8 -*- """ Description: 多余空格去除 Create: 2025/01/13 """ import re import time from pathlib import Path from typing import Dict, Any from loguru import logger from datamate.core.base_op import Mapper class ExtraSpaceCleaner(Mapper): """去除多余空格、多余空行,包括文档首尾空格、首尾tab 【注意】去除多余空格前,会先将文档中所有空格规范化为\u0020 """ def __init__(self, *args, **kwargs): # 匹配文档中非常见的unicode 空格 super().__init__(*args, **kwargs) self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029' ' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]') self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt' self.escaped_special_chars = self._get_escaped_special_chars() # 加载标点符号 # 匹配文章中,连续多个空格 extra_space_pattern = r" {2,}" # 匹配多个空格、换行符混排情况 extra_line_pattern = r"( |\n){2,}" # 匹配中文、符号间多余空格 extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \ + self.escaped_special_chars + r"])" self.extra_space_re_compile = re.compile(extra_space_pattern) self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern) self.extra_line_re_compile = re.compile(extra_line_pattern) self.white_space_pattern_compile = re.compile(self.white_space_pattern) def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]: start = time.time() sample[self.text_key] = self._clean_extra_space(sample[self.text_key]) logger.info( f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s") return sample def _get_escaped_special_chars(self) -> str: with open(self._file_path, 'r', encoding='utf-8') as f: self._special_token = f.read().splitlines() res = ''.join([re.escape(char) for char in self._special_token]) # 将特殊字符转义并拼接成字符串 return res def _clean_extra_space(self, input_data: str) -> str: # 将文档中非常见的 unicode 空格,如 u2008,转换为正常空格(半角空格) input_data = self.white_space_pattern_compile.sub('\u0020', input_data) # 移除文档首尾、句中或标点符号附近多余空格和 tab input_data = input_data.strip() # 逐行移除首尾空格 text = "\n".join([line.strip() for line in input_data.split("\n")]) text = ''.join(['【', text, '】']) # 连续空格替换为一个正常空格 remove_extra_space = self.extra_space_re_compile.sub("\u0020", text) # 去除中文、符号间的空格 remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space) # 去除连续换行符 remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese) return remove_duplicate_line[1:-1]