Files
hhhhsc701 d59c167da4 算子将抽取与落盘固定到流程中 (#134)
* feature: 将抽取动作移到每一个算子中

* feature: 落盘算子改为默认执行

* feature: 优化前端展示

* feature: 使用pyproject管理依赖
2025-12-05 17:26:29 +08:00

71 lines
3.2 KiB
Python

#!/user/bin/python
# -*- coding: utf-8 -*-
"""
Description: 多余空格去除
Create: 2025/01/13
"""
import re
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class ExtraSpaceCleaner(Mapper):
"""去除多余空格、多余空行,包括文档首尾空格、首尾tab
【注意】去除多余空格前,会先将文档中所有空格规范化为\u0020
"""
def __init__(self, *args, **kwargs):
# 匹配文档中非常见的unicode 空格
super().__init__(*args, **kwargs)
self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029'
' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]')
self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
self.escaped_special_chars = self._get_escaped_special_chars() # 加载标点符号
# 匹配文章中,连续多个空格
extra_space_pattern = r" {2,}"
# 匹配多个空格、换行符混排情况
extra_line_pattern = r"( |\n){2,}"
# 匹配中文、符号间多余空格
extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \
+ self.escaped_special_chars + r"])"
self.extra_space_re_compile = re.compile(extra_space_pattern)
self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern)
self.extra_line_re_compile = re.compile(extra_line_pattern)
self.white_space_pattern_compile = re.compile(self.white_space_pattern)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
return sample
def _get_escaped_special_chars(self) -> str:
with open(self._file_path, 'r', encoding='utf-8') as f:
self._special_token = f.read().splitlines()
res = ''.join([re.escape(char) for char in self._special_token]) # 将特殊字符转义并拼接成字符串
return res
def _clean_extra_space(self, input_data: str) -> str:
# 将文档中非常见的 unicode 空格,如 u2008,转换为正常空格(半角空格)
input_data = self.white_space_pattern_compile.sub('\u0020', input_data)
# 移除文档首尾、句中或标点符号附近多余空格和 tab
input_data = input_data.strip()
# 逐行移除首尾空格
text = "\n".join([line.strip() for line in input_data.split("\n")])
text = ''.join(['', text, ''])
# 连续空格替换为一个正常空格
remove_extra_space = self.extra_space_re_compile.sub("\u0020", text)
# 去除中文、符号间的空格
remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space)
# 去除连续换行符
remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese)
return remove_duplicate_line[1:-1]