init datamate
runtime/ops/mapper/extra_space_cleaner/__init__.py (new file)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ExtraSpaceCleaner',
                          module_path="ops.mapper.extra_space_cleaner.process")
runtime/ops/mapper/extra_space_cleaner/metadata.yml (new file)
@@ -0,0 +1,17 @@
name: '多余空格去除'
name_en: 'Redundant Space Removal'
description: '移除文档首尾、句中或标点符号附近多余空格和 tab 等。'
description_en: 'Removes redundant spaces and tabs at the beginning and end of documents,
  in sentences, or near punctuation.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExtraSpaceCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
  after: '人工智能的研究历史有着一条从以“推理”为重点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。'
inputs: 'text'
outputs: 'text'
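
For a quick sanity check of the descriptor above, a small script can load it and confirm the fields are present. This is an illustrative sketch only, assuming PyYAML is installed; it is not part of the commit, and the path is the one introduced here.

# Illustrative sketch (not part of this commit): load metadata.yml and
# verify the fields shown above are present. Assumes PyYAML is installed.
import yaml

with open('runtime/ops/mapper/extra_space_cleaner/metadata.yml', encoding='utf-8') as f:
    meta = yaml.safe_load(f)

# Check only the keys that appear in the file above.
for key in ('name', 'name_en', 'description', 'description_en', 'language',
            'vendor', 'raw_id', 'version', 'types', 'modal', 'effect',
            'inputs', 'outputs'):
    assert key in meta, f'metadata.yml is missing the field: {key}'

print(meta['raw_id'], meta['version'], meta['types'], meta['modal'])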
runtime/ops/mapper/extra_space_cleaner/process.py (new file)
@@ -0,0 +1,69 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: remove extra spaces
Create: 2025/01/13
"""
import re
import time
from pathlib import Path
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class ExtraSpaceCleaner(Mapper):
    """Remove extra spaces and blank lines, including leading and trailing spaces and tabs.

    Note: before extra spaces are removed, every space in the document is normalized to \u0020.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Match uncommon Unicode space characters in the document
        self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029'
                                    ' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]')
        self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
        self.escaped_special_chars = self._get_escaped_special_chars()  # load the punctuation list
        # Match runs of two or more spaces
        extra_space_pattern = r" {2,}"
        # Match mixed runs of spaces and newlines
        extra_line_pattern = r"( |\n){2,}"
        # Match extra spaces between Chinese characters or punctuation
        extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \
                                         + self.escaped_special_chars + r"])"
        self.extra_space_re_compile = re.compile(extra_space_pattern)
        self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern)
        self.extra_line_re_compile = re.compile(extra_line_pattern)
        self.white_space_pattern_compile = re.compile(self.white_space_pattern)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:.6f} s")
        return sample

    def _get_escaped_special_chars(self) -> str:
        with open(self._file_path, 'r', encoding='utf-8') as f:
            self._special_token = f.read().splitlines()
        res = ''.join([re.escape(char) for char in self._special_token])  # escape each special character and join them
        return res

    def _clean_extra_space(self, input_data: str) -> str:
        # Normalize uncommon Unicode spaces, e.g. \u2008, to a regular half-width space
        input_data = self.white_space_pattern_compile.sub('\u0020', input_data)
        # Remove extra spaces and tabs at the start and end of the document, within sentences, and near punctuation
        input_data = input_data.strip()
        # Strip leading and trailing whitespace from every line
        text = "\n".join([line.strip() for line in input_data.split("\n")])
        # Wrap the text in sentinel brackets so the look-around pattern also matches at the boundaries
        text = ''.join(['【', text, '】'])
        # Collapse consecutive spaces into a single regular space
        remove_extra_space = self.extra_space_re_compile.sub("\u0020", text)
        # Remove spaces between Chinese characters and punctuation
        remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space)
        # Collapse consecutive newlines
        remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese)
        # Strip the sentinel brackets before returning
        return remove_duplicate_line[1:-1]
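
As a quick reference for how the steps above combine, the following standalone sketch reproduces the same cleaning pipeline with plain re, using a shortened punctuation set hard-coded in place of resources/special_token.txt. It is only an illustration, not the operator itself; run against the 'before' example from metadata.yml, it produces the 'after' string.

# Illustrative sketch (not part of this commit): the same cleaning steps as
# ExtraSpaceCleaner._clean_extra_space, with a shortened, hard-coded
# punctuation subset instead of resources/special_token.txt.
import re

WHITE_SPACE = re.compile('[\u00A0\u1680\u2000-\u200D\u2028\u2029\u202F\u205F\u3000\u180E\u2060\uFEFF]')
PUNCT = re.escape('“”‘’,。、:;!?《》【】')  # subset of special_token.txt
EXTRA_SPACE = re.compile(r' {2,}')
EXTRA_LINE = re.compile(r'( |\n){2,}')
SPACE_IN_CJK = re.compile(r'(?<=[\u4e00-\u9fa5' + PUNCT + r']) +(?=[\u4e00-\u9fa5' + PUNCT + r'])')


def clean_extra_space(text: str) -> str:
    text = WHITE_SPACE.sub('\u0020', text)                  # normalize uncommon Unicode spaces
    text = '\n'.join(line.strip() for line in text.strip().split('\n'))
    text = '【' + text + '】'                                # sentinel brackets for the look-arounds
    text = EXTRA_SPACE.sub('\u0020', text)                  # collapse runs of spaces
    text = SPACE_IN_CJK.sub('', text)                       # drop spaces between CJK chars / punctuation
    text = EXTRA_LINE.sub('\n', text)                       # collapse mixed space/newline runs
    return text[1:-1]                                       # strip the sentinels


before = ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
print(clean_extra_space(before))  # matches the 'after' string in metadata.yml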
runtime/ops/mapper/extra_space_cleaner/resources/special_token.txt (new file)
@@ -0,0 +1,53 @@
~
·
!
@
#
¥
%
…
&
*
(
)
—
+
-
=
{
}
|
【
】
、
:
“
”
‘
’
;
《
》
?
,
。
`
!
$
^
(
)
_
[
]
\
:
"
;
'
<
>
?
,
/
.