You've already forked DataMate
* feature: 将抽取动作移到每一个算子中 * feature: 落盘算子改为默认执行 * feature: 优化前端展示 * feature: 使用pyproject管理依赖
48 lines
2.2 KiB
Python
48 lines
2.2 KiB
Python
#!/usr/bin/python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
Description: Convert full-width characters to half-width characters
|
|
Create: 2025/01/13
|
|
"""
|
|
import time
|
|
from typing import Dict, Any
|
|
|
|
from loguru import logger
|
|
|
|
from datamate.core.base_op import Mapper
|
|
|
|
|
|
class FullWidthCharacterCleaner(Mapper):
    """Convert selected full-width characters in a document to half-width ASCII.

    Only the characters listed in ``_HALF_WIDTH_TARGETS`` are converted. The
    full-width forms of ``! ( ) , : ; ?`` are not in the table and therefore
    pass through unchanged, as does every other character.
    """

    # Half-width (ASCII) characters whose full-width counterparts are converted.
    _HALF_WIDTH_TARGETS = (
        '"#$%&\'*+-./0123456789<=>@'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
        'abcdefghijklmnopqrstuvwxyz{|}~'
    )

    # Full-width forms U+FF01..U+FF5E sit at a fixed distance from their
    # ASCII counterparts U+0021..U+007E.
    _FULL_WIDTH_OFFSET = 0xFEE0

    def __init__(self, *args, **kwargs):
        """Build the full-width -> half-width lookup tables.

        All positional and keyword arguments are forwarded to ``Mapper``.
        """
        super().__init__(*args, **kwargs)
        # Keys are generated from code points instead of being typed as
        # literals, so the table cannot be silently collapsed into an identity
        # mapping by tools that normalise full-width characters.
        self._full_to_half_dict = {
            chr(ord(half) + self._FULL_WIDTH_OFFSET): half
            for half in self._HALF_WIDTH_TARGETS
        }
        # Precomputed once so each document is converted in a single pass.
        self._translation_table = str.maketrans(self._full_to_half_dict)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Replace full-width characters in ``sample[self.text_key]`` in place.

        Args:
            sample: Record being processed; its text is read from and written
                back to ``sample[self.text_key]``.

        Returns:
            The same ``sample`` object with the converted text.
        """
        start = time.time()
        # NOTE(review): read_file_first is inherited; presumably it populates
        # sample[self.text_key] before the conversion - confirm in Mapper.
        self.read_file_first(sample)
        sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
        return sample

    def _full_width_character_filter(self, input_data: str) -> str:
        """Return ``input_data`` with the mapped full-width characters replaced.

        Line breaks are not part of the table, so translating the whole string
        at once gives the same result as converting it line by line.
        """
        return input_data.translate(self._translation_table)