#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Full-width to half-width character conversion
Create: 2025/01/13
"""
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class FullWidthCharacterCleaner(Mapper):
    """Convert all full-width characters in a document to half-width ones.

    The conversion covers the full-width ASCII variants U+FF01..U+FF5E,
    each of which is replaced by its ASCII counterpart U+0021..U+007E.
    Every other character is passed through unchanged.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Mapper`` and build the lookup table."""
        super().__init__(*args, **kwargs)
        # The full-width ASCII variants mirror printable ASCII at a fixed
        # distance of 0xFEE0 code points, so the table is generated instead
        # of being spelled out entry by entry (which is easy to corrupt).
        # NOTE(review): this covers the whole full-width ASCII range, as the
        # class contract ("all full-width characters") states; confirm that
        # no punctuation was meant to be excluded. U+3000 (ideographic
        # space) lies outside this range and is left unchanged.
        self._full_to_half_dict = {
            chr(code_point): chr(code_point - 0xFEE0)
            for code_point in range(0xFF01, 0xFF5F)
        }

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Replace the sample's text with its half-width form.

        Args:
            sample: Record holding the text under ``self.text_key`` and the
                file name under ``self.filename_key``.

        Returns:
            The same ``sample`` object, with ``sample[self.text_key]``
            overwritten by the converted text.
        """
        start = time.time()
        # Helper inherited from Mapper; presumably makes sure the text is
        # available under ``self.text_key`` -- verify against the base class.
        self.read_file_first(sample)
        sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
        return sample

    def _full_width_character_filter(self, input_data: str):
        """Return ``input_data`` with every mapped character replaced.

        Args:
            input_data: Text to convert; may span multiple lines.

        Returns:
            A new string of the same length in which each character found in
            ``self._full_to_half_dict`` is replaced by its mapped value.
        """
        # Line breaks are never keys of the table, so one translate() pass
        # over the whole text gives the same result as converting line by
        # line. The table is derived from the dict on each call so the dict
        # stays the single source of truth for the mapping.
        return input_data.translate(str.maketrans(self._full_to_half_dict))