You've already forked DataMate
init datamate
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register the operator in the OPERATORS registry: the public module name is
# paired with the dotted path of the module that implements it.
OPERATORS.register_module(
    module_name='FullWidthCharacterCleaner',
    module_path="ops.mapper.full_width_characters_cleaner.process",
)
|
||||
@@ -0,0 +1,18 @@
|
||||
name: '全角转半角'
|
||||
name_en: 'Full-to-Half Width Character'
|
||||
description: '将文档中的所有全角字符转换成半角字符。'
|
||||
description_en: 'Converts all full-width characters in documents to half-width characters.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'FullWidthCharacterCleaner'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: 'Residential and commercial design, site inspections, working drawings,
|
||||
Minicad, renderings.'
|
||||
after: 'Residential and commercial design, site inspections, working drawings, MiniCad,
|
||||
renderings.'
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
46
runtime/ops/mapper/full_width_characters_cleaner/process.py
Normal file
46
runtime/ops/mapper/full_width_characters_cleaner/process.py
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 全角转半角
|
||||
Create: 2025/01/13
|
||||
"""
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class FullWidthCharacterCleaner(Mapper):
    """Convert full-width characters in a document to their half-width forms.

    Only the full-width forms of the characters listed in
    ``_HALF_WIDTH_TARGETS`` are converted. Every other character, including
    the full-width forms of ``! ( ) , : ; ?``, is passed through unchanged.
    """

    # Code-point distance between a full-width form (U+FF01..U+FF5E) and its
    # ASCII counterpart (U+0021..U+007E).
    _FULL_WIDTH_OFFSET = 0xFEE0

    # Half-width characters whose full-width forms are converted.
    _HALF_WIDTH_TARGETS = (
        "\"#$%&'*+-./0123456789<=>@"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
        "abcdefghijklmnopqrstuvwxyz{|}~"
    )

    def __init__(self, *args, **kwargs):
        """Initialise the base operator and build the conversion tables."""
        super().__init__(*args, **kwargs)
        # Keys are derived from code points instead of being typed as
        # literals: a literal full-width key is silently turned into its
        # half-width twin by any Unicode normalisation of this file, which
        # would leave an identity table that converts nothing.
        self._full_to_half_dict = {
            chr(ord(half) + self._FULL_WIDTH_OFFSET): half
            for half in self._HALF_WIDTH_TARGETS
        }
        # Ordinal-keyed table consumed by str.translate.
        self._translation_table = str.maketrans(self._full_to_half_dict)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the text of ``sample`` in place and log the elapsed time.

        Args:
            sample: Mapping that holds the text under ``self.text_key`` and
                the file name under ``self.filename_key``.

        Returns:
            The same ``sample`` object with its text entry replaced by the
            converted text.
        """
        start = time.time()
        sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
        return sample

    def _full_width_character_filter(self, input_data: str) -> str:
        """Return ``input_data`` with the mapped full-width characters converted.

        Newlines are not in the table, so translating the whole string at once
        keeps the line structure exactly as it was.
        """
        return input_data.translate(self._translation_table)
|
||||
Reference in New Issue
Block a user