You've already forked DataMate
算子将抽取与落盘固定到流程中 (#134)
* feature: 将抽取动作移到每一个算子中
* feature: 落盘算子改为默认执行
* feature: 优化前端展示
* feature: 使用pyproject管理依赖
This commit is contained in:
@@ -30,6 +30,7 @@ class ContentCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._content_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -64,6 +64,7 @@ class AnonymizedCreditCardNumber(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -25,6 +25,7 @@ class EmailNumberCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._email_number_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -22,6 +22,7 @@ class EmojiCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._emoji_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -41,6 +41,7 @@ class ExtraSpaceCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -34,6 +34,7 @@ class FullWidthCharacterCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -44,6 +44,7 @@ class GrableCharactersCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -64,6 +64,7 @@ class HtmlTagCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
if sample[self.filetype_key] != "xml":
|
||||
sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
|
||||
logger.info(
|
||||
|
||||
@@ -71,6 +71,7 @@ class AnonymizedIdNumber(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._id_number_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, method: IDNumberCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -28,6 +28,7 @@ class ImgDenoise(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
|
||||
img_bytes = sample[self.data_key]
|
||||
|
||||
|
||||
@@ -97,6 +97,7 @@ class ImgDirectionCorrect(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
img_bytes = sample[self.data_key]
|
||||
|
||||
@@ -88,6 +88,7 @@ class ImgBrightness(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
|
||||
@@ -59,6 +59,7 @@ class ImgContrast(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
|
||||
@@ -69,6 +69,7 @@ class ImgSaturation(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
|
||||
@@ -57,6 +57,7 @@ class ImgSharpness(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
|
||||
@@ -25,6 +25,7 @@ class ImgPerspectiveTransformation(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
|
||||
@@ -29,6 +29,7 @@ class ImgResize(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
img_bytes = sample[self.data_key]
|
||||
|
||||
@@ -60,6 +60,7 @@ class ImgShadowRemove(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
|
||||
@@ -21,6 +21,7 @@ class ImgTypeUnify(Mapper):
|
||||
|
||||
def execute(self, sample):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
origin_file_type = sample[self.filetype_key]
|
||||
if origin_file_type == self._setting_type:
|
||||
|
||||
@@ -80,6 +80,7 @@ class ImgWatermarkRemove(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
img_bytes = sample[self.data_key]
|
||||
|
||||
@@ -24,6 +24,7 @@ class InvisibleCharactersCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: InvisibleCharactersCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -37,6 +37,7 @@ class AnonymizedIpAddress(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, method: IPAddressCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -35,6 +35,7 @@ class KnowledgeRelationSlice(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start_time = time.time()
|
||||
self.read_file_first(sample)
|
||||
|
||||
chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
|
||||
chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)
|
||||
|
||||
@@ -36,6 +36,7 @@ class LegendCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._clean_html_tag(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, method: LegendCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -37,6 +37,7 @@ class AnonymizedPhoneNumber(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -53,6 +53,7 @@ class PoliticalWordCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._political_word_filter(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -47,8 +47,8 @@ def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: in
|
||||
paragraph_counts[paragraph_strip] = -1
|
||||
|
||||
except Exception as err:
|
||||
logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
|
||||
f"filtering duplicate sentences. The error is: {err}")
|
||||
logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
|
||||
f"filtering duplicate sentences. The error is: {err}")
|
||||
return input_data
|
||||
|
||||
# 将去重后的段落重新组合成文本
|
||||
@@ -63,6 +63,7 @@ class DuplicateSentencesFilter(Filter):
|
||||
duplicate_th = 5 # 段落重复次数阈值
|
||||
file_name = sample[self.filename_key]
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
|
||||
logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -56,6 +56,7 @@ class SexualAndViolentWordCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: SexualAndViolentWordCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -61,6 +61,7 @@ class TextToWord(Mapper):
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""将文本信息转换为docx文件流"""
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.data_key] = self._txt_to_docx(sample[self.text_key]) # 将文字转换为word字符串流
|
||||
sample[self.text_key] = ""
|
||||
sample["target_type"] = "docx"
|
||||
|
||||
@@ -27,6 +27,7 @@ class TraditionalChineseCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._traditional_chinese_filter(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: TraditionalChinese costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -23,6 +23,7 @@ class UnicodeSpaceCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._clean_unicode_space(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: UnicodeSpaceCleaner costs {time.time() - start:6f} s")
|
||||
|
||||
@@ -26,6 +26,7 @@ class AnonymizedUrlCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._url_filter(sample[self.text_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, method: UrlCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
@@ -52,6 +52,7 @@ class XMLTagCleaner(Mapper):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
if sample[self.filetype_key] == "xml":
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user