算子将抽取与落盘固定到流程中 (#134)

* feature: 将抽取动作移到每一个算子中

* feature: 落盘算子改为默认执行

* feature: 优化前端展示

* feature: 使用pyproject管理依赖
This commit is contained in:
hhhhsc701
2025-12-05 17:26:29 +08:00
committed by GitHub
parent 744d15ba24
commit d59c167da4
70 changed files with 289 additions and 539 deletions

View File

@@ -30,6 +30,7 @@ class ContentCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Apply the content filter to the sample's text and log the time taken.

    ``self.read_file_first(sample)`` is invoked before the text is touched;
    afterwards ``sample[self.text_key]`` is overwritten with whatever
    ``self._content_filter`` returns for the current text.

    Args:
        sample: Record being processed; it is mutated in place.

    Returns:
        The same ``sample`` object that was passed in.
    """
    # NOTE(review): read_file_first is defined outside this view; going by the
    # commit description it performs the extraction step that used to be a
    # separate operator -- confirm against the base class.
    began_at = time.time()
    self.read_file_first(sample)
    sample[self.text_key] = self._content_filter(sample[self.text_key])
    # ":6f" is a minimum width of 6 with the default six decimal places.
    logger.info(
        f"fileName: {sample[self.filename_key]}, "
        f"method: ContentCleaner costs {time.time() - began_at:6f} s"
    )
    return sample

View File

@@ -64,6 +64,7 @@ class AnonymizedCreditCardNumber(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")

View File

@@ -25,6 +25,7 @@ class EmailNumberCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Pass the sample's text through the e-mail filter and log the duration.

    The method first calls ``self.read_file_first(sample)``, then replaces
    ``sample[self.text_key]`` with the result of
    ``self._email_number_filter`` applied to the existing text.

    Args:
        sample: Record being processed; it is mutated in place.

    Returns:
        The same ``sample`` object that was passed in.
    """
    # NOTE(review): read_file_first lives outside this view; per the commit
    # description it presumably performs the per-operator extraction step --
    # confirm against the base class.
    t0 = time.time()
    self.read_file_first(sample)
    sample[self.text_key] = self._email_number_filter(sample[self.text_key])
    # The operator is reported as "EmailCleaner" in the log, not by class name.
    logger.info(
        f"fileName: {sample[self.filename_key]}, "
        f"method: EmailCleaner costs {time.time() - t0:6f} s"
    )
    return sample

View File

@@ -22,6 +22,7 @@ class EmojiCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Run the emoji filter on the sample's text and log how long it took.

    Steps, in order: call ``self.read_file_first(sample)``, overwrite
    ``sample[self.text_key]`` with the output of ``self._emoji_filter``,
    emit one info-level log line, return the sample.

    Args:
        sample: Record being processed; it is mutated in place.

    Returns:
        The same ``sample`` object that was passed in.
    """
    # NOTE(review): read_file_first is not visible here; the commit description
    # suggests it does the extraction that each operator now runs itself --
    # confirm against the base class.
    started = time.time()
    self.read_file_first(sample)
    sample[self.text_key] = self._emoji_filter(sample[self.text_key])
    logger.info(
        f"fileName: {sample[self.filename_key]}, "
        f"method: EmojiCleaner costs {time.time() - started:6f} s"
    )
    return sample

View File

@@ -41,6 +41,7 @@ class ExtraSpaceCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")

View File

@@ -34,6 +34,7 @@ class FullWidthCharacterCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")

View File

@@ -44,6 +44,7 @@ class GrableCharactersCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")

View File

@@ -64,6 +64,7 @@ class HtmlTagCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
if sample[self.filetype_key] != "xml":
sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
logger.info(

View File

@@ -71,6 +71,7 @@ class AnonymizedIdNumber(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Run the ID-number filter on the sample's text and log the elapsed time.

    ``self.read_file_first(sample)`` runs first; the text stored under
    ``self.text_key`` is then replaced by the return value of
    ``self._id_number_filter``.

    Args:
        sample: Record being processed; it is mutated in place.

    Returns:
        The same ``sample`` object that was passed in.
    """
    # NOTE(review): read_file_first is defined elsewhere; presumably the
    # extraction step mentioned in the commit description -- confirm.
    clock_start = time.time()
    self.read_file_first(sample)
    sample[self.text_key] = self._id_number_filter(sample[self.text_key])
    # The log reports this operator as "IDNumberCleaner".
    logger.info(
        f"fileName: {sample[self.filename_key]}, "
        f"method: IDNumberCleaner costs {time.time() - clock_start:6f} s"
    )
    return sample

View File

@@ -28,6 +28,7 @@ class ImgDenoise(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]

View File

@@ -97,6 +97,7 @@ class ImgDirectionCorrect(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -88,6 +88,7 @@ class ImgBrightness(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -59,6 +59,7 @@ class ImgContrast(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -69,6 +69,7 @@ class ImgSaturation(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -57,6 +57,7 @@ class ImgSharpness(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -25,6 +25,7 @@ class ImgPerspectiveTransformation(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -29,6 +29,7 @@ class ImgResize(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -60,6 +60,7 @@ class ImgShadowRemove(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -21,6 +21,7 @@ class ImgTypeUnify(Mapper):
def execute(self, sample):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
origin_file_type = sample[self.filetype_key]
if origin_file_type == self._setting_type:

View File

@@ -80,6 +80,7 @@ class ImgWatermarkRemove(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -24,6 +24,7 @@ class InvisibleCharactersCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: InvisibleCharactersCleaner costs {time.time() - start:6f} s")

View File

@@ -37,6 +37,7 @@ class AnonymizedIpAddress(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Run the IP-address filter on the sample's text and log the elapsed time.

    After calling ``self.read_file_first(sample)``, the value under
    ``self.text_key`` is replaced with the result of
    ``self._ip_address_filter`` applied to it.

    Args:
        sample: Record being processed; it is mutated in place.

    Returns:
        The same ``sample`` object that was passed in.
    """
    # NOTE(review): read_file_first is outside this view; per the commit
    # description it presumably performs the extraction step -- confirm.
    begin = time.time()
    self.read_file_first(sample)
    sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
    # The log reports this operator as "IPAddressCleaner".
    logger.info(
        f"fileName: {sample[self.filename_key]}, "
        f"method: IPAddressCleaner costs {time.time() - begin:6f} s"
    )
    return sample

View File

@@ -35,6 +35,7 @@ class KnowledgeRelationSlice(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start_time = time.time()
self.read_file_first(sample)
chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)

View File

@@ -36,6 +36,7 @@ class LegendCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Clean the sample's text with ``_clean_html_tag`` and log the duration.

    The method calls ``self.read_file_first(sample)`` first, then stores the
    result of ``self._clean_html_tag`` back under ``self.text_key``.

    Args:
        sample: Record being processed; it is mutated in place.

    Returns:
        The same ``sample`` object that was passed in.
    """
    # NOTE(review): read_file_first is not shown here; the commit description
    # implies it is the extraction step now run inside each operator --
    # confirm against the base class.
    tick = time.time()
    self.read_file_first(sample)
    sample[self.text_key] = self._clean_html_tag(sample[self.text_key])
    logger.info(
        f"fileName: {sample[self.filename_key]}, "
        f"method: LegendCleaner costs {time.time() - tick:6f} s"
    )
    return sample

View File

@@ -37,6 +37,7 @@ class AnonymizedPhoneNumber(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")

View File

@@ -53,6 +53,7 @@ class PoliticalWordCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._political_word_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:6f} s")

View File

@@ -47,8 +47,8 @@ def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: in
paragraph_counts[paragraph_strip] = -1
except Exception as err:
logger.exception(f"fileName: file_name, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
return input_data
# 将去重后的段落重新组合成文本
@@ -63,6 +63,7 @@ class DuplicateSentencesFilter(Filter):
duplicate_th = 5 # 段落重复次数阈值
file_name = sample[self.filename_key]
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
return sample

View File

@@ -56,6 +56,7 @@ class SexualAndViolentWordCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: SexualAndViolentWordCleaner costs {time.time() - start:6f} s")

View File

@@ -61,6 +61,7 @@ class TextToWord(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""将文本信息转换为docx文件流"""
start = time.time()
self.read_file_first(sample)
sample[self.data_key] = self._txt_to_docx(sample[self.text_key]) # 将文字转换为word字符串流
sample[self.text_key] = ""
sample["target_type"] = "docx"

View File

@@ -27,6 +27,7 @@ class TraditionalChineseCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._traditional_chinese_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: TraditionalChinese costs {time.time() - start:6f} s")

View File

@@ -23,6 +23,7 @@ class UnicodeSpaceCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._clean_unicode_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: UnicodeSpaceCleaner costs {time.time() - start:6f} s")

View File

@@ -26,6 +26,7 @@ class AnonymizedUrlCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
    """Run the URL filter on the sample's text and log the elapsed time.

    ``self.read_file_first(sample)`` is called up front; the text under
    ``self.text_key`` is then replaced with the output of
    ``self._url_filter``.

    Args:
        sample: Record being processed; it is mutated in place.

    Returns:
        The same ``sample`` object that was passed in.
    """
    # NOTE(review): read_file_first is defined outside this view; presumably
    # the extraction step described in the commit message -- confirm.
    t_start = time.time()
    self.read_file_first(sample)
    sample[self.text_key] = self._url_filter(sample[self.text_key])
    # The log reports this operator as "UrlCleaner".
    logger.info(
        f"fileName: {sample[self.filename_key]}, "
        f"method: UrlCleaner costs {time.time() - t_start:6f} s"
    )
    return sample

View File

@@ -52,6 +52,7 @@ class XMLTagCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
if sample[self.filetype_key] == "xml":
try: