算子:将抽取与落盘固定到流程中 (#134)

* feature: 将抽取动作移到每一个算子中

* feature: 落盘算子改为默认执行

* feature: 优化前端展示

* feature: 使用 pyproject.toml 管理依赖
This commit is contained in:
hhhhsc701
2025-12-05 17:26:29 +08:00
committed by GitHub
parent 744d15ba24
commit d59c167da4
70 changed files with 289 additions and 539 deletions

View File

@@ -33,6 +33,7 @@ class FileWithHighRepeatPhraseRateFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -30,6 +30,7 @@ class FileWithHighRepeatWordRateFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -26,6 +26,7 @@ class FileWithHighSpecialCharRateFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -105,6 +105,7 @@ class ImgAdvertisementImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -27,6 +27,7 @@ class ImgBlurredImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -61,6 +61,7 @@ class ImgDuplicatedImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""重复图片去重算子执行入口"""
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
img_data = self._duplicate_images_filter(file_name, sample[self.data_key])

View File

@@ -227,6 +227,7 @@ class ImgSimilarImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""去除相似图片算子执行入口"""
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
img_bytes = sample[self.data_key]
data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])

View File

@@ -150,6 +150,7 @@ class DuplicateFilesFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
sample[self.text_key] = self.deduplicate_files(sample, file_name)

View File

@@ -90,6 +90,7 @@ class FileWithManySensitiveWordsFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_many_sensitive_words_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -31,6 +31,7 @@ class FileWithShortOrLongLengthFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "