You've already forked DataMate
算子将抽取与落盘固定到流程中 (#134)
* feature: 将抽取动作移到每一个算子中
* feature: 落盘算子改为默认执行
* feature: 优化前端展示
* feature: 使用pyproject管理依赖
This commit is contained in:
@@ -33,6 +33,7 @@ class FileWithHighRepeatPhraseRateFilter(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
|
||||
@@ -30,6 +30,7 @@ class FileWithHighRepeatWordRateFilter(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
|
||||
@@ -26,6 +26,7 @@ class FileWithHighSpecialCharRateFilter(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
|
||||
@@ -105,6 +105,7 @@ class ImgAdvertisementImagesCleaner(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
img_bytes = sample[self.data_key]
|
||||
|
||||
@@ -27,6 +27,7 @@ class ImgBlurredImagesCleaner(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
|
||||
@@ -61,6 +61,7 @@ class ImgDuplicatedImagesCleaner(Filter):
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""重复图片去重算子执行入口"""
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
|
||||
img_data = self._duplicate_images_filter(file_name, sample[self.data_key])
|
||||
|
||||
@@ -227,6 +227,7 @@ class ImgSimilarImagesCleaner(Filter):
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""去除相似图片算子执行入口"""
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
img_bytes = sample[self.data_key]
|
||||
data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])
|
||||
|
||||
@@ -150,6 +150,7 @@ class DuplicateFilesFilter(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
file_name = sample[self.filename_key]
|
||||
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
|
||||
sample[self.text_key] = self.deduplicate_files(sample, file_name)
|
||||
|
||||
@@ -90,6 +90,7 @@ class FileWithManySensitiveWordsFilter(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._file_with_many_sensitive_words_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
|
||||
@@ -31,6 +31,7 @@ class FileWithShortOrLongLengthFilter(Filter):
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
self.read_file_first(sample)
|
||||
sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
|
||||
Reference in New Issue
Block a user