算子将抽取与落盘固定到流程中 (#134)

* feature: 将抽取动作移到每一个算子中

* feature: 落盘算子改为默认执行

* feature: 优化前端展示

* feature: 使用pyproject管理依赖
This commit is contained in:
hhhhsc701
2025-12-05 17:26:29 +08:00
committed by GitHub
parent 744d15ba24
commit d59c167da4
70 changed files with 289 additions and 539 deletions

View File

@@ -47,8 +47,8 @@ def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: in
paragraph_counts[paragraph_strip] = -1
except Exception as err:
logger.exception(f"fileName: file_name, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
return input_data
# 将去重后的段落重新组合成文本
@@ -63,6 +63,7 @@ class DuplicateSentencesFilter(Filter):
duplicate_th = 5 # 段落重复次数阈值
file_name = sample[self.filename_key]
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
return sample