Files
hhhhsc701 d59c167da4 算子将抽取与落盘固定到流程中 (#134)
* feature: 将抽取动作移到每一个算子中

* feature: 落盘算子改为默认执行

* feature: 优化前端展示

* feature: 使用pyproject管理依赖
2025-12-05 17:26:29 +08:00

48 lines
1.3 KiB
Python

# -- encoding: utf-8 --
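"""
Description: Knowledge-relation slicing operator. Splits the sample text into
    overlapping slices and writes the slice list back as a JSON string.
Create: 2023/11/7 9:26
"""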
import json
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
from .knowledge_relation import get_json_list
# Default slice (chunk) length, used when no ``chunk_size`` is configured
CHUNK_SIZE = 500
# Default overlap length between adjacent slices, used when no ``overlap_size`` is configured
OVERLAP_SIZE = 100
class KnowledgeRelationSlice(Mapper):
    """Mapper operator that slices a sample's text for knowledge-relation use.

    The text stored under ``self.text_key`` is handed to ``get_json_list``
    together with the configured slice and overlap lengths. The collection it
    returns is serialized to a JSON string and written back under the same key.
    """

    def __init__(self, *args, **kwargs):
        """Set up the operator and read its slicing configuration.

        Args:
            *args: Forwarded unchanged to ``Mapper.__init__``.
            **kwargs: Forwarded unchanged to ``Mapper.__init__``. Two optional
                keys are also read here: ``chunk_size`` (defaults to
                ``CHUNK_SIZE``) and ``overlap_size`` (defaults to
                ``OVERLAP_SIZE``).
        """
        super().__init__(*args, **kwargs)
        # Only a *missing* key falls back to the module default; a value that
        # was passed explicitly is stored exactly as given.
        self.chunk_size = kwargs.get("chunk_size", CHUNK_SIZE)
        self.overlap_size = kwargs.get("overlap_size", OVERLAP_SIZE)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Slice the sample text and replace it with the JSON-encoded slices.

        Args:
            sample: Sample dict; the entry under ``self.text_key`` is read and
                then overwritten. The dict is modified in place.

        Returns:
            The same ``sample`` object, with ``sample[self.text_key]`` now
            holding the JSON string of the slice list.
        """
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # (or go negative) if the system clock is adjusted while this runs.
        started = time.perf_counter()

        # NOTE(review): read_file_first is not defined in this class; presumably
        # the base operator loads the file content into the sample - confirm.
        self.read_file_first(sample)

        slices = get_json_list(
            sample[self.text_key],
            chunk_size=self.chunk_size,
            overlap_size=self.overlap_size,
        )
        # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
        sample[self.text_key] = json.dumps(slices, ensure_ascii=False)

        cost_time = time.perf_counter() - started
        logger.info(f'Generate knowledgeRelation slice num: {len(slices)}, Cost time: {cost_time} s')
        return sample