init datamate
6 runtime/ops/mapper/knowledge_relation_slice/__init__.py Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='KnowledgeRelationSlice',
                          module_path="ops.mapper.knowledge_relation_slice.process")
108 runtime/ops/mapper/knowledge_relation_slice/graph_sim_func.py Normal file
@@ -0,0 +1,108 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-


import math
from multiprocessing import Pool, cpu_count

from six import iteritems
from six.moves import range
from loguru import logger

PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25


def effective_n_jobs(n_jobs):
    if n_jobs == 0:
        raise ValueError('n_jobs == 0 in Parallel has no meaning')
    elif n_jobs is None:
        return 1
    elif n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    return n_jobs


class SimilarityAlgBM25(object):

    def __init__(self, corpus_docs):
        self.corpus_files_size = 0
        self.avg_dl = 0
        self.doc_file_freqs = []
        self.idf_dict = {}
        self.doc_len = []
        self._initialize(corpus_docs)

    def get_sim_score(self, document, index):
        score = 0
        doc_freqs = self.doc_file_freqs[index]
        for word in document:
            if word not in doc_freqs:
                continue
            try:
                score += (self.idf_dict[word] * doc_freqs[word] * (PARAM_K1 + 1)
                          / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avg_dl)))
            except KeyError:
                logger.warning(f'key not found in idf_dict: {word}')
        return score

    def get_sim_scores(self, document):
        scores = []
        for index in range(self.corpus_files_size):
            cur_score = self.get_sim_score(document, index)
            scores.append(cur_score)
        return scores

    def get_scores_bow(self, document):
        scores = []
        for index in range(self.corpus_files_size):
            score = self.get_sim_score(document, index)
            if score > 0:
                scores.append((index, score))
        return scores

    def _initialize(self, corpus_files):
        """
        Calculates frequencies of terms in documents and in corpus_files.
        Also computes inverse document frequencies.
        """
        nd = {}  # word -> number of documents with word
        num_doc = 0
        for document_file in corpus_files:
            self.corpus_files_size += 1
            self.doc_len.append(len(document_file))
            num_doc += len(document_file)

            frequencies_dict = {}
            for word in document_file:
                if word not in frequencies_dict:
                    frequencies_dict[word] = 0
                frequencies_dict[word] += 1
            self.doc_file_freqs.append(frequencies_dict)

            for word, _ in iteritems(frequencies_dict):
                if word not in nd:
                    nd[word] = 0
                nd[word] += 1

        self.avg_dl = float(num_doc) / self.corpus_files_size
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0

        negative_idfs_list = []
        for word, freq in iteritems(nd):
            idf = math.log(self.corpus_files_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf_dict[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs_list.append(word)
        self.average_idf = float(idf_sum) / len(self.idf_dict)

        eps = EPSILON * self.average_idf
        for word in negative_idfs_list:
            self.idf_dict[word] = eps
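For reference, the score accumulated in get_sim_score above is the standard BM25 weight. In the notation below, N is corpus_files_size, n(w) is nd[word], f(w, d) is doc_freqs[word], |d| is doc_len[index], and avgdl is avg_dl:

\mathrm{score}(q, d) = \sum_{w \in q \cap d} \mathrm{idf}(w)\,\frac{f(w, d)\,(k_1 + 1)}{f(w, d) + k_1\left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)},
\qquad
\mathrm{idf}(w) = \ln\frac{N - n(w) + 0.5}{n(w) + 0.5}

with k_1 = PARAM_K1 = 1.5 and b = PARAM_B = 0.75; any word whose idf comes out negative has it replaced by EPSILON times the average idf.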
184 runtime/ops/mapper/knowledge_relation_slice/knowledge_relation.py Normal file
@@ -0,0 +1,184 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-


__all__ = ['build_llm_prompt', 'get_json_list']

import math

import jieba
from loguru import logger

from . import graph_sim_func as bm25
from .knowledge_slice import TextSegmentationOperator

def build_llm_prompt(text):
    prompt = """
===
<Role>:
You are an expert at writing question-answer (QA) pairs. Given some content, you are good at producing multiple accurate, complete and detailed QA pairs from it.

===
<Instructions>:
- Based on the known information (context), generate multiple QA pairs accurately and in detail.
- If the answer of a generated QA pair is shorter than 10 Chinese characters, discard that pair.
- Make sure the answer of every QA pair is part of the known information and that, taken together, the answers reconstruct the known information with nothing left out.
- Generate QA pairs only from the known information; answers must be detailed and must not invent content that is absent from the known information.
- Do not sort the generated QA pairs, and do not put numeric indices before or after Q: or A:.
- Q: must be an interrogative sentence ending with a question mark; A: must be a declarative sentence ending with a period, and the answer must be complete.
- Output format:
Q:......
A:......

===
<task>
Subject to the conditions above, given the context: '''{}'''
the generated QA pairs are:

"""

    return prompt.format(text)


class KnowledgeSlice:
    # edatamate slicing algorithm plugin
    def __init__(self, file_text, chunk_size=500, overlap_size=100):
        self.file_text = file_text
        self.slice_op = TextSegmentationOperator(chunk_size, overlap_size)

    def execute(self):
        try:
            chunks = self.slice_op.process(self.file_text)
        except Exception as err:
            logger.exception(f"split text failed, error is: {err}")
            chunks = []

        return chunks


class BM25Model:
    def __init__(self, data_list):
        self.data_list = data_list
        self.corpus = self.load_corpus()

    def bm25_similarity(self, query, num_best=1):
        query = jieba.lcut(query)
        bm = bm25.SimilarityAlgBM25(self.corpus)
        scores = bm.get_sim_scores(query)
        id_score = [(i, score) for i, score in enumerate(scores)]
        id_score.sort(key=lambda e: e[1], reverse=True)

        return id_score[0: num_best]

    def load_corpus(self):
        corpus = [jieba.lcut(data) for data in self.data_list]

        return corpus
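As a quick illustration of the helper above (the candidate strings and query are made-up samples, not from the repository), BM25Model ranks candidate slices against a query like this:

# Illustrative sketch only: rank made-up candidate slices against a query.
candidates = ["知识库关系切片算法的介绍", "今天的天气很好"]
model = BM25Model(candidates)
print(model.bm25_similarity("什么是知识库切片", num_best=1))
# -> [(index_into_candidates, bm25_score)], best match first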


class KnowledgeGraph:
    # class for document segmentation and creating relations between knowledge slices
    def __init__(self, corpus_file_string, chunk_size=500, overlap_size=100, kg_relation=True):
        self.corpus_file_string = corpus_file_string
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size
        self.kg_relation = kg_relation
        self.slicing_corpus = []
        self.knowledge_slice = KnowledgeSlice(self.corpus_file_string, self.chunk_size, self.overlap_size)

    @staticmethod
    def update_gallery_list(gallery_list, iterated_dict):
        # collect the gallery entries whose indices are not yet in iterated_dict
        gallery_list_update = []
        gallery_list_index = []
        for i, _ in enumerate(gallery_list):
            if i not in iterated_dict:
                gallery_list_update.append(gallery_list[i])
                gallery_list_index.append(i)

        return gallery_list_update, gallery_list_index

    def document_slicing(self):
        json_list = []
        all_slices_info = self.knowledge_slice.execute()

        for _, item in enumerate(all_slices_info):
            json_list.append({
                "slice_data": item
            })

        self.slicing_corpus = json_list

    def build_knowledge_relation(self, slicing_corpus_list):
        # build a knowledge relation for each paragraph
        if not self.kg_relation:
            return slicing_corpus_list
        iterated_dict = {}
        kr_result_json_list = []
        gallery_list = []
        kr_relation_list = []

        if len(slicing_corpus_list) < 3:
            return slicing_corpus_list

        for _, item in enumerate(slicing_corpus_list):
            gallery_list.append(item['slice_data'])

        for k, item in enumerate(slicing_corpus_list):
            if k not in iterated_dict:
                iterated_dict[k] = 1
                cur_gallery_list, cur_gallery_src_index = self.update_gallery_list(gallery_list, iterated_dict)
                if len(cur_gallery_list) < 1:
                    kr_result_json_list.append({
                        "slice_data": item['slice_data']
                    })
                    return kr_result_json_list
                bm25_class = BM25Model(cur_gallery_list)
                id_scores = bm25_class.bm25_similarity(item['slice_data'], 1)
                kr_result_doc = item['slice_data'] + cur_gallery_list[id_scores[0][0]]
                kr_result_json_list.append({
                    "slice_data": kr_result_doc
                })
                if cur_gallery_src_index[id_scores[0][0]] not in iterated_dict:
                    iterated_dict[cur_gallery_src_index[id_scores[0][0]]] = 1
            else:
                continue

        return kr_result_json_list

    def build_graph_efficiently(self, search_space_size=50):
        # build the knowledge relations in an efficient way, chunk by chunk
        knowledge_total_num = len(self.slicing_corpus)
        knowledge_chunk_num = math.ceil(knowledge_total_num / search_space_size)
        knowledge_relation_result = []

        for i in range(0, knowledge_chunk_num):
            cur_max_index = (i + 1) * search_space_size
            if cur_max_index > knowledge_total_num:
                corpus_list = self.slicing_corpus[i * search_space_size:]
            else:
                corpus_list = self.slicing_corpus[i * search_space_size:cur_max_index]
            # build the knowledge relations for this chunk
            cur_knowledge_relation_result = self.build_knowledge_relation(corpus_list)
            knowledge_relation_result.extend(cur_knowledge_relation_result)

        return knowledge_relation_result

    def knowledge_corpus_list_json(self):
        # process the corpus and return the structured information as a json_list
        self.document_slicing()
        kr_result_list_json = self.build_graph_efficiently()

        return kr_result_list_json


def get_json_list(txt_string, chunk_size=500, overlap_size=100, kg_relation=True):
    if len(txt_string) > 0:
        kg_extract = KnowledgeGraph(txt_string, chunk_size, overlap_size, kg_relation)
        kr_result_json_list = kg_extract.knowledge_corpus_list_json()
    else:
        kr_result_json_list = []

    return kr_result_json_list
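A minimal sketch of how this module could be exercised on its own; the import path mirrors the module_path registered in __init__.py, and the sample text and sizes are made up:

# Hypothetical standalone run of get_json_list (sample text and parameters are illustrative).
from ops.mapper.knowledge_relation_slice.knowledge_relation import get_json_list

doc = "第一章介绍了知识库切片。" * 30 + "第二章介绍了BM25相似度。" * 30
slices = get_json_list(doc, chunk_size=200, overlap_size=50, kg_relation=True)
for item in slices[:3]:
    # each element is {"slice_data": <slice text, possibly concatenated with its best BM25 match>}
    print(len(item["slice_data"]))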
23 runtime/ops/mapper/knowledge_relation_slice/knowledge_slice.py Normal file
@@ -0,0 +1,23 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-


from typing import List

from loguru import logger
from datamate.common.utils.text_splitter import TextSplitter


class TextSegmentationOperator:
    def __init__(self, chunk_size, chunk_overlap):
        try:
            self.text_splitter = TextSplitter(-1, chunk_size, chunk_overlap)
        except Exception as err:
            logger.exception(f"init text splitter failed, error is: {err}")
            raise err

    def process(self, input_data: str) -> List[str]:
        if input_data.strip() == "":
            logger.info("input text is empty, return empty chunks.")
            return []
        return self.text_splitter.split_text(input_data)
16 runtime/ops/mapper/knowledge_relation_slice/metadata.yml Normal file
@@ -0,0 +1,16 @@
name: '知识库关系切片'
name_en: 'Knowledge base relationship slicing'
description: '知识库关系切片'
description_en: 'Knowledge base relationship slicing.'
language: 'python'
vendor: 'huawei'
raw_id: 'KnowledgeRelationSlice'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
46 runtime/ops/mapper/knowledge_relation_slice/process.py Normal file
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

"""
Description:
Create: 2023/11/7 9:26
"""
import json
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper

from .knowledge_relation import get_json_list

# slice (chunk) length
CHUNK_SIZE = 500
# overlap length between adjacent slices
OVERLAP_SIZE = 100


class KnowledgeRelationSlice(Mapper):
    def __init__(self, *args, **kwargs):
        super(KnowledgeRelationSlice, self).__init__(*args, **kwargs)
        if 'chunk_size' not in kwargs:
            self.chunk_size = CHUNK_SIZE
        else:
            self.chunk_size = kwargs.get("chunk_size")

        if 'overlap_size' not in kwargs:
            self.overlap_size = OVERLAP_SIZE
        else:
            self.overlap_size = kwargs.get("overlap_size")

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start_time = time.time()

        chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
        chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)
        sample[self.text_key] = chunk_item_json

        cost_time = time.time() - start_time
        logger.info(f'Generate knowledgeRelation slice num: {len(chunk_item)}, Cost time: {cost_time} s')

        return sample
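A rough sketch of driving the mapper directly in a test; the "text" value for text_key and the layout of the sample dict are assumptions based on the Mapper usage above, not something this commit defines:

# Hypothetical direct use of the operator (assumes the Mapper base class resolves text_key to "text").
from ops.mapper.knowledge_relation_slice.process import KnowledgeRelationSlice

op = KnowledgeRelationSlice(chunk_size=300, overlap_size=50)
sample = {"text": "一段较长的中文文档内容。" * 100}  # made-up input
result = op.execute(sample)
print(result["text"][:200])  # JSON string like [{"slice_data": "..."}, ...]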