You've already forked DataMate
init datamate
This commit is contained in:
108
runtime/ops/mapper/knowledge_relation_slice/graph_sim_func.py
Normal file
108
runtime/ops/mapper/knowledge_relation_slice/graph_sim_func.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/python3.9
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import math
|
||||
from multiprocessing import Pool, cpu_count
|
||||
|
||||
from six import iteritems
|
||||
from six.moves import range
|
||||
from loguru import logger
|
||||
|
||||
# k1 constant of the BM25 expression in SimilarityAlgBM25.get_sim_score:
# appears as tf * (k1 + 1) in the numerator and scales the length term in the denominator.
PARAM_K1 = 1.5
|
||||
# b constant of the BM25 expression in SimilarityAlgBM25.get_sim_score:
# weights the doc_len / avg_dl ratio inside the denominator (1 - b + b * doc_len / avg_dl).
PARAM_B = 0.75
|
||||
# Multiplied by the average IDF in SimilarityAlgBM25._initialize to obtain the
# value that replaces every negative IDF.
EPSILON = 0.25
|
||||
|
||||
|
||||
def effective_n_jobs(n_jobs):
    """Turn an ``n_jobs`` request into a concrete number of workers.

    Args:
        n_jobs: ``None``, or a non-zero number. Negative values count back
            from the CPU count: ``-1`` resolves to ``cpu_count()``, ``-2``
            to ``cpu_count() - 1``, and so on.

    Returns:
        ``1`` when ``n_jobs`` is ``None``; ``n_jobs`` itself when it is
        positive; otherwise ``cpu_count() + 1 + n_jobs`` clamped to at
        least ``1``.

    Raises:
        ValueError: if ``n_jobs`` equals 0.
    """
    if n_jobs == 0:
        raise ValueError('n_jobs == 0 in Parallel has no meaning')

    if n_jobs is None:
        return 1

    if n_jobs < 0:
        # Wrap around the CPU count, but never resolve to fewer than one worker.
        wrapped = cpu_count() + 1 + n_jobs
        return max(wrapped, 1)

    return n_jobs
|
||||
|
||||
|
||||
class SimilarityAlgBM25(object):
    """BM25 scorer built over a corpus of tokenized documents.

    On construction the corpus is scanned once to collect, per document, its
    length and term frequencies, and, per word, an inverse document frequency
    (IDF). Queries are then scored against individual corpus documents by
    index.

    Attributes set by the constructor:
        corpus_files_size: number of documents seen in the corpus.
        avg_dl: average document length (0 for an empty corpus).
        doc_file_freqs: list of ``{word: count}`` dicts, one per document.
        idf_dict: ``{word: idf}`` for every word seen in the corpus.
        doc_len: list of document lengths, aligned with ``doc_file_freqs``.
        average_idf: mean of the raw IDF values (0 when no word was seen).
    """

    def __init__(self, corpus_docs):
        """Index ``corpus_docs``.

        Args:
            corpus_docs: iterable of documents; each document must support
                ``len()`` and iteration over its (hashable) words.
        """
        self.corpus_files_size = 0
        self.avg_dl = 0
        self.doc_file_freqs = []
        self.idf_dict = {}
        self.doc_len = []
        # Defined up front so the attribute exists even for an empty corpus.
        self.average_idf = 0
        self._initialize(corpus_docs)

    def get_sim_score(self, document, index):
        """Return the BM25 score of ``document`` against corpus entry ``index``.

        Args:
            document: iterable of query words; repeated words contribute
                once per occurrence.
            index: position of the corpus document to score against.

        Returns:
            The accumulated score; ``0`` when no query word occurs in the
            corpus document.
        """
        score = 0
        doc_freqs = self.doc_file_freqs[index]
        if not doc_freqs:
            # An empty corpus document cannot match anything; returning here
            # also avoids dividing by a zero average length below.
            return score

        # Length normalisation depends only on the corpus document, so it is
        # computed once rather than once per query word.
        length_norm = PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avg_dl)
        for word in document:
            if word not in doc_freqs:
                continue
            try:
                idf = self.idf_dict[word]
            except KeyError:
                # loguru fills "{}" placeholders from the positional
                # arguments; without one the word would be dropped.
                logger.warning('key not found in idf_dict: {}', word)
                continue
            freq = doc_freqs[word]
            score += idf * freq * (PARAM_K1 + 1) / (freq + length_norm)
        return score

    def get_sim_scores(self, document):
        """Return the scores of ``document`` against every corpus document.

        The result is a list ordered by corpus index.
        """
        return [self.get_sim_score(document, index)
                for index in range(self.corpus_files_size)]

    def get_scores_bow(self, document):
        """Return ``(index, score)`` pairs for corpus documents scoring above 0."""
        return [(index, score)
                for index, score in enumerate(self.get_sim_scores(document))
                if score > 0]

    def _initialize(self, corpus_files):
        """
        Calculates frequencies of terms in documents and in corpus_files.
        Also computes inverse document frequencies.

        An empty corpus, or a corpus containing only empty documents, leaves
        ``idf_dict`` empty and ``average_idf`` at 0 instead of raising
        ``ZeroDivisionError``.
        """
        doc_counts = {}  # word -> number of documents with word
        total_words = 0
        for document_file in corpus_files:
            self.corpus_files_size += 1
            self.doc_len.append(len(document_file))
            total_words += len(document_file)

            frequencies_dict = {}
            for word in document_file:
                frequencies_dict[word] = frequencies_dict.get(word, 0) + 1
            self.doc_file_freqs.append(frequencies_dict)

            for word in frequencies_dict:
                doc_counts[word] = doc_counts.get(word, 0) + 1

        if not self.corpus_files_size:
            # Nothing was indexed; keep the defaults set by __init__.
            return

        self.avg_dl = float(total_words) / self.corpus_files_size

        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        negative_idfs_list = []
        for word, freq in doc_counts.items():
            idf = math.log(self.corpus_files_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf_dict[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs_list.append(word)

        if not self.idf_dict:
            # Every document was empty, so there is no IDF to average.
            return

        self.average_idf = float(idf_sum) / len(self.idf_dict)

        # Words present in more than half of the documents get a negative
        # IDF; replace it with a fraction of the average IDF.
        eps = EPSILON * self.average_idf
        for word in negative_idfs_list:
            self.idf_dict[word] = eps
|
||||
Reference in New Issue
Block a user