# Source code for pm4py.algo.clustering.trace_attribute_driven.linkage_method.linkage_avg

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
import numpy as np
from scipy.spatial.distance import squareform
from pm4py.algo.clustering.trace_attribute_driven.leven_dist import leven_dist_calc
from pm4py.algo.clustering.trace_attribute_driven.merge_log import merge_log
from pm4py.algo.clustering.trace_attribute_driven.dfg import dfg_dist
from pm4py.algo.clustering.trace_attribute_driven.variants import act_dist_calc
from pm4py.algo.clustering.trace_attribute_driven.variants import suc_dist_calc


def linkage_dfg_update(loglist, dist_mat, alpha, percent):
    """
    Build a linkage matrix, recomputing distances to each merged cluster.

    At every step the two closest clusters are merged with
    ``merge_log.update_merge``. The distance from the merged log to every
    other active cluster is then recomputed as
    ``dist_act * alpha + dist_dfg * (1 - alpha)``, where the pair
    ``(dist_act, dist_dfg)`` is returned by ``dfg_dist.dfg_dist_calc``.

    Parameters
    ----------
    loglist
        List with one log per initial cluster. It must have as many entries
        as ``dist_mat`` has rows, because cluster ids are used as indexes
        into it. NOTE: every merged log is appended to this list, so the
        caller's list grows by one entry per loop iteration.
    dist_mat
        Square distance matrix accepted by
        ``scipy.spatial.distance.squareform``.
    alpha
        Weight of ``dist_act``; ``dist_dfg`` is weighted by ``1 - alpha``.
    percent
        Unused by this function.

    Returns
    -------
    numpy.ndarray
        One row ``[id_a, id_b, distance, size]`` per merge. Initial clusters
        have ids ``0 .. n - 1`` and size 1; the cluster created by the k-th
        merge (k starting at 1) has id ``n - 1 + k``. With fewer than two
        clusters there is nothing to merge and an empty ``(0, 4)`` array is
        returned.
    """
    n = len(dist_mat)  # number of initial clusters
    if n < 2:
        return np.empty((0, 4))

    # pairs[p] names the two clusters whose distance is stored in dists[p].
    pairs = [[i, j] for i in range(n) for j in range(i + 1, n)]
    # Force a float vector: an integer matrix would otherwise truncate the
    # recomputed distances written into it during the first merge.
    dists = squareform(dist_mat).astype(float)
    sizes = dict(zip(range(n), np.ones(n)))
    active = list(range(len(loglist)))  # ids of clusters not merged yet
    merges = []

    step = 1
    while step <= n - 2:
        new_id = n - 1 + step
        best = np.argmin(dists)
        left, right = pairs[best]
        sizes[new_id] = sizes[left] + sizes[right]
        merges.append([left, right, dists[best], sizes[new_id]])

        # Relabel both merged clusters as new_id, remembering which slots
        # mentioned which of them.
        with_left = []
        with_right = []
        for pos, pair in enumerate(pairs):
            if left in pair:
                with_left.append(pos)
                pair[pair.index(left)] = new_id
            if right in pair:
                with_right.append(pos)
                pair[pair.index(right)] = new_id
            pair.sort()

        merged_log = merge_log.update_merge([loglist[left], loglist[right]])
        others = list(set(active) - {left, right})
        new_dist = {}
        for node in others:
            dist_act, dist_dfg = dfg_dist.dfg_dist_calc(merged_log, loglist[node])
            new_dist[node] = dist_act * alpha + dist_dfg * (1 - alpha)
        loglist.append(merged_log)  # becomes loglist[new_id]
        others.append(new_id)
        active = others

        # Every other cluster x now appears twice as [x, new_id]. Keep the
        # slot that came from `left`, drop the one that came from `right`
        # together with the merged pair itself.
        dropped = set(with_right)
        with_left.remove(best)
        for pos in with_left:
            # After sorting, the pair is [x, new_id], so [0] is the other node.
            dists[pos] = new_dist[pairs[pos][0]]

        keep = [pos for pos in range(len(pairs)) if pos not in dropped]
        dists = [dists[pos] for pos in keep]
        pairs = [pairs[pos] for pos in keep]
        step += 1

    # Exactly one pair is left: the final merge.
    left, right = pairs[0]
    sizes[n - 1 + step] = sizes[left] + sizes[right]
    merges.append([left, right, dists[0], sizes[n - 1 + step]])
    return np.array(merges)
def linkage_avg(loglist, dist_mat, alpha, percent):
    """
    Build a linkage matrix by size-weighted average linkage.

    At every step the two closest clusters are merged. The distance from the
    merged cluster to any other cluster is the average of the two previous
    distances, weighted by the sizes of the two merged clusters. The initial
    size of cluster ``i`` is ``len(loglist[i])``.

    Parameters
    ----------
    loglist
        Sequence with one sized entry per initial cluster; only the length
        of each entry is read here.
    dist_mat
        Square distance matrix accepted by
        ``scipy.spatial.distance.squareform``.
    alpha
        Unused by this function.
    percent
        Unused by this function.

    Returns
    -------
    numpy.ndarray
        One row ``[id_a, id_b, distance, size]`` per merge. Initial clusters
        have ids ``0 .. n - 1``; the cluster created by the k-th merge
        (k starting at 1) has id ``n - 1 + k``. With fewer than two clusters
        there is nothing to merge and an empty ``(0, 4)`` array is returned.
    """
    n = len(dist_mat)  # number of initial clusters
    if n < 2:
        return np.empty((0, 4))

    sizes = {i: len(loglist[i]) for i in range(n)}
    # pairs[p] names the two clusters whose distance is stored in dists[p].
    pairs = [[i, j] for i in range(n) for j in range(i + 1, n)]
    # Force a float vector: an integer matrix would otherwise truncate the
    # averaged distances written into it during the first merge.
    dists = squareform(dist_mat).astype(float)
    merges = []

    step = 1
    while step <= n - 2:
        new_id = n - 1 + step
        best = np.argmin(dists)
        left, right = pairs[best]
        total = sizes[left] + sizes[right]
        sizes[new_id] = total
        merges.append([left, right, dists[best], total])

        # Relabel both merged clusters as new_id, remembering which slots
        # mentioned which of them.
        with_left = []
        with_right = []
        for pos, pair in enumerate(pairs):
            if left in pair:
                with_left.append(pos)
                pair[pair.index(left)] = new_id
            if right in pair:
                with_right.append(pos)
                pair[pair.index(right)] = new_id
            pair.sort()

        # Every other cluster x now appears twice as [x, new_id]. Keep the
        # slot that came from `left`, drop the one that came from `right`
        # together with the merged pair itself.
        dropped = set(with_right)
        with_left.remove(best)
        with_right.remove(best)

        # with_left[i] and with_right[i] refer to the same other cluster.
        for pos_l, pos_r in zip(with_left, with_right):
            dists[pos_l] = (dists[pos_l] * sizes[left]
                            + dists[pos_r] * sizes[right]) / total

        keep = [pos for pos in range(len(pairs)) if pos not in dropped]
        dists = [dists[pos] for pos in keep]
        pairs = [pairs[pos] for pos in keep]
        step += 1

    # Exactly one pair is left: the final merge.
    left, right = pairs[0]
    sizes[n - 1 + step] = sizes[left] + sizes[right]
    merges.append([left, right, dists[0], sizes[n - 1 + step]])
    return np.array(merges)
def linkage_DMM_update(loglist, dist_mat, alpha, percent):
    """
    Build a linkage matrix, recomputing distances to each merged cluster.

    At every step the two closest clusters are merged with
    ``merge_log.update_merge``. The distance from the merged log to every
    other active cluster is then recomputed as
    ``dist_act * alpha + dist_suc * (1 - alpha)``, where ``dist_act`` comes
    from ``act_dist_calc.act_sim_percent`` and ``dist_suc`` from
    ``suc_dist_calc.suc_sim_percent``.

    Parameters
    ----------
    loglist
        List with one log per initial cluster. It must have as many entries
        as ``dist_mat`` has rows, because cluster ids are used as indexes
        into it. NOTE: every merged log is appended to this list, so the
        caller's list grows by one entry per loop iteration.
    dist_mat
        Square distance matrix accepted by
        ``scipy.spatial.distance.squareform``.
    alpha
        Weight of ``dist_act``; ``dist_suc`` is weighted by ``1 - alpha``.
    percent
        Passed as both the third and the fourth argument of the two
        distance functions.

    Returns
    -------
    numpy.ndarray
        One row ``[id_a, id_b, distance, size]`` per merge. Initial clusters
        have ids ``0 .. n - 1`` and size 1; the cluster created by the k-th
        merge (k starting at 1) has id ``n - 1 + k``. With fewer than two
        clusters there is nothing to merge and an empty ``(0, 4)`` array is
        returned.
    """
    n = len(dist_mat)  # number of initial clusters
    if n < 2:
        return np.empty((0, 4))

    # pairs[p] names the two clusters whose distance is stored in dists[p].
    pairs = [[i, j] for i in range(n) for j in range(i + 1, n)]
    # Force a float vector: an integer matrix would otherwise truncate the
    # recomputed distances written into it during the first merge.
    dists = squareform(dist_mat).astype(float)
    sizes = dict(zip(range(n), np.ones(n)))
    active = list(range(len(loglist)))  # ids of clusters not merged yet
    merges = []

    step = 1
    while step <= n - 2:
        new_id = n - 1 + step
        best = np.argmin(dists)
        left, right = pairs[best]
        sizes[new_id] = sizes[left] + sizes[right]
        merges.append([left, right, dists[best], sizes[new_id]])

        # Relabel both merged clusters as new_id, remembering which slots
        # mentioned which of them.
        with_left = []
        with_right = []
        for pos, pair in enumerate(pairs):
            if left in pair:
                with_left.append(pos)
                pair[pair.index(left)] = new_id
            if right in pair:
                with_right.append(pos)
                pair[pair.index(right)] = new_id
            pair.sort()

        merged_log = merge_log.update_merge([loglist[left], loglist[right]])
        others = list(set(active) - {left, right})
        new_dist = {}
        for node in others:
            dist_act = act_dist_calc.act_sim_percent(
                merged_log, loglist[node], percent, percent)
            dist_suc = suc_dist_calc.suc_sim_percent(
                merged_log, loglist[node], percent, percent)
            new_dist[node] = dist_act * alpha + dist_suc * (1 - alpha)
        loglist.append(merged_log)  # becomes loglist[new_id]
        others.append(new_id)
        active = others

        # Every other cluster x now appears twice as [x, new_id]. Keep the
        # slot that came from `left`, drop the one that came from `right`
        # together with the merged pair itself.
        dropped = set(with_right)
        with_left.remove(best)
        for pos in with_left:
            # After sorting, the pair is [x, new_id], so [0] is the other node.
            dists[pos] = new_dist[pairs[pos][0]]

        keep = [pos for pos in range(len(pairs)) if pos not in dropped]
        dists = [dists[pos] for pos in keep]
        pairs = [pairs[pos] for pos in keep]
        step += 1

    # Exactly one pair is left: the final merge.
    left, right = pairs[0]
    sizes[n - 1 + step] = sizes[left] + sizes[right]
    merges.append([left, right, dists[0], sizes[n - 1 + step]])
    return np.array(merges)
def linkage_DMM_update_leven(loglist, dist_mat, alpha, percent):
    """
    Build a linkage matrix, recomputing distances to each merged cluster.

    At every step the two closest clusters are merged with
    ``merge_log.update_merge``. The distance from the merged log to every
    other active cluster is then recomputed with
    ``leven_dist_calc.leven_dist``.

    Parameters
    ----------
    loglist
        List with one log per initial cluster. It must have as many entries
        as ``dist_mat`` has rows, because cluster ids are used as indexes
        into it. NOTE: every merged log is appended to this list, so the
        caller's list grows by one entry per loop iteration.
    dist_mat
        Square distance matrix accepted by
        ``scipy.spatial.distance.squareform``.
    alpha
        Unused by this function.
    percent
        Passed as both the third and the fourth argument of
        ``leven_dist_calc.leven_dist``.

    Returns
    -------
    numpy.ndarray
        One row ``[id_a, id_b, distance, size]`` per merge. Initial clusters
        have ids ``0 .. n - 1`` and size 1; the cluster created by the k-th
        merge (k starting at 1) has id ``n - 1 + k``. With fewer than two
        clusters there is nothing to merge and an empty ``(0, 4)`` array is
        returned.
    """
    n = len(dist_mat)  # number of initial clusters
    if n < 2:
        return np.empty((0, 4))

    # pairs[p] names the two clusters whose distance is stored in dists[p].
    pairs = [[i, j] for i in range(n) for j in range(i + 1, n)]
    # Force a float vector: an integer matrix would otherwise truncate the
    # recomputed distances written into it during the first merge.
    dists = squareform(dist_mat).astype(float)
    sizes = dict(zip(range(n), np.ones(n)))
    active = list(range(len(loglist)))  # ids of clusters not merged yet
    merges = []

    step = 1
    while step <= n - 2:
        new_id = n - 1 + step
        best = np.argmin(dists)
        left, right = pairs[best]
        sizes[new_id] = sizes[left] + sizes[right]
        merges.append([left, right, dists[best], sizes[new_id]])

        # Relabel both merged clusters as new_id, remembering which slots
        # mentioned which of them.
        with_left = []
        with_right = []
        for pos, pair in enumerate(pairs):
            if left in pair:
                with_left.append(pos)
                pair[pair.index(left)] = new_id
            if right in pair:
                with_right.append(pos)
                pair[pair.index(right)] = new_id
            pair.sort()

        merged_log = merge_log.update_merge([loglist[left], loglist[right]])
        others = list(set(active) - {left, right})
        new_dist = {}
        for node in others:
            new_dist[node] = leven_dist_calc.leven_dist(
                merged_log, loglist[node], percent, percent)
        loglist.append(merged_log)  # becomes loglist[new_id]
        others.append(new_id)
        active = others

        # Every other cluster x now appears twice as [x, new_id]. Keep the
        # slot that came from `left`, drop the one that came from `right`
        # together with the merged pair itself.
        dropped = set(with_right)
        with_left.remove(best)
        for pos in with_left:
            # After sorting, the pair is [x, new_id], so [0] is the other node.
            dists[pos] = new_dist[pairs[pos][0]]

        keep = [pos for pos in range(len(pairs)) if pos not in dropped]
        dists = [dists[pos] for pos in keep]
        pairs = [pairs[pos] for pos in keep]
        step += 1

    # Exactly one pair is left: the final merge.
    left, right = pairs[0]
    sizes[n - 1 + step] = sizes[left] + sizes[right]
    merges.append([left, right, dists[0], sizes[n - 1 + step]])
    return np.array(merges)