import pandas as pd import numpy as np import itertools import networkx as nx from scipy.stats import f as f_dist from scipy.stats import chi2 as chi2_dist import sample_path as sp import structure as st import network_graph as ng import parameters_estimator as pe import cache as ch class StructureEstimator: def __init__(self, sample_path, exp_test_alfa, chi_test_alfa): self.sample_path = sample_path self.complete_graph_frame = self.build_complete_graph_frame(self.sample_path.structure.list_of_nodes_labels()) self.complete_graph = self.build_complete_graph(self.sample_path.structure.list_of_nodes_labels()) self.exp_test_sign = exp_test_alfa self.chi_test_alfa = chi_test_alfa self.cache = ch.Cache() def build_complete_graph_frame(self, node_ids): complete_frame = pd.DataFrame(itertools.permutations(node_ids, 2)) complete_frame.columns = ['From', 'To'] return complete_frame def build_complete_graph(self, node_ids): complete_graph = nx.DiGraph() complete_graph.add_nodes_from(node_ids) complete_graph.add_edges_from(itertools.permutations(node_ids, 2)) return complete_graph def complete_test(self, test_parent, test_child, parent_set): p_set = parent_set[:] complete_info = parent_set[:] complete_info.append(test_parent) tmp_df = self.complete_graph_frame.loc[self.complete_graph_frame['To'].isin([test_child])] #tmp_df = self.complete_graph_frame.loc[np.in1d(self.complete_graph_frame['To'], test_child)] d2 = tmp_df.loc[tmp_df['From'].isin(complete_info)] complete_info.append(test_child) v2 = self.sample_path.structure.variables_frame.loc[ self.sample_path.structure.variables_frame['Name'].isin(complete_info)] #print(tmp_df) #d1 = tmp_df.loc[tmp_df['From'].isin(parent_set)] #parent_set.append(test_child) #print(parent_set) """v1 = self.sample_path.structure.variables_frame.loc[self.sample_path.structure.variables_frame['Name'].isin(parent_set)] s1 = st.Structure(d1, v1, self.sample_path.total_variables_count) g1 = ng.NetworkGraph(s1) g1.init_graph()""" #parent_set.append(test_parent) """d2 = tmp_df.loc[tmp_df['From'].isin(parent_set)] v2 = self.sample_path.structure.variables_frame.loc[self.sample_path.structure.variables_frame['Name'].isin(parent_set)] s2 = st.Structure(d2, v2, self.sample_path.total_variables_count) g2 = ng.NetworkGraph(s2) g2.init_graph()""" #parent_set.append(test_child) sofc1 = None #if not sofc1: if not p_set: sofc1 = self.cache.find(test_child) if not sofc1: #d1 = tmp_df.loc[tmp_df['From'].isin(parent_set)] d1 = d2[d2.From != test_parent] #v1 = self.sample_path.structure.variables_frame.loc[ #self.sample_path.structure.variables_frame['Name'].isin(parent_set)] v1 = v2[v2.Name != test_parent] #print("D1", d1) #print("V1", v1) s1 = st.Structure(d1, v1, self.sample_path.total_variables_count) g1 = ng.NetworkGraph(s1) g1.init_graph() p1 = pe.ParametersEstimator(self.sample_path, g1) p1.init_sets_cims_container() #print("Computing params for",test_child, test_parent, parent_set) p1.compute_parameters_for_node(test_child) sofc1 = p1.sets_of_cims_struct.sets_of_cims[s1.get_positional_node_indx(test_child)] self.cache.put(test_child,sofc1) sofc2 = None p_set.append(test_parent) if p_set: #p_set.append(test_parent) #print("PSET ", p_set) set_p_set = set(p_set) sofc2 = self.cache.find(set_p_set) #print("Sofc2 ", sofc2) #print(self.cache.list_of_sets_of_indxs) """p2 = pe.ParametersEstimator(self.sample_path, g2) p2.init_sets_cims_container() #p2.compute_parameters() p2.compute_parameters_for_node(test_child) sofc2 = p2.sets_of_cims_struct.sets_of_cims[s2.get_positional_node_indx(test_child)]""" if not sofc2 or p_set: print("Cache Miss SOC2") #parent_set.append(test_parent) #d2 = tmp_df.loc[tmp_df['From'].isin(p_set)] #v2 = self.sample_path.structure.variables_frame.loc[ #self.sample_path.structure.variables_frame['Name'].isin(parent_set)] #print("D2", d2) #print("V2", v2) #s2 = st.Structure(d2, v2, self.sample_path.total_variables_count) s2 = st.Structure(d2, v2, self.sample_path.total_variables_count) g2 = ng.NetworkGraph(s2) g2.init_graph() p2 = pe.ParametersEstimator(self.sample_path, g2) p2.init_sets_cims_container() # p2.compute_parameters() p2.compute_parameters_for_node(test_child) sofc2 = p2.sets_of_cims_struct.sets_of_cims[s2.get_positional_node_indx(test_child)] if p_set: #set_p_set = set(p_set) self.cache.put(set_p_set, sofc2) end = 0 increment = self.sample_path.structure.get_states_number(test_parent) for cim1 in sofc1.actual_cims: start = end end = start + increment for j in range(start, end): #cim2 = sofc2.actual_cims[j] #print(indx) #print("Run Test", i, j) if not self.independence_test(test_child, cim1, sofc2.actual_cims[j]): return False return True def independence_test(self, tested_child, cim1, cim2): # Fake exp test r1s = cim1.state_transition_matrix.diagonal() r2s = cim2.state_transition_matrix.diagonal() F_stats = cim2.cim.diagonal() / cim1.cim.diagonal() child_states_numb = self.sample_path.structure.get_states_number(tested_child) for val in range(0, child_states_numb): # i possibili valori di tested child TODO QUESTO CONTO DEVE ESSERE VETTORIZZATO #r1 = cim1.state_transition_matrix[val][val] #r2 = cim2.state_transition_matrix[val][val] #print("No Test Parent:",cim1.cim[val][val],"With Test Parent", cim2.cim[val][val]) #F = cim2.cim[val][val] / cim1.cim[val][val] #print("Exponential test", F_stats[val], r1s[val], r2s[val]) #print(f_dist.ppf(1 - self.exp_test_sign / 2, r1, r2)) #print(f_dist.ppf(self.exp_test_sign / 2, r1, r2)) if F_stats[val] < f_dist.ppf(self.exp_test_sign / 2, r1s[val], r2s[val]) or \ F_stats[val] > f_dist.ppf(1 - self.exp_test_sign / 2, r1s[val], r2s[val]): print("CONDITIONALLY DEPENDENT EXP") return False # fake chi test M1_no_diag = self.remove_diagonal_elements(cim1.state_transition_matrix) M2_no_diag = self.remove_diagonal_elements(cim2.state_transition_matrix) #print("M1 no diag", M1_no_diag) #print("M2 no diag", M2_no_diag) chi_2_quantile = chi2_dist.ppf(1 - self.chi_test_alfa, child_states_numb - 1) """ Ks = np.sqrt(cim1.state_transition_matrix.diagonal() / cim2.state_transition_matrix.diagonal()) Ls = np.reciprocal(Ks) chi_stats = np.sum((np.power((M2_no_diag.T * Ks).T - (M1_no_diag.T * Ls).T, 2) \ / (M1_no_diag + M2_no_diag)), axis=1)""" Ks = np.sqrt(r1s / r2s) Ls = np.sqrt(r2s / r1s) for val in range(0, child_states_numb): #K = math.sqrt(cim1.state_transition_matrix[val][val] / cim2.state_transition_matrix[val][val]) #L = 1 / K Chi = np.sum(np.power(Ks[val] * M2_no_diag[val] - Ls[val] *M1_no_diag[val], 2) / (M1_no_diag[val] + M2_no_diag[val])) #print("Chi Stats", Chi) #print("Chi Quantile", chi_2_quantile) if Chi > chi_2_quantile: #if np.any(chi_stats > chi_2_quantile): print("CONDITIONALLY DEPENDENT CHI") return False #print("Chi test", Chi) return True def one_iteration_of_CTPC_algorithm(self, var_id): u = list(self.complete_graph.predecessors(var_id)) tests_parents_numb = len(u) #print(u) b = 0 #parent_indx = 0 while b < len(u): #for parent_id in u: parent_indx = 0 while u and parent_indx < tests_parents_numb and b < len(u): # list_without_test_parent = u.remove(parent_id) removed = False #print("b", b) #print("Parent Indx", parent_indx) #if not list(self.generate_possible_sub_sets_of_size(u, b, u[parent_indx])): #break S = self.generate_possible_sub_sets_of_size(u, b, u[parent_indx]) #print("U Set", u) #print("S", S) for parents_set in S: #print("Parent Set", parents_set) #print("Test Parent", u[parent_indx]) if self.complete_test(u[parent_indx], var_id, parents_set): #print("Removing EDGE:", u[parent_indx], var_id) self.complete_graph.remove_edge(u[parent_indx], var_id) #print(self.complete_graph_frame) """self.complete_graph_frame = \ self.complete_graph_frame.drop( self.complete_graph_frame[(self.complete_graph_frame.From == u[parent_indx]) & (self.complete_graph_frame.To == var_id)].index)""" self.complete_graph_frame.drop(self.complete_graph_frame[(self.complete_graph_frame.From == u[parent_indx]) & (self.complete_graph_frame.To == var_id)].index) #print(self.complete_graph_frame) #u.remove(u[parent_indx]) del u[parent_indx] removed = True #else: #parent_indx += 1 if not removed: parent_indx += 1 b += 1 self.cache.clear() def generate_possible_sub_sets_of_size(self, u, size, parent_id): #print("Inside Generate subsets", u) #print("InsideGenerate Subsets", parent_id) list_without_test_parent = u[:] list_without_test_parent.remove(parent_id) # u.remove(parent_id) #print(list(map(list, itertools.combinations(list_without_test_parent, size)))) return map(list, itertools.combinations(list_without_test_parent, size)) def remove_diagonal_elements(self, matrix): m = matrix.shape[0] strided = np.lib.stride_tricks.as_strided s0, s1 = matrix.strides return strided(matrix.ravel()[1:], shape=(m - 1, m), strides=(s0 + s1, s1)).reshape(m, -1) def ctpc_algorithm(self): for node_id in self.sample_path.structure.list_of_nodes_labels(): print("TESTING VAR:", node_id) self.one_iteration_of_CTPC_algorithm(node_id)