diff --git a/main_package/classes/estimators/structure_constraint_based_estimator.py b/main_package/classes/estimators/structure_constraint_based_estimator.py index 0e27ee8..3779e6d 100644 --- a/main_package/classes/estimators/structure_constraint_based_estimator.py +++ b/main_package/classes/estimators/structure_constraint_based_estimator.py @@ -36,28 +36,30 @@ class StructureConstraintBasedEstimator(se.StructureEstimator): :chi_test_alfa: the significance level for the chi Hp test """ - def __init__(self, sample_path: sp.SamplePath, exp_test_alfa: float, chi_test_alfa: float): - super().__init__(sample_path) + def __init__(self, sample_path: sp.SamplePath, exp_test_alfa: float, chi_test_alfa: float,known_edges: typing.List= []): + super().__init__(sample_path,known_edges) self.exp_test_sign = exp_test_alfa self.chi_test_alfa = chi_test_alfa def complete_test(self, test_parent: str, test_child: str, parent_set: typing.List, child_states_numb: int, tot_vars_count: int): - """ - Permorms a complete independence test on the directed graphs G1 = test_child U parent_set - G2 = G1 U test_parent (added as an additional parent of the test_child). + """Performs a complete independence test on the directed graphs G1 = {test_child U parent_set} + G2 = {G1 U test_parent} (added as an additional parent of the test_child). Generates all the necessary structures and datas to perform the tests. - Parameters: - test_parent: the node label of the test parent - test_child: the node label of the child - parent_set: the common parent set - child_states_numb: the cardinality of the test_child - tot_vars_count_ the total number of variables in the net - Returns: - True iff test_child and test_parent are independent given the sep_set parent_set - False otherwise + :param test_parent: the node label of the test parent + :type test_parent: string + :param test_child: the node label of the child + :type test_child: string + :param parent_set: the common parent set + :type parent_set: List + :param child_states_numb: the cardinality of the ``test_child`` + :type child_states_numb: int + :param tot_vars_count: the total number of variables in the net + :type tot_vars_count: int + :return: True iff test_child and test_parent are independent given the sep_set parent_set. False otherwise + :rtype: bool """ #print("Test Parent:", test_parent) #print("Sep Set", parent_set) @@ -92,7 +94,7 @@ class StructureConstraintBasedEstimator(se.StructureEstimator): s1 = st.Structure(l1, indxs1, vals1, eds1, tot_vars_count) g1 = ng.NetworkGraph(s1) g1.fast_init(test_child) - p1 = pe.ParametersEstimator(self.sample_path, g1) + p1 = pe.ParametersEstimator(self._sample_path, g1) p1.fast_init(test_child) sofc1 = p1.compute_parameters_for_node(test_child) #if not p_set: @@ -125,7 +127,7 @@ class StructureConstraintBasedEstimator(se.StructureEstimator): s2 = st.Structure(l2, indxs2, vals2, eds2, tot_vars_count) g2 = ng.NetworkGraph(s2) g2.fast_init(test_child) - p2 = pe.ParametersEstimator(self.sample_path, g2) + p2 = pe.ParametersEstimator(self._sample_path, g2) p2.fast_init(test_child) sofc2 = p2.compute_parameters_for_node(test_child) self.cache.put(set(p_set), sofc2) @@ -146,19 +148,18 @@ class StructureConstraintBasedEstimator(se.StructureEstimator): def independence_test(self, child_states_numb: int, cim1: condim.ConditionalIntensityMatrix, cim2: condim.ConditionalIntensityMatrix): - """ - Compute the actual independence test using two cims. + """Compute the actual independence test using two cims. It is performed first the exponential test and if the null hypothesis is not rejected, - it is permormed also the chi_test. - - Parameters: - child_states_numb: the cardinality of the test child - cim1: a cim belonging to the graph without test parent - cim2: a cim belonging to the graph with test parent - - Returns: - True iff both tests do NOT reject the null hypothesis of indipendence - False otherwise + it is performed also the chi_test. + + :param child_states_numb: the cardinality of the test child + :type child_states_numb: int + :param cim1: a cim belonging to the graph without test parent + :type cim1: ConditionalIntensityMatrix + :param cim2: a cim belonging to the graph with test parent + :type cim2: ConditionalIntensityMatrix + :return: True iff both tests do NOT reject the null hypothesis of independence. False otherwise. + :rtype: bool """ M1 = cim1.state_transition_matrix M2 = cim2.state_transition_matrix @@ -202,14 +203,10 @@ class StructureConstraintBasedEstimator(se.StructureEstimator): return True def one_iteration_of_CTPC_algorithm(self, var_id: str, tot_vars_count: int): - """ - Performs an iteration of the CTPC algorithm using the node var_id as test_child. + """Performs an iteration of the CTPC algorithm using the node ``var_id`` as ``test_child``. - Parameters: - var_id: the node label of the test child - tot_vars_count: the number of nodes in the net - Returns: - void + :param var_id: the node label of the test child + :type var_id: string """ optimizer_obj = optimizer.ConstraintBasedOptimizer( node_id = var_id, @@ -227,7 +224,7 @@ class StructureConstraintBasedEstimator(se.StructureEstimator): void """ ctpc_algo = self.one_iteration_of_CTPC_algorithm - total_vars_numb = self.sample_path.total_variables_count + total_vars_numb = self._sample_path.total_variables_count n_nodes= len(self.nodes) @@ -239,7 +236,7 @@ class StructureConstraintBasedEstimator(se.StructureEstimator): 'Remove all the edges from the structure' - self.sample_path.structure.clean_structure_edges() + self._sample_path.structure.clean_structure_edges() 'Estimate the best parents for each node' #with multiprocessing.Pool(processes=cpu_count) as pool: diff --git a/main_package/classes/estimators/structure_estimator.py b/main_package/classes/estimators/structure_estimator.py index db24847..3c3256b 100644 --- a/main_package/classes/estimators/structure_estimator.py +++ b/main_package/classes/estimators/structure_estimator.py @@ -33,22 +33,39 @@ class StructureEstimator(ABC): :cache: the cache object """ - def __init__(self, sample_path: sp.SamplePath): - self.sample_path = sample_path - self.nodes = np.array(self.sample_path.structure.nodes_labels) - self.nodes_vals = self.sample_path.structure.nodes_values - self.nodes_indxs = self.sample_path.structure.nodes_indexes - self.complete_graph = self.build_complete_graph(self.sample_path.structure.nodes_labels) + def __init__(self, sample_path: sp.SamplePath, known_edges: typing.List = None): + self._sample_path = sample_path + self.nodes = np.array(self._sample_path.structure.nodes_labels) + self.nodes_vals = self._sample_path.structure.nodes_values + self.nodes_indxs = self._sample_path.structure.nodes_indexes + self._removable_edges_matrix = self.build_removable_edges_matrix(known_edges) + self.complete_graph = self.build_complete_graph(self._sample_path.structure.nodes_labels) self.cache = ch.Cache() - def build_complete_graph(self, node_ids: typing.List): + def build_removable_edges_matrix(self, known_edges: typing.List): + """Builds a boolean matrix who shows if a edge could be removed or not, based on prior knowledge given: + + :param known_edges: the list of nodes labels + :type known_edges: List + :return: a boolean matrix + :rtype: np.ndarray """ - Builds a complete directed graph (no self loops) given the nodes labels in the list node_ids: + tot_vars_count = self._sample_path.total_variables_count + complete_adj_matrix = np.full((tot_vars_count, tot_vars_count), True) + if known_edges: + for edge in known_edges: + i = self._sample_path.structure.get_node_indx(edge[0]) + j = self._sample_path.structure.get_node_indx(edge[1]) + complete_adj_matrix[i][j] = False + return complete_adj_matrix + + def build_complete_graph(self, node_ids: typing.List): + """Builds a complete directed graph (no self loops) given the nodes labels in the list ``node_ids``: - Parameters: - node_ids: the list of nodes labels - Returns: - a complete Digraph Object + :param node_ids: the list of nodes labels + :type node_ids: List + :return: a complete Digraph Object + :rtype: networkx.DiGraph """ complete_graph = nx.DiGraph() complete_graph.add_nodes_from(node_ids) @@ -57,33 +74,28 @@ class StructureEstimator(ABC): def generate_possible_sub_sets_of_size(self, u: typing.List, size: int, parent_label: str): - """ - Creates a list containing all possible subsets of the list u of size size, - that do not contains a the node identified by parent_label. - - Parameters: - u: the list of nodes - size: the size of the subsets - parent_label: the nodes to exclude in the subsets generation - Returns: - a Map Object containing a list of lists - + """Creates a list containing all possible subsets of the list ``u`` of size ``size``, + that do not contains a the node identified by ``parent_label``. + + :param u: the list of nodes + :type u: List + :param size: the size of the subsets + :type size: int + :param parent_label: the node to exclude in the subsets generation + :type parent_label: string + :return: an Iterator Object containing a list of lists + :rtype: Iterator """ list_without_test_parent = u[:] list_without_test_parent.remove(parent_label) return map(list, itertools.combinations(list_without_test_parent, size)) def save_results(self): - """ - Save the estimated Structure to a .json file - - Parameters: - void - Returns: - void + """Save the estimated Structure to a .json file in the path where the data are loaded from. + The file is named as the input dataset but the `results_` word is appended to the results file. """ res = json_graph.node_link_data(self.complete_graph) - name = self.sample_path.importer.file_path.rsplit('/',1)[-1] + name = self._sample_path.importer.file_path.rsplit('/',1)[-1] #print(name) name = '../results_' + name with open(name, 'w+') as f: @@ -99,14 +111,71 @@ class StructureEstimator(ABC): @abc.abstractmethod def estimate_structure(self) -> typing.List: + """Abstract method to estimate the structure + + :return: List of estimated edges + :rtype: Typing.List """ - Compute Optimization process for a structure_estimator + pass - Parameters: + + def adjacency_matrix(self) -> np.ndarray: + """Converts the estimated structure ``_complete_graph`` to a boolean adjacency matrix representation. - Returns: - the estimated structure for the node + :return: The adjacency matrix of the graph ``_complete_graph`` + :rtype: numpy.ndArray + """ + return nx.adj_matrix(self._complete_graph).toarray().astype(bool) + def spurious_edges(self) -> typing.List: + """Return the spurious edges present in the estimated structure, if a prior net structure is present in + ``_sample_path.structure``. + + :return: A list containing the spurious edges + :rtype: List """ - pass + if not self._sample_path.has_prior_net_structure: + raise RuntimeError("Can not compute spurious edges with no prior net structure!") + real_graph = nx.DiGraph() + real_graph.add_nodes_from(self._sample_path.structure.nodes_labels) + real_graph.add_edges_from(self._sample_path.structure.edges) + return nx.difference(real_graph, self._complete_graph).edges + + def save_plot_estimated_structure_graph(self) -> None: + """Plot the estimated structure in a graphical model style. + Spurious edges are colored in red. + """ + graph_to_draw = nx.DiGraph() + spurious_edges = self.spurious_edges() + non_spurious_edges = list(set(self._complete_graph.edges) - set(spurious_edges)) + print(non_spurious_edges) + edges_colors = ['red' if edge in spurious_edges else 'black' for edge in self._complete_graph.edges] + graph_to_draw.add_edges_from(spurious_edges) + graph_to_draw.add_edges_from(non_spurious_edges) + pos = nx.spring_layout(graph_to_draw, k=0.5*1/np.sqrt(len(graph_to_draw.nodes())), iterations=50,scale=10) + options = { + "node_size": 2000, + "node_color": "white", + "edgecolors": "black", + 'linewidths':2, + "with_labels":True, + "font_size":13, + 'connectionstyle': 'arc3, rad = 0.1', + "arrowsize": 15, + "arrowstyle": '<|-', + "width": 1, + "edge_color":edges_colors, + } + + nx.draw(graph_to_draw, pos, **options) + ax = plt.gca() + ax.margins(0.20) + plt.axis("off") + name = self._sample_path._importer.file_path.rsplit('/', 1)[-1] + name = name.split('.', 1)[0] + name += '_' + str(self._sample_path._importer.dataset_id()) + name += '.png' + plt.savefig(name) + plt.clf() + print("Estimated Structure Plot Saved At: ", os.path.abspath(name)) diff --git a/main_package/classes/estimators/structure_score_based_estimator.py b/main_package/classes/estimators/structure_score_based_estimator.py index 8fe2395..6ab118d 100644 --- a/main_package/classes/estimators/structure_score_based_estimator.py +++ b/main_package/classes/estimators/structure_score_based_estimator.py @@ -44,8 +44,8 @@ class StructureScoreBasedEstimator(se.StructureEstimator): """ - def __init__(self, sample_path: sp.SamplePath, tau_xu:int=0.1, alpha_xu:int = 1): - super().__init__(sample_path) + def __init__(self, sample_path: sp.SamplePath, tau_xu:int=0.1, alpha_xu:int = 1,known_edges: typing.List= []): + super().__init__(sample_path,known_edges) self.tau_xu=tau_xu self.alpha_xu=alpha_xu @@ -70,11 +70,11 @@ class StructureScoreBasedEstimator(se.StructureEstimator): """ 'Save the true edges structure in tuples' - true_edges = copy.deepcopy(self.sample_path.structure.edges) + true_edges = copy.deepcopy(self._sample_path.structure.edges) true_edges = set(map(tuple, true_edges)) 'Remove all the edges from the structure' - self.sample_path.structure.clean_structure_edges() + self._sample_path.structure.clean_structure_edges() estimate_parents = self.estimate_parents @@ -208,7 +208,7 @@ class StructureScoreBasedEstimator(se.StructureEstimator): 'inizialize the graph for a single node' graph.fast_init(node_id) - params_estimation = pe.ParametersEstimator(self.sample_path, graph) + params_estimation = pe.ParametersEstimator(self._sample_path, graph) 'Inizialize and compute parameters for node' params_estimation.fast_init(node_id) diff --git a/main_package/classes/optimizers/constraint_based_optimizer.py b/main_package/classes/optimizers/constraint_based_optimizer.py index 4ee69d1..1f6f72d 100644 --- a/main_package/classes/optimizers/constraint_based_optimizer.py +++ b/main_package/classes/optimizers/constraint_based_optimizer.py @@ -51,9 +51,9 @@ class ConstraintBasedOptimizer(Optimizer): """ print("##################TESTING VAR################", self.node_id) - graph = ng.NetworkGraph(self.structure_estimator.sample_path.structure) + graph = ng.NetworkGraph(self.structure_estimator._sample_path.structure) - other_nodes = [node for node in self.structure_estimator.sample_path.structure.nodes_labels if node != self.node_id] + other_nodes = [node for node in self.structure_estimator._sample_path.structure.nodes_labels if node != self.node_id] for possible_parent in other_nodes: graph.add_edges([(possible_parent,self.node_id)]) @@ -63,7 +63,7 @@ class ConstraintBasedOptimizer(Optimizer): #tests_parents_numb = len(u) #complete_frame = self.complete_graph_frame #test_frame = complete_frame.loc[complete_frame['To'].isin([self.node_id])] - child_states_numb = self.structure_estimator.sample_path.structure.get_states_number(self.node_id) + child_states_numb = self.structure_estimator._sample_path.structure.get_states_number(self.node_id) b = 0 while b < len(u): parent_indx = 0 diff --git a/main_package/classes/optimizers/hill_climbing_search.py b/main_package/classes/optimizers/hill_climbing_search.py index e0da330..6a211c1 100644 --- a/main_package/classes/optimizers/hill_climbing_search.py +++ b/main_package/classes/optimizers/hill_climbing_search.py @@ -61,10 +61,31 @@ class HillClimbing(Optimizer): """ #'Create the graph for the single node' - graph = ng.NetworkGraph(self.structure_estimator.sample_path.structure) + graph = ng.NetworkGraph(self.structure_estimator._sample_path.structure) + 'get the index for the current node' + node_index = self.structure_estimator._sample_path._structure.get_node_indx(self.node_id) - other_nodes = [node for node in self.structure_estimator.sample_path.structure.nodes_labels if node != self.node_id] + 'list of prior edges' + prior_parents = set() + + 'Add the edges from prior knowledge' + for i in range(len(self.structure_estimator._removable_edges_matrix)): + if not self.structure_estimator._removable_edges_matrix[i][node_index]: + parent_id= self.structure_estimator._sample_path._structure.get_node_id(i) + prior_parents.add(parent_id) + + 'Add the node to the starting structure' + graph.add_edges([(parent_id, self.node_id)]) + + + + 'get all the possible parents' + other_nodes = [node for node in + self.structure_estimator._sample_path.structure.nodes_labels if + node != self.node_id and + not prior_parents.__contains__(node)] + actual_best_score = self.structure_estimator.get_score_from_graph(graph,self.node_id) patince_count = 0 diff --git a/main_package/classes/optimizers/tabu_search.py b/main_package/classes/optimizers/tabu_search.py index 9675945..0d2bf80 100644 --- a/main_package/classes/optimizers/tabu_search.py +++ b/main_package/classes/optimizers/tabu_search.py @@ -68,10 +68,30 @@ class TabuSearch(Optimizer): print(f"tabu search is processing the structure of {self.node_id}") 'Create the graph for the single node' - graph = ng.NetworkGraph(self.structure_estimator.sample_path.structure) + graph = ng.NetworkGraph(self.structure_estimator._sample_path.structure) + + 'get the index for the current node' + node_index = self.structure_estimator._sample_path._structure.get_node_indx(self.node_id) + + 'list of prior edges' + prior_parents = set() + + 'Add the edges from prior knowledge' + for i in range(len(self.structure_estimator._removable_edges_matrix)): + if not self.structure_estimator._removable_edges_matrix[i][node_index]: + parent_id= self.structure_estimator._sample_path._structure.get_node_id(i) + prior_parents.add(parent_id) + + 'Add the node to the starting structure' + graph.add_edges([(parent_id, self.node_id)]) + + 'get all the possible parents' - other_nodes = set([node for node in self.structure_estimator.sample_path.structure.nodes_labels if node != self.node_id]) + other_nodes = set([node for node in + self.structure_estimator._sample_path.structure.nodes_labels if + node != self.node_id and + not prior_parents.__contains__(node)]) 'calculate the score for the node without parents' actual_best_score = self.structure_estimator.get_score_from_graph(graph,self.node_id) diff --git a/main_package/tests/estimators/test_structure_constraint_based_estimator.py b/main_package/tests/estimators/test_structure_constraint_based_estimator.py index 9cee5cc..26d6fb2 100644 --- a/main_package/tests/estimators/test_structure_constraint_based_estimator.py +++ b/main_package/tests/estimators/test_structure_constraint_based_estimator.py @@ -10,12 +10,13 @@ import numpy as np import psutil from line_profiler import LineProfiler +import json +import pandas as pd + import utility.cache as ch import structure_graph.sample_path as sp import estimators.structure_constraint_based_estimator as se -import utility.json_importer as ji - -from multiprocessing import set_start_method +import utility.sample_importer as si import copy @@ -23,8 +24,29 @@ import copy class TestStructureConstraintBasedEstimator(unittest.TestCase): @classmethod def setUpClass(cls): - #cls.read_files = glob.glob(os.path.join('../../data', "*.json")) - cls.importer = ji.JsonImporter("../../data/networks_and_trajectories_ternary_data_15.json", 'samples', 'dyn.str', 'variables', 'Time', 'Name',1) + with open("../../data/networks_and_trajectories_ternary_data_3.json") as f: + raw_data = json.load(f) + + trajectory_list_raw= raw_data[0]["samples"] + + trajectory_list = [pd.DataFrame(sample) for sample in trajectory_list_raw] + + variables= pd.DataFrame(raw_data[0]["variables"]) + prior_net_structure = pd.DataFrame(raw_data[0]["dyn.str"]) + + + cls.importer = si.SampleImporter( + trajectory_list=trajectory_list, + variables=variables, + prior_net_structure=prior_net_structure + ) + + cls.importer.import_data() + #cls.s1 = sp.SamplePath(cls.importer) + + #cls.traj = cls.s1.concatenated_samples + + # print(len(cls.traj)) cls.s1 = sp.SamplePath(cls.importer) cls.s1.build_trajectories() cls.s1.build_structure() @@ -33,7 +55,6 @@ class TestStructureConstraintBasedEstimator(unittest.TestCase): true_edges = copy.deepcopy(self.s1.structure.edges) true_edges = set(map(tuple, true_edges)) - set_start_method("spawn") se1 = se.StructureConstraintBasedEstimator(self.s1,0.1,0.1) edges = se1.estimate_structure(disable_multiprocessing=False) diff --git a/main_package/tests/estimators/test_structure_score_based_estimator.py b/main_package/tests/estimators/test_structure_score_based_estimator.py index ab1bd42..0dedcd3 100644 --- a/main_package/tests/estimators/test_structure_score_based_estimator.py +++ b/main_package/tests/estimators/test_structure_score_based_estimator.py @@ -14,7 +14,11 @@ import copy import utility.cache as ch import structure_graph.sample_path as sp import estimators.structure_score_based_estimator as se -import utility.json_importer as ji +import utility.sample_importer as si + +import json + +import pandas as pd @@ -22,8 +26,29 @@ class TestStructureScoreBasedEstimator(unittest.TestCase): @classmethod def setUpClass(cls): - #cls.read_files = glob.glob(os.path.join('../../data', "*.json")) - cls.importer = ji.JsonImporter("../../data/networks_and_trajectories_binary_data_15.json", 'samples', 'dyn.str', 'variables', 'Time', 'Name') + with open("../../data/networks_and_trajectories_binary_data_01_6.json") as f: + raw_data = json.load(f) + + trajectory_list_raw= raw_data[0]["samples"] + + trajectory_list = [pd.DataFrame(sample) for sample in trajectory_list_raw] + + variables= pd.DataFrame(raw_data[0]["variables"]) + prior_net_structure = pd.DataFrame(raw_data[0]["dyn.str"]) + + + cls.importer = si.SampleImporter( + trajectory_list=trajectory_list, + variables=variables, + prior_net_structure=prior_net_structure + ) + + cls.importer.import_data() + #cls.s1 = sp.SamplePath(cls.importer) + + #cls.traj = cls.s1.concatenated_samples + + # print(len(cls.traj)) cls.s1 = sp.SamplePath(cls.importer) cls.s1.build_trajectories() cls.s1.build_structure() @@ -35,14 +60,14 @@ class TestStructureScoreBasedEstimator(unittest.TestCase): true_edges = set(map(tuple, true_edges)) - se1 = se.StructureScoreBasedEstimator(self.s1) + se1 = se.StructureScoreBasedEstimator(self.s1,known_edges = [('X','Q')]) edges = se1.estimate_structure( max_parents = None, iterations_number = 100, patience = 35, tabu_length = 15, tabu_rules_duration = 15, - optimizer = 'tabu', + optimizer = 'hill', disable_multiprocessing=True ) diff --git a/main_package/tests/results/results.csv b/main_package/tests/results/results.csv index f81ad13..24d6fc4 100644 --- a/main_package/tests/results/results.csv +++ b/main_package/tests/results/results.csv @@ -1 +1,14 @@ Time,Type,Variables,Density_Network,Cardinality,Index,F1,Precision,Recall + +4.19, +4.078, +4.368, +6.024, +8.198, +10.586, +10.589, +10.447, +10.516, +8.335, +11.243, +11.78, \ No newline at end of file