
Refactor AbstractImporter class to obtain better decoupling from SamplePath

parallel_struct_est
philpMartin 4 years ago
parent 8e1666a607
commit f32b79590f
  1. main_package/classes/abstract_importer.py (114 changes)
  2. main_package/classes/json_importer.py (109 changes)
  3. main_package/classes/sample_path.py (31 changes)
  4. main_package/classes/set_of_cims.py (4 changes)
  5. main_package/classes/simple_cvs_importer.py (53 changes)
  6. main_package/tests/test_json_importer.py (21 changes)

main_package/classes/abstract_importer.py

@@ -1,4 +1,6 @@
 from abc import ABC, abstractmethod
+import pandas as pd
+import typing


 class AbstractImporter(ABC):
@@ -6,13 +8,22 @@ class AbstractImporter(ABC):
     Interface that exposes all the necessary methods to import the trajectories and the net structure.
     :file_path: the file path
+    :_concatenated_samples: the concatenation and processing of all the trajectories present in df_samples_list
+    :df_structure: Dataframe containing the structure of the network (edges)
+    :df_variables: Dataframe containing the nodes cardinalities
+    :sorter: the columns header (excluding the time column) of the Dataframe concatenated_samples
     """

     def __init__(self, file_path: str):
         self.file_path = file_path
+        self._df_variables = None
+        self._df_structure = None
+        self._concatenated_samples = None
+        self._sorter = None
         super().__init__()

+    """
     @abstractmethod
     def import_trajectories(self, raw_data):
         pass
@@ -20,4 +31,105 @@ class AbstractImporter(ABC):
     @abstractmethod
     def import_structure(self, raw_data):
         pass
+    """

+    @abstractmethod
+    def import_data(self):
+        """
+        Imports and prepares all the data needed for subsequent computation.
+        Parameters:
+            void
+        Returns:
+            void
+        POSTCONDITION: the class members self._df_variables and self._df_structure HAVE to be properly constructed
+        as Pandas Dataframes
+        """
+        pass

+    def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame,
+                                              columns_header: typing.List, shifted_cols_header: typing.List) \
+            -> pd.DataFrame:
+        """
+        Computes the difference between each value present in the time column.
+        Copies and shifts up by one position all the values present in the remaining columns.
+        PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
+        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
+        Parameters:
+            sample_frame: the trajectory to be processed
+            columns_header: the original header of sample_frame
+            shifted_cols_header: a copy of columns_header with the names for the shifted columns
+        Returns:
+            sample_frame: the processed dataframe
+        """
+        sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
+        shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
+        shifted_cols.columns = shifted_cols_header
+        sample_frame = sample_frame.assign(**shifted_cols)
+        sample_frame.drop(sample_frame.tail(1).index, inplace=True)
+        return sample_frame

+    def compute_row_delta_in_all_samples_frames(self, df_samples_list: typing.List):
+        """
+        Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list df_samples_list.
+        Concatenates the results in the dataframe concatenated_samples.
+        PREREQUISITE: the Dataframes in input have to follow the column structure of this header:
+        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
+        The class member self._sorter HAS to be properly INITIALIZED.
+        Parameters:
+            df_samples_list: the list of dataframes to be processed and concatenated
+        Returns:
+            void
+        """
+        shifted_cols_header = [s + "S" for s in self._sorter]
+        compute_row_delta = self.compute_row_delta_sigle_samples_frame
+        proc_samples_list = [compute_row_delta(sample, self._sorter, shifted_cols_header)
+                             for sample in df_samples_list]
+        self._concatenated_samples = pd.concat(proc_samples_list)
+        complete_header = self._sorter[:]
+        complete_header.insert(0, 'Time')
+        complete_header.extend(shifted_cols_header)
+        self._concatenated_samples = self._concatenated_samples[complete_header]

+    def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
+        """
+        Builds a List containing the columns of the dataframe, each converted to a numpy array.
+        Parameters:
+            :data_frame: the dataframe from which the columns have to be extracted and converted
+        Returns:
+            :columns_list: the resulting list of numpy arrays
+        """
+        columns_list = [data_frame[column].to_numpy() for column in data_frame]
+        return columns_list

+    def clear_concatenated_frame(self):
+        """
+        Removes all values in the dataframe concatenated_samples.
+        Parameters:
+            void
+        Returns:
+            void
+        """
+        self._concatenated_samples = self._concatenated_samples.iloc[0:0]

+    @property
+    def concatenated_samples(self) -> pd.DataFrame:
+        return self._concatenated_samples

+    @property
+    def variables(self) -> pd.DataFrame:
+        return self._df_variables

+    @property
+    def structure(self) -> pd.DataFrame:
+        return self._df_structure

+    @property
+    def sorter(self):
+        return self._sorter
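The row-delta transformation pulled up into this class is the core of the refactor, so here is a minimal standalone sketch of what compute_row_delta_sigle_samples_frame does to a single trajectory (the two-variable frame and its values are made up for illustration):

    import pandas as pd

    # A toy trajectory: Time first, then two binary variables (made-up data).
    sample = pd.DataFrame({'Time': [0.0, 1.5, 2.0], 'X': [0, 1, 1], 'Y': [1, 1, 0]})

    # The time column becomes the delta to the next row; each variable column is
    # copied, shifted up by one row, and appended under an 'S'-suffixed name, so
    # every row pairs a state with its successor and the sojourn time between them.
    sample.iloc[:, 0] = sample.iloc[:, 0].diff().shift(-1)
    shifted = sample[['X', 'Y']].shift(-1).fillna(0).astype('int32')
    shifted.columns = ['XS', 'YS']
    sample = sample.assign(**shifted)
    sample.drop(sample.tail(1).index, inplace=True)
    print(sample)
    #    Time  X  Y  XS  YS
    # 0   1.5  0  1   1   1
    # 1   0.5  1  1   1   0

compute_row_delta_in_all_samples_frames then simply maps this over every trajectory, concatenates the results, and reorders the columns as [Time | sorter | shifted headers].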

main_package/classes/json_importer.py

@@ -23,10 +23,6 @@ class JsonImporter(ai.AbstractImporter):
     :time_key: the key used to identify the timestamps in each trajectory
     :variables_key: the key used to identify the names of the variables in the net
     :df_samples_list: a Dataframe list in which every df contains a trajectory
-    :df_structure: Dataframe containing the structure of the network (edges)
-    :df_variables: Dataframe containing the nodes cardinalities
-    :df_concatenated_samples: the concatenation and processing of all the trajectories present in the list df_samples list
-    :sorter: the columns header(excluding the time column) of the Dataframe concatenated_samples
     """

     def __init__(self, file_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
@@ -37,10 +33,6 @@ class JsonImporter(ai.AbstractImporter):
         self.time_key = time_key
         self.variables_key = variables_key
         self.df_samples_list = []
-        self._df_structure = pd.DataFrame()
-        self._df_variables = pd.DataFrame()
-        self._concatenated_samples = None
-        self.sorter = None
         super(JsonImporter, self).__init__(file_path)

     def import_data(self):
@@ -53,10 +45,10 @@ class JsonImporter(ai.AbstractImporter):
         """
         raw_data = self.read_json_file()
         self.import_trajectories(raw_data)
-        self.compute_row_delta_in_all_samples_frames(self.time_key)
+        self.compute_row_delta_in_all_samples_frames(self.df_samples_list)
         self.clear_data_frame_list()
         self.import_structure(raw_data)
-        self.import_variables(raw_data, self.sorter)
+        self.import_variables(raw_data, self._sorter)

     def import_trajectories(self, raw_data: typing.List):
         """
@@ -79,7 +71,6 @@ class JsonImporter(ai.AbstractImporter):
         """
         self._df_structure = self.one_level_normalizing(raw_data, 0, self.structure_label)

     def import_variables(self, raw_data: typing.List, sorter: typing.List):
         """
         Imports the data in raw_data at the key variables_label.
@@ -92,13 +83,13 @@ class JsonImporter(ai.AbstractImporter):
            void
         """
         self._df_variables = self.one_level_normalizing(raw_data, 0, self.variables_label)
-        #self.sorter = self._df_variables[self.variables_key].to_list()
-        #self.sorter.sort()
-        #print("Sorter:", self.sorter)
+        #TODO: given the precondition that the _df_variables frame is ordered like self._sorter, this code is useless
         self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
         self._df_variables[self.variables_key] = self._df_variables[self.variables_key].cat.set_categories(sorter)
         self._df_variables = self._df_variables.sort_values([self.variables_key])
         self._df_variables.reset_index(inplace=True)
+        self._df_variables.drop('index', axis=1, inplace=True)
         print("Var Frame", self._df_variables)

     def read_json_file(self) -> typing.List:
@@ -111,15 +102,9 @@ class JsonImporter(ai.AbstractImporter):
            data: the contents of the json file
         """
-        #try:
-            #read_files = glob.glob(os.path.join(self.files_path, "*.json"))
-            #if not read_files:
-                #raise ValueError('No .json file found in the entered path!')
         with open(self.file_path) as f:
             data = json.load(f)
             return data
-        #except ValueError as err:
-            #print(err.args)

     def one_level_normalizing(self, raw_data: typing.List, indx: int, key: str) -> pd.DataFrame:
         """
""" """
@@ -152,82 +137,15 @@ class JsonImporter(ai.AbstractImporter):
         smps = raw_data[indx][trajectories_key]
         self.df_samples_list = [dataframe(sample) for sample in smps]
         columns_header = list(self.df_samples_list[0].columns.values)
-        #print("COLUMNs HEADER", columns_header)
         columns_header.remove(self.time_key)
-        self.sorter = columns_header
+        self._sorter = columns_header

-    def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame, time_header_label: str,
-                                              columns_header: typing.List, shifted_cols_header: typing.List) \
-            -> pd.DataFrame:
-        """
-        Computes the difference between each value present in th time column.
-        Copies and shift by one position up all the values present in the remaining columns.
-        Parameters:
-            sample_frame: the traj to be processed
-            time_header_label: the label for the times
-            columns_header: the original header of sample_frame
-            shifted_cols_header: a copy of columns_header with changed names of the contents
-        Returns:
-            sample_frame: the processed dataframe
-        """
-        sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
-        shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
-        #print(shifted_cols)
-        shifted_cols.columns = shifted_cols_header
-        sample_frame = sample_frame.assign(**shifted_cols)
-        sample_frame.drop(sample_frame.tail(1).index, inplace=True)
-        return sample_frame

-    def compute_row_delta_in_all_samples_frames(self, time_header_label: str):
-        """
-        Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list self.df_samples_list.
-        Concatenates the result in the dataframe concatanated_samples
-        Parameters:
-            time_header_label: the label of the time column
-        Returns:
-            void
-        """
-        shifted_cols_header = [s + "S" for s in self.sorter]
-        compute_row_delta = self.compute_row_delta_sigle_samples_frame
-        self.df_samples_list = [compute_row_delta(sample, time_header_label, self.sorter, shifted_cols_header)
-                                for sample in self.df_samples_list]
-        self._concatenated_samples = pd.concat(self.df_samples_list)
-        complete_header = self.sorter[:]
-        complete_header.insert(0,'Time')
-        complete_header.extend(shifted_cols_header)
-        #print("Complete Header", complete_header)
-        self._concatenated_samples = self._concatenated_samples[complete_header]
-        #print("Concat Samples",self._concatenated_samples)

-    def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
-        """
-        Builds a List containing the columns of dataframe and converts them to a numpy array.
-        Parameters:
-            :data_frame: the dataframe from which the columns have to be extracted and converted
-        Returns:
-            :columns_list: the resulting list of numpy arrays
-        """
-        columns_list = [data_frame[column].to_numpy() for column in data_frame]
-        #for column in data_frame:
-            #columns_list.append(data_frame[column].to_numpy())
-        return columns_list

-    def clear_concatenated_frame(self):
-        """
-        Removes all values in the dataframe concatenated_samples
-        Parameters:
-            void
-        Returns:
-            void
-        """
-        self._concatenated_samples = self._concatenated_samples.iloc[0:0]

     def clear_data_frame_list(self):
         """
         Removes all values present in the dataframes in the list df_samples_list
         """
-        for indx in range(len(self.df_samples_list)):  # the individual trajectories are no longer needed  #TODO: use a list comprehension
+        for indx in range(len(self.df_samples_list)):
             self.df_samples_list[indx] = self.df_samples_list[indx].iloc[0:0]

     def import_sampled_cims(self, raw_data: typing.List, indx: int, cims_key: str) -> typing.Dict:
@@ -239,17 +157,6 @@ class JsonImporter(ai.AbstractImporter):
                 cims_for_all_vars[var].append(pd.DataFrame(raw_data[indx][cims_key][var][p_comb]).to_numpy())
         return cims_for_all_vars

-    @property
-    def concatenated_samples(self):
-        return self._concatenated_samples

-    @property
-    def variables(self):
-        return self._df_variables

-    @property
-    def structure(self):
-        return self._df_structure
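One detail of import_variables worth spelling out: casting the name column to a pandas categorical whose categories are the sorter list makes sort_values reorder the variables frame to match the column order of the trajectories. A standalone sketch (the frame contents are illustrative, not taken from the real data):

    import pandas as pd

    sorter = ['X', 'Y', 'Z']  # dataset column order, time column excluded
    df_variables = pd.DataFrame({'Name': ['Z', 'X', 'Y'], 'Value': [3, 3, 3]})

    # With the categories set to sorter, sort_values follows the dataset
    # column order instead of alphabetical order.
    df_variables['Name'] = df_variables['Name'].astype('category')
    df_variables['Name'] = df_variables['Name'].cat.set_categories(sorter)
    df_variables = df_variables.sort_values(['Name'])
    df_variables.reset_index(inplace=True)
    df_variables.drop('index', axis=1, inplace=True)  # reset_index leaves an 'index' column behind
    print(df_variables['Name'].to_list())  # ['X', 'Y', 'Z']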

main_package/classes/sample_path.py

@@ -1,29 +1,28 @@
-import abstract_sample_path as asam
-import json_importer as imp
+import abstract_importer as imp
 import structure as st
 import trajectory as tr


-class SamplePath(asam.AbstractSamplePath):
+class SamplePath:
     """
     Aggregates all the information about the trajectories, the real structure of the sampled net and the variables
     cardinalities.
     Has the task of creating the objects that will contain the mentioned data.
-    :importer: the Importer objects that will import ad process data
     :trajectories: the Trajectory object that will contain all the concatenated trajectories
     :structure: the Structure object that will contain all the structural info about the net
     :total_variables_count: the number of variables in the net
     """

-    #def __init__(self, files_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
-                 #variables_key: str):
-    def __init__(self, importer: imp.JsonImporter):
-        #self.importer =importer
-        super().__init__(importer)
-        #self._trajectories = None
-        #self._structure = None
+    def __init__(self, importer: imp.AbstractImporter):
+        """
+        :importer: the Importer object that will import and process data
+        """
+        self.importer = importer
+        self._trajectories = None
+        self._structure = None
         self.total_variables_count = None

     def build_trajectories(self):
@@ -51,11 +50,15 @@ class SamplePath:
        Returns:
            void
        """
+        if self.importer.sorter != self.importer.variables.iloc[:, 0].to_list():
+            raise RuntimeError("The order of the Dataset columns has to match the order of the labels in the variables Frame!")
         self.total_variables_count = len(self.importer.sorter)
-        labels = self.importer.variables[self.importer.variables_key].to_list()
+        labels = self.importer.variables.iloc[:, 0].to_list()
         indxs = self.importer.variables.index.to_numpy()
-        vals = self.importer.variables['Value'].to_numpy()
+        vals = self.importer.variables.iloc[:, 1].to_numpy()
         edges = list(self.importer.structure.to_records(index=False))
         self._structure = st.Structure(labels, indxs, vals, edges,
                                        self.total_variables_count)
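With the RuntimeError guard and the positional iloc accesses, build_structure now relies only on the AbstractImporter contract (sorter, variables, structure) rather than on JsonImporter internals, so any importer subclass can drive a SamplePath. A usage sketch, assuming the data directory layout used by the test suite:

    import glob
    import os
    import json_importer as ji
    import sample_path as sp

    # locate the input file the same way the tests do ('../data' is an assumption)
    read_files = glob.glob(os.path.join('../data', "*.json"))
    importer = ji.JsonImporter(read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
    s1 = sp.SamplePath(importer)  # any AbstractImporter subclass fits here
    s1.build_trajectories()
    s1.build_structure()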

main_package/classes/set_of_cims.py

@@ -79,8 +79,8 @@ class SetOfCims:
         if mask_arr.size <= 1:
             return self.actual_cims
         else:
-            tmp_parents_comb_from_ids = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
-            return self.actual_cims[tmp_parents_comb_from_ids]
+            flat_indxs = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
+            return self.actual_cims[flat_indxs]

     @property
     def get_cims(self):
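The rename to flat_indxs also makes the filtering logic easier to read: the masked columns of p_combs are compared row-wise against comb, and the flattened argwhere result indexes the matching conditional intensity matrices. A small numpy sketch with made-up parent combinations:

    import numpy as np

    # Made-up combinations for two parents, each with cardinality 2.
    p_combs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    mask_arr = np.array([1])  # look only at the second parent's column
    comb = np.array([1])      # and require its value to be 1

    # Rows 1 and 3 match, so these positions would select the CIMs in actual_cims.
    flat_indxs = np.argwhere(np.all(p_combs[:, mask_arr] == comb, axis=1)).ravel()
    print(flat_indxs)  # [1 3]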

main_package/classes/simple_cvs_importer.py

@@ -0,0 +1,53 @@
+import pandas as pd
+import glob
+import os
+
+import abstract_importer as ai
+import sample_path as sp
+
+
+class CSVImporter(ai.AbstractImporter):
+
+    def __init__(self, file_path):
+        self._df_samples_list = None
+        super(CSVImporter, self).__init__(file_path)
+
+    def import_data(self):
+        self.read_csv_file()
+        self.import_variables()
+        self.import_structure()
+        self.compute_row_delta_in_all_samples_frames(self._df_samples_list)
+
+    def read_csv_file(self):
+        df = pd.read_csv(self.file_path)
+        df.drop(df.columns[[0]], axis=1, inplace=True)
+        self._df_samples_list = [df]
+
+    def import_variables(self):
+        variables_list = list(self._df_samples_list[0].columns)[1:]
+        self._sorter = variables_list
+        # every variable is assumed to have cardinality 3
+        values_list = [3 for var in variables_list]
+        # build the variables frame expected by SamplePath
+        data = {'Name': variables_list, 'Value': values_list}
+        self._df_variables = pd.DataFrame(data)
+
+    def import_structure(self):
+        data = {'From': ['X', 'Y', 'Z'], 'To': ['Z', 'Z', 'Y']}
+        self._df_structure = pd.DataFrame(data)
+
+
+read_files = glob.glob(os.path.join('../data', "*.csv"))
+print(read_files[0])
+csvimp = CSVImporter(read_files[0])
+#csvimp.import_data()
+s1 = sp.SamplePath(csvimp)
+s1.build_trajectories()
+s1.build_structure()
+print(s1.structure)
+print(s1.trajectories)

main_package/tests/test_json_importer.py

@@ -6,11 +6,9 @@ import glob
 import numpy as np
 import pandas as pd
 import json_importer as ji
 import json


 class TestJsonImporter(unittest.TestCase):
     @classmethod
@@ -26,10 +24,10 @@ class TestJsonImporter(unittest.TestCase):
         self.assertEqual(j1.variables_key, 'Name')
         self.assertEqual(j1.file_path, self.read_files[0])
         self.assertFalse(j1.df_samples_list)
-        self.assertTrue(j1.variables.empty)
-        self.assertTrue(j1.structure.empty)
-        self.assertFalse(j1.concatenated_samples)
-        self.assertFalse(j1.sorter)
+        self.assertIsNone(j1.variables)
+        self.assertIsNone(j1.structure)
+        self.assertIsNone(j1.concatenated_samples)
+        self.assertIsNone(j1.sorter)

     def test_read_json_file_found(self):
         data_set = {"key1": [1, 2, 3], "key2": [4, 5, 6]}
@@ -73,7 +71,7 @@ class TestJsonImporter(unittest.TestCase):
         sample_frame = j1.df_samples_list[0]
         columns_header = list(sample_frame.columns.values)
         shifted_cols_header = [s + "S" for s in columns_header[1:]]
-        new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, j1.time_key, columns_header[1:],
+        new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, columns_header[1:],
                                                                     shifted_cols_header)
         self.assertEqual(len(list(sample_frame.columns.values)) + len(shifted_cols_header),
                          len(list(new_sample_frame.columns.values)))
@@ -83,15 +81,16 @@ class TestJsonImporter(unittest.TestCase):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         j1.import_trajectories(raw_data)
-        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
-        self.assertEqual(list(j1.df_samples_list[0].columns.values), list(j1.concatenated_samples.columns.values))
+        j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
+        self.assertEqual(list(j1.df_samples_list[0].columns.values),
+                         list(j1.concatenated_samples.columns.values)[:len(list(j1.df_samples_list[0].columns.values))])
         self.assertEqual(list(j1.concatenated_samples.columns.values)[0], j1.time_key)

     def test_clear_data_frame_list(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         j1.import_trajectories(raw_data)
-        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
+        j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
         j1.clear_data_frame_list()
         for df in j1.df_samples_list:
             self.assertTrue(df.empty)
@@ -137,7 +136,7 @@ class TestJsonImporter(unittest.TestCase):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         cims = j1.import_sampled_cims(raw_data, 0, 'dyn.cims')
-        j1.import_variables(raw_data, ['X','Y','Z'])
+        j1.import_variables(raw_data, ['X','Y','Z'])  #TODO: this must not depend directly on this sorter
         self.assertEqual(list(cims.keys()), j1.variables['Name'].tolist())

     def test_import_data(self):