
Refactors on JSONImporter class and tests

Branch: parallel_struct_est
philpMartin committed 4 years ago
parent f32b79590f
commit 59cda0d3d9
Changed files:
  1. main_package/classes/abstract_importer.py (68)
  2. main_package/classes/json_importer.py (110)
  3. main_package/classes/sample_path.py (21)
  4. main_package/tests/test_json_importer.py (41)
  5. main_package/tests/test_sample_path.py (16)

main_package/classes/abstract_importer.py

@@ -5,17 +5,22 @@ import typing

 class AbstractImporter(ABC):
     """
-    Interface that exposes all the necessary methods to import the trajectories and the net structure.
+    Abstract class that exposes all the necessary methods to process the trajectories and the net structure.
     :file_path: the file path
     :_concatenated_samples: the concatenation of all the processed trajectories
     :df_structure: Dataframe containing the structure of the network (edges)
     :df_variables: Dataframe containing the nodes cardinalities
-    :df_concatenated_samples: the concatenation and processing of all the trajectories present in the list df_samples list
+    :df_concatenated_samples: the concatenation and processing of all the trajectories present
+        in the df_samples list
     :sorter: the columns header (excluding the time column) of the Dataframe concatenated_samples
     """

     def __init__(self, file_path: str):
+        """
+        Parameters:
+            :file_path: the path to the file containing the data
+        """
         self.file_path = file_path
         self._df_variables = None
         self._df_structure = None
@@ -23,26 +28,30 @@ class AbstractImporter(ABC):
         self._sorter = None
         super().__init__()

     @abstractmethod
-    def import_trajectories(self, raw_data):
-        pass
-
-    @abstractmethod
-    def import_structure(self, raw_data):
-        pass
-
-    @abstractmethod
     def import_data(self):
         """
-        Imports and prepares all data present needed for susequent computation.
+        Imports and prepares all data needed for subsequent computation.
         Parameters:
-            void
+            :void
         Returns:
-            void
-        POSTCONDITION: the class members self._df_variables and self._df_structure HAVE to be properly constructed
-        as Pandas Dataframes
+            :void
+        post[self]: the class members self._df_variables and self._df_structure HAVE to be properly constructed
+            as Pandas Dataframes with the following structure:
+            Header of self._df_structure = [From_Node | To_Node]
+            Header of self._df_variables = [Variable_Label | Variable_Cardinality]
         """
         pass
+
+    @abstractmethod
+    def build_sorter(self, sample_frame: pd.DataFrame) -> typing.List:
+        """
+        Initializes the self._sorter class member from a trajectory dataframe, extracting the header of the frame
+        and keeping ONLY the variables' symbolic labels, cutting out the time label in the header.
+        Parameters:
+            :sample_frame: the dataframe from which to extract the header
+        Returns:
+            :a list containing the processed header
+        """
+        pass
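
With import_trajectories and import_structure dropped from the interface, a concrete importer now only implements import_data and build_sorter. A minimal sketch of a hypothetical subclass (the CsvImporter name and the single-CSV layout are illustrative, not part of this commit):

import typing
import pandas as pd
import abstract_importer as ai

class CsvImporter(ai.AbstractImporter):
    """Hypothetical importer for one CSV trajectory whose FIRST column is the time column."""
    def import_data(self):
        sample = pd.read_csv(self.file_path)
        self._sorter = self.build_sorter(sample)
        self.compute_row_delta_in_all_samples_frames([sample])
        # self._df_structure ([From_Node | To_Node]) and self._df_variables
        # ([Variable_Label | Variable_Cardinality]) would be built here to meet
        # the post-condition stated in import_data's docstring.

    def build_sorter(self, sample_frame: pd.DataFrame) -> typing.List:
        # Keep only the variables' symbolic labels, cutting out the time label (assumed first).
        return list(sample_frame.columns.values)[1:]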
@@ -52,16 +61,15 @@ class AbstractImporter(ABC):
         """
         Computes the difference between each value present in the time column.
         Copies and shifts up by one position all the values present in the remaining columns.
-        PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
-        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
         Parameters:
-            sample_frame: the trajectory to be processed
-            time_header_label: the label for the times
-            columns_header: the original header of sample_frame
-            shifted_cols_header: a copy of columns_header with changed names of the contents
+            :sample_frame: the trajectory to be processed
+            :time_header_label: the label for the times
+            :columns_header: the original header of sample_frame
+            :shifted_cols_header: a copy of columns_header with changed names of the contents
         Returns:
-            sample_frame: the processed dataframe
+            :sample_frame: the processed dataframe
+        pre: the Dataframe sample_frame has to follow the column structure of this header:
+            Header of sample_frame = [Time | Variable values]
         """
         #sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
         sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
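
For intuition, the time-column transformation above can be reproduced on a toy frame; after it, each row holds the delta to the next timestamp (toy data, not from the repository):

import pandas as pd

df = pd.DataFrame({'Time': [0.0, 1.5, 4.0], 'X': [0, 1, 1]})
# diff() computes row[i] - row[i-1]; shift(-1) moves the result up one row.
df.iloc[:, 0] = df.iloc[:, 0].diff().shift(-1)
print(df['Time'].tolist())  # [1.5, 2.5, nan]; the trailing NaN row is dropped later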
@@ -75,16 +83,18 @@ class AbstractImporter(ABC):
         """
         Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list df_samples_list.
         Concatenates the result in the dataframe concatenated_samples.
-        PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
-        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
-        The class member self._sorter HAS to be properly INITIALIZED
         Parameters:
             time_header_label: the label of the time column
             df_samples_list: the dataframes' list to be processed and concatenated
         Returns:
             void
+        pre: the Dataframe sample_frame has to follow the column structure of this header:
+            Header of sample_frame = [Time | Variable values]
+            The class member self._sorter HAS to be properly INITIALIZED (see class members definition doc)
         """
+        if not self.sorter:
+            raise RuntimeError("The class member self._sorter has to be INITIALIZED!")
         shifted_cols_header = [s + "S" for s in self._sorter]
         compute_row_delta = self.compute_row_delta_sigle_samples_frame
         proc_samples_list = [compute_row_delta(sample, self._sorter, shifted_cols_header)
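
The new guard turns a silent misuse into an early failure. A usage sketch with the JsonImporter below (the 'data.json' path is hypothetical):

import json_importer as ji

j1 = ji.JsonImporter('data.json', 'samples', 'dyn.str', 'variables', 'Time', 'Name')
raw_data = j1.read_json_file()
frames = j1.import_trajectories(raw_data)
try:
    j1.compute_row_delta_in_all_samples_frames(frames)  # self._sorter still None
except RuntimeError as err:
    print(err)  # "The class member self._sorter has to be INITIALIZED!"
j1._sorter = j1.build_sorter(frames[0])
j1.compute_row_delta_in_all_samples_frames(frames)      # now succeeds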
@@ -112,9 +122,9 @@ class AbstractImporter(ABC):
         """
         Removes all values in the dataframe concatenated_samples
         Parameters:
-            void
+            :void
         Returns:
-            void
+            :void
         """
         self._concatenated_samples = self._concatenated_samples.iloc[0:0]
@@ -131,5 +141,5 @@ class AbstractImporter(ABC):
         return self._df_structure

     @property
-    def sorter(self):
+    def sorter(self) -> typing.List:
         return self._sorter

main_package/classes/json_importer.py

@@ -1,7 +1,6 @@
 import json
 import typing
 import pandas as pd
 import abstract_importer as ai
@@ -9,9 +8,9 @@ import abstract_importer as ai

 class JsonImporter(ai.AbstractImporter):
     """
-    Implements the Interface AbstractImporter and adds all the necessary methods to process and prepare the data in json format,
+    Implements the abstract methods of AbstractImporter and adds all the necessary methods to process and prepare the data in json format,
     with the following structure:
-    [] 0
+    [0]
     |_ dyn.cims
     |_ dyn.str
     |_ samples
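
A minimal input document matching that layout might look like the following Python literal (the dyn.str and variables shapes follow the tests in this commit; the dyn.cims and samples contents are illustrative assumptions):

raw_data = [
    {
        "dyn.cims": {"X": [], "Y": []},  # assumed shape: one entry per variable
        "dyn.str": [{"From": "X", "To": "Y"}],
        "samples": [[{"Time": 0.0, "X": 0, "Y": 1}, {"Time": 1.3, "X": 1, "Y": 1}]],
        "variables": [{"Name": "X", "value": 2}, {"Name": "Y", "value": 2}],
    }
]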
@@ -27,28 +26,38 @@ class JsonImporter(ai.AbstractImporter):
     def __init__(self, file_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
                  variables_key: str):
+        """
+        Parameters:
+            :file_path: the path of the file that contains the data to be imported
+            :samples_label: the reference key for the samples in the trajectories
+            :structure_label: the reference key for the structure of the network data
+            :variables_label: the reference key for the cardinalities of the nodes data
+            :time_key: the key used to identify the timestamps in each trajectory
+            :variables_key: the key used to identify the names of the variables in the net
+        """
         self.samples_label = samples_label
         self.structure_label = structure_label
         self.variables_label = variables_label
         self.time_key = time_key
         self.variables_key = variables_key
-        self.df_samples_list = []
+        self.df_samples_list = None
         super(JsonImporter, self).__init__(file_path)

     def import_data(self):
         """
-        Imports and prepares all data present needed for susequent computation.
+        Imports and prepares all data needed for subsequent processing.
         Parameters:
-            void
+            :void
         Returns:
-            void
+            :void
         """
         raw_data = self.read_json_file()
-        self.import_trajectories(raw_data)
+        self.df_samples_list = self.import_trajectories(raw_data)
+        self._sorter = self.build_sorter(self.df_samples_list[0])
         self.compute_row_delta_in_all_samples_frames(self.df_samples_list)
         self.clear_data_frame_list()
-        self.import_structure(raw_data)
-        self.import_variables(raw_data, self._sorter)
+        self._df_structure = self.import_structure(raw_data)
+        self._df_variables = self.import_variables(raw_data, self._sorter)

     def import_trajectories(self, raw_data: typing.List):
         """
@@ -56,50 +65,51 @@ class JsonImporter(ai.AbstractImporter):
         Parameters:
             :raw_data: List of Dicts
         Returns:
-            void
+            :List of dataframes containing all the trajectories
         """
-        self.normalize_trajectories(raw_data, 0, self.samples_label)
+        return self.normalize_trajectories(raw_data, 0, self.samples_label)

-    def import_structure(self, raw_data: typing.List):
+    def import_structure(self, raw_data: typing.List) -> pd.DataFrame:
         """
         Imports in a dataframe the data in the list raw_data at the key structure_label
         Parameters:
-            raw_data: the data
+            :raw_data: the data
         Returns:
-            void
+            :Dataframe containing the starting node and the ending node of every arc of the network
         """
-        self._df_structure = self.one_level_normalizing(raw_data, 0, self.structure_label)
+        return self.one_level_normalizing(raw_data, 0, self.structure_label)

-    def import_variables(self, raw_data: typing.List, sorter: typing.List):
+    def import_variables(self, raw_data: typing.List, sorter: typing.List) -> pd.DataFrame:
         """
         Imports the data in raw_data at the key variables_label.
         Sorts the rows of the dataframe df_variables using the list sorter.
         Parameters:
-            raw_data: the data
-            sorter: the list used to sort the dataframe self.df_variables
+            :raw_data: the data
+            :sorter: the header of the dataset containing only the variables' symbolic labels
         Returns:
-            void
+            :Dataframe containing the variables' symbolic labels and their cardinalities
         """
-        self._df_variables = self.one_level_normalizing(raw_data, 0, self.variables_label)
+        return self.one_level_normalizing(raw_data, 0, self.variables_label)
         #TODO using as a precondition that the ordering of the frame _df_variables
         #TODO matches the one in self._sorter, this code is useless
-        self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
+        """self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
         self._df_variables[self.variables_key] = self._df_variables[self.variables_key].cat.set_categories(sorter)
         self._df_variables = self._df_variables.sort_values([self.variables_key])
         self._df_variables.reset_index(inplace=True)
         self._df_variables.drop('index', axis=1, inplace=True)
-        print("Var Frame", self._df_variables)
+        #print("Var Frame", self._df_variables)
+        """

     def read_json_file(self) -> typing.List:
         """
-        Reads the first json file in the path self.filePath
+        Reads the JSON file in the path self.file_path
         Parameters:
-            void
+            :void
         Returns:
-            data: the contents of the json file
+            :data: the contents of the json file
         """
         with open(self.file_path) as f:
@@ -111,11 +121,11 @@ class JsonImporter(ai.AbstractImporter):
         Extracts the one-level nested data in the list raw_data at the index indx at the key key
         Parameters:
-            raw_data: List of Dicts
-            indx: the index of the array from which the data have to be extracted
-            key: the key for the Dicts from which to extract data
+            :raw_data: List of Dicts
+            :indx: the index of the array from which the data have to be extracted
+            :key: the key for the Dicts from which to extract data
         Returns:
-            a normalized dataframe
+            :a normalized dataframe
         """
         return pd.DataFrame(raw_data[indx][key])
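
one_level_normalizing is essentially the pandas constructor applied to one nested entry. Reusing the raw_data shape from test_import_variables:

import pandas as pd

raw_data = [{'variables': {"Name": ['X', 'Y', 'Z'], "value": [3, 3, 3]}}]
df = pd.DataFrame(raw_data[0]['variables'])  # same as one_level_normalizing(raw_data, 0, 'variables')
print(df)
#   Name  value
# 0    X      3
# 1    Y      3
# 2    Z      3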
@@ -123,32 +133,52 @@ class JsonImporter(ai.AbstractImporter):
     def normalize_trajectories(self, raw_data: typing.List, indx: int, trajectories_key: str):
         """
         Extracts the trajectories in raw_data at the index indx at the key trajectories_key.
-        Adds the extracted trajectories to the dataframe list self._df_samples_list.
-        Initializes the list self.sorter.
         Parameters:
-            raw_data: the data
-            indx: the index of the array from which to extract data
-            trajectories_key: the key of the trajectories objects
+            :raw_data: the data
+            :indx: the index of the array from which to extract data
+            :trajectories_key: the key of the trajectories objects
         Returns:
-            void
+            :a list of dataframes containing the trajectories
         """
         dataframe = pd.DataFrame
         smps = raw_data[indx][trajectories_key]
-        self.df_samples_list = [dataframe(sample) for sample in smps]
-        columns_header = list(self.df_samples_list[0].columns.values)
-        #print("COLUMNs HEADER", columns_header)
-        columns_header.remove(self.time_key)
-        self._sorter = columns_header
+        df_samples_list = [dataframe(sample) for sample in smps]
+        return df_samples_list
+        #columns_header = list(self.df_samples_list[0].columns.values)
+        #columns_header.remove(self.time_key)
+        #self._sorter = columns_header

+    def build_sorter(self, sample_frame: pd.DataFrame) -> typing.List:
+        """
+        Implements the abstract method build_sorter for this dataset.
+        """
+        columns_header = list(sample_frame.columns.values)
+        columns_header.remove(self.time_key)
+        return columns_header

     def clear_data_frame_list(self):
         """
         Removes all values present in the dataframes in the list df_samples_list
+        Parameters:
+            :void
+        Returns:
+            :void
         """
         for indx in range(len(self.df_samples_list)):
             self.df_samples_list[indx] = self.df_samples_list[indx].iloc[0:0]

     def import_sampled_cims(self, raw_data: typing.List, indx: int, cims_key: str) -> typing.Dict:
+        """
+        Imports the synthetic CIMs in the dataset into a dictionary, using the variables' labels
+        as keys for the set of CIMs of a particular node.
+        Parameters:
+            :raw_data: the data
+            :indx: the json array index
+            :cims_key: the key where the json object cims are placed
+        Returns:
+            :a dictionary containing the sampled CIMs for all the variables in the net
+        """
         cims_for_all_vars = {}
         for var in raw_data[indx][cims_key]:
             sampled_cims_list = []
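
build_sorter only strips the time label, so the sorter preserves the variables' column order of the trajectories; SamplePath (below) later requires that same order in the variables frame. A toy illustration of the extraction:

import pandas as pd

frame = pd.DataFrame({'Time': [0.0, 0.7], 'X': [0, 1], 'Y': [1, 1]})
columns_header = list(frame.columns.values)  # ['Time', 'X', 'Y']
columns_header.remove('Time')                # 'Time' plays the role of time_key here
print(columns_header)                        # ['X', 'Y'] -> becomes self._sorter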

main_package/classes/sample_path.py

@@ -10,7 +10,7 @@ class SamplePath:
     cardinalities.
     Has the task of creating the objects that will contain the mentioned data.
+    :importer: the Importer object that will import and process data
     :trajectories: the Trajectory object that will contain all the concatenated trajectories
     :structure: the Structure object that will contain all the structural info about the net
     :total_variables_count: the number of variables in the net
@ -18,12 +18,14 @@ class SamplePath:
""" """
def __init__(self, importer: imp.AbstractImporter): def __init__(self, importer: imp.AbstractImporter):
""" """
:importer: the Importer objects that will import ad process data Parameters:
:importer: the Importer objects that will import ad process data
""" """
self.importer = importer self.importer = importer
self._trajectories = None self._trajectories = None
self._structure = None self._structure = None
self.total_variables_count = None self.total_variables_count = None
self.importer.import_data()
def build_trajectories(self): def build_trajectories(self):
""" """
@@ -31,11 +33,11 @@ class SamplePath:
         Clears all the unused dataframes in the Importer object.
         Parameters:
-            void
+            :void
         Returns:
-            void
+            :void
         """
-        self.importer.import_data()
+        #self.importer.import_data()
         self._trajectories = \
             tr.Trajectory(self.importer.build_list_of_samples_array(self.importer.concatenated_samples),
                           len(self.importer.sorter) + 1)
@ -46,12 +48,13 @@ class SamplePath:
""" """
Builds the Structure object that aggregates all the infos about the net. Builds the Structure object that aggregates all the infos about the net.
Parameters: Parameters:
void :void
Returns: Returns:
void :void
""" """
if self.importer.sorter != self.importer.variables.iloc[:, 0].to_list(): if self.importer.sorter != self.importer.variables.iloc[:, 0].to_list():
raise RuntimeError("The Dataset columns order have to match the order of labels in the variables Frame!") raise RuntimeError("The Dataset columns order have to match the order of labels in the variables Frame!")
self.total_variables_count = len(self.importer.sorter) self.total_variables_count = len(self.importer.sorter)
#labels = self.importer.variables[self.importer.variables_key].to_list() #labels = self.importer.variables[self.importer.variables_key].to_list()
#print("SAMPLE PATH LABELS",labels) #print("SAMPLE PATH LABELS",labels)
@ -64,11 +67,11 @@ class SamplePath:
self.total_variables_count) self.total_variables_count)
@property @property
def trajectories(self): def trajectories(self) -> tr.Trajectory:
return self._trajectories return self._trajectories
@property @property
def structure(self): def structure(self) -> st.Structure:
return self._structure return self._structure
def total_variables_count(self): def total_variables_count(self):
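
Since SamplePath now triggers import_data in its constructor, client code no longer calls it explicitly. A sketch (the 'net.json' path is hypothetical):

import json_importer as ji
import sample_path as sp

importer = ji.JsonImporter('net.json', 'samples', 'dyn.str', 'variables', 'Time', 'Name')
s1 = sp.SamplePath(importer)  # importer.import_data() runs here
s1.build_trajectories()       # wraps the concatenated samples in a Trajectory
s1.build_structure()          # checks column order, then builds the Structure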

main_package/tests/test_json_importer.py

@@ -23,7 +23,7 @@ class TestJsonImporter(unittest.TestCase):
         self.assertEqual(j1.time_key, 'Time')
         self.assertEqual(j1.variables_key, 'Name')
         self.assertEqual(j1.file_path, self.read_files[0])
-        self.assertFalse(j1.df_samples_list)
+        self.assertIsNone(j1.df_samples_list)
         self.assertIsNone(j1.variables)
         self.assertIsNone(j1.structure)
         self.assertIsNone(j1.concatenated_samples)
@@ -50,9 +50,9 @@ class TestJsonImporter(unittest.TestCase):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         #print(raw_data)
-        j1.normalize_trajectories(raw_data, 0, j1.samples_label)
-        self.assertEqual(len(j1.df_samples_list), len(raw_data[0][j1.samples_label]))
-        self.assertEqual(list(j1.df_samples_list[0].columns.values)[1:], j1.sorter)
+        df_samples_list = j1.normalize_trajectories(raw_data, 0, j1.samples_label)
+        self.assertEqual(len(df_samples_list), len(raw_data[0][j1.samples_label]))
+        #self.assertEqual(list(j1.df_samples_list[0].columns.values)[1:], j1.sorter)

     def test_normalize_trajectories_wrong_indx(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
@@ -67,8 +67,9 @@ class TestJsonImporter(unittest.TestCase):
     def test_compute_row_delta_single_samples_frame(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
-        j1.normalize_trajectories(raw_data, 0, j1.samples_label)
+        j1.df_samples_list = j1.import_trajectories(raw_data)
         sample_frame = j1.df_samples_list[0]
+        original_copy = sample_frame.copy()
         columns_header = list(sample_frame.columns.values)
         shifted_cols_header = [s + "S" for s in columns_header[1:]]
         new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, columns_header[1:],

@@ -76,11 +77,20 @@ class TestJsonImporter(unittest.TestCase):
         self.assertEqual(len(list(sample_frame.columns.values)) + len(shifted_cols_header),
                          len(list(new_sample_frame.columns.values)))
         self.assertEqual(sample_frame.shape[0] - 1, new_sample_frame.shape[0])
+        for indx, row in new_sample_frame.iterrows():
+            self.assertAlmostEqual(row['Time'],
+                                   original_copy.iloc[indx + 1]['Time'] - original_copy.iloc[indx]['Time'])
+        for indx, row in new_sample_frame.iterrows():
+            self.assertTrue(np.array_equal(np.array(row[columns_header[1:]], dtype=int),
+                                           np.array(original_copy.iloc[indx][columns_header[1:]], dtype=int)))
+            self.assertTrue(np.array_equal(np.array(row[shifted_cols_header], dtype=int),
+                                           np.array(original_copy.iloc[indx + 1][columns_header[1:]], dtype=int)))

     def test_compute_row_delta_in_all_frames(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
-        j1.import_trajectories(raw_data)
+        j1.df_samples_list = j1.import_trajectories(raw_data)
+        j1._sorter = j1.build_sorter(j1.df_samples_list[0])
         j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
         self.assertEqual(list(j1.df_samples_list[0].columns.values),
                          list(j1.concatenated_samples.columns.values)[:len(list(j1.df_samples_list[0].columns.values))])
@@ -89,7 +99,8 @@ class TestJsonImporter(unittest.TestCase):
     def test_clear_data_frame_list(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
-        j1.import_trajectories(raw_data)
+        j1.df_samples_list = j1.import_trajectories(raw_data)
+        j1._sorter = j1.build_sorter(j1.df_samples_list[0])
         j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
         j1.clear_data_frame_list()
         for df in j1.df_samples_list:
@@ -121,23 +132,25 @@ class TestJsonImporter(unittest.TestCase):
     def test_import_variables(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         sorter = ['X', 'Y', 'Z']
-        raw_data = [{'variables':{"Name": ['Z', 'Y', 'X'], "value": [3, 3, 3]}}]
-        j1.import_variables(raw_data, sorter)
-        self.assertEqual(list(j1.variables[j1.variables_key]), sorter)
+        raw_data = [{'variables':{"Name": ['X', 'Y', 'Z'], "value": [3, 3, 3]}}]
+        df_var = j1.import_variables(raw_data, sorter)
+        self.assertEqual(list(df_var[j1.variables_key]), sorter)

     def test_import_structure(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = [{"dyn.str":[{"From":"X","To":"Z"},{"From":"Y","To":"Z"},{"From":"Z","To":"Y"}]}]
-        j1.import_structure(raw_data)
+        df_struct = j1.import_structure(raw_data)
         #print(raw_data[0]['dyn.str'][0].items())
-        self.assertIsInstance(j1.structure, pd.DataFrame)
+        self.assertIsInstance(df_struct, pd.DataFrame)

     def test_import_sampled_cims(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
+        j1.df_samples_list = j1.import_trajectories(raw_data)
+        j1._sorter = j1.build_sorter(j1.df_samples_list[0])
         cims = j1.import_sampled_cims(raw_data, 0, 'dyn.cims')
-        j1.import_variables(raw_data, ['X','Y','Z']) #TODO this cannot depend directly on this sorter
-        self.assertEqual(list(cims.keys()), j1.variables['Name'].tolist())
+        #j1.import_variables(raw_data, j1.sorter)
+        self.assertEqual(list(cims.keys()), j1.sorter)

     def test_import_data(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')

main_package/tests/test_sample_path.py

@@ -17,18 +17,22 @@ class TestSamplePath(unittest.TestCase):
         cls.importer = ji.JsonImporter(cls.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')

     def test_init(self):
+        s1 = sp.SamplePath(self.importer)
+        self.assertIsNone(s1.trajectories)
+        self.assertIsNone(s1.structure)
+        self.assertFalse(s1.importer.concatenated_samples.empty)
+        self.assertIsNone(s1.total_variables_count)
+
+    def test_build_trajectories(self):
         s1 = sp.SamplePath(self.importer)
         s1.build_trajectories()
+        self.assertIsNotNone(s1.trajectories)
         self.assertIsInstance(s1.trajectories, tr.Trajectory)
+
+    def test_build_structure(self):
+        s1 = sp.SamplePath(self.importer)
         s1.build_structure()
+        self.assertIsNotNone(s1.structure)
         self.assertIsInstance(s1.structure, st.Structure)
+        self.assertTrue(s1.importer.concatenated_samples.empty)
         self.assertEqual(s1.total_variables_count, len(s1.importer.sorter))
-        print(s1.structure)
-        print(s1.trajectories)

 if __name__ == '__main__':
     unittest.main()