
Refactor the AbstractImporter class to achieve better decoupling from SamplePath

parallel_struct_est
philpMartin 4 years ago
parent 8e1666a607
commit f32b79590f
  1. main_package/classes/abstract_importer.py (114 changed lines)
  2. main_package/classes/json_importer.py (109 changed lines)
  3. main_package/classes/sample_path.py (31 changed lines)
  4. main_package/classes/set_of_cims.py (4 changed lines)
  5. main_package/classes/simple_cvs_importer.py (53 changed lines)
  6. main_package/tests/test_json_importer.py (21 changed lines)

@@ -1,4 +1,6 @@
from abc import ABC, abstractmethod
import pandas as pd
import typing
class AbstractImporter(ABC):
@@ -6,13 +8,22 @@ class AbstractImporter(ABC):
Interface that exposes all the necessary methods to import the trajectories and the net structure.
:file_path: the file path
:_concatenated_samples: the concatenation of all the processed trajectories
:df_structure: Dataframe containing the structure of the network (edges)
:df_variables: Dataframe containing the nodes cardinalities
:df_concatenated_samples: the concatenation and processing of all the trajectories present in the df_samples list
:sorter: the columns header (excluding the time column) of the Dataframe concatenated_samples
"""
def __init__(self, file_path: str):
self.file_path = file_path
self._df_variables = None
self._df_structure = None
self._concatenated_samples = None
self._sorter = None
super().__init__()
"""
@abstractmethod
def import_trajectories(self, raw_data):
pass
@@ -20,4 +31,105 @@ class AbstractImporter(ABC):
@abstractmethod
def import_structure(self, raw_data):
pass
"""
@abstractmethod
def import_data(self):
"""
Imports and prepares all the data needed for subsequent computation.
Parameters:
void
Returns:
void
POSTCONDITION: the class members self._df_variables and self._df_structure HAVE to be properly constructed
as Pandas Dataframes
"""
pass
def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame,
columns_header: typing.List, shifted_cols_header: typing.List) \
-> pd.DataFrame:
"""
Computes the difference between each value present in the time column.
Copies and shifts up by one position all the values present in the remaining columns.
PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
[Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
Parameters:
sample_frame: the trajectory to be processed
columns_header: the original header of sample_frame
shifted_cols_header: a copy of columns_header with changed names of the contents
Returns:
sample_frame: the processed dataframe
"""
#sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
shifted_cols.columns = shifted_cols_header
sample_frame = sample_frame.assign(**shifted_cols)
sample_frame.drop(sample_frame.tail(1).index, inplace=True)
return sample_frame
def compute_row_delta_in_all_samples_frames(self, df_samples_list: typing.List):
"""
Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list df_samples_list.
Concatenates the results into the dataframe concatenated_samples
PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
[Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
The class member self._sorter HAS to be properly INITIALIZED
Parameters:
df_samples_list: the list of dataframes to be processed and concatenated
Returns:
void
"""
shifted_cols_header = [s + "S" for s in self._sorter]
compute_row_delta = self.compute_row_delta_sigle_samples_frame
proc_samples_list = [compute_row_delta(sample, self._sorter, shifted_cols_header)
for sample in df_samples_list]
self._concatenated_samples = pd.concat(proc_samples_list)
complete_header = self._sorter[:]
complete_header.insert(0,'Time')
complete_header.extend(shifted_cols_header)
#print("Complete Header", complete_header)
self._concatenated_samples = self._concatenated_samples[complete_header]
#print("Concat Samples",self._concatenated_samples)
def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
"""
Builds a List containing the columns of the dataframe, each converted to a numpy array.
Parameters:
:data_frame: the dataframe from which the columns have to be extracted and converted
Returns:
:columns_list: the resulting list of numpy arrays
"""
columns_list = [data_frame[column].to_numpy() for column in data_frame]
return columns_list
def clear_concatenated_frame(self):
"""
Removes all values in the dataframe concatenated_samples
Parameters:
void
Returns:
void
"""
self._concatenated_samples = self._concatenated_samples.iloc[0:0]
@property
def concatenated_samples(self) -> pd.DataFrame:
return self._concatenated_samples
@property
def variables(self) -> pd.DataFrame:
return self._df_variables
@property
def structure(self) -> pd.DataFrame:
return self._df_structure
@property
def sorter(self):
return self._sorter
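As a reading aid, here is a minimal standalone sketch (not part of the commit) of the transform compute_row_delta_sigle_samples_frame applies; the toy trajectory and the X/Y column names are illustrative:

import pandas as pd

sample_frame = pd.DataFrame({'Time': [0.0, 0.5, 1.2], 'X': [0, 1, 1], 'Y': [2, 2, 0]})
columns_header = ['X', 'Y']
shifted_cols_header = ['XS', 'YS']
# Time becomes the delta spent in each state instead of an absolute timestamp
sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
# the value columns are copied and shifted up by one position ("next state")
shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
shifted_cols.columns = shifted_cols_header
sample_frame = sample_frame.assign(**shifted_cols)
sample_frame.drop(sample_frame.tail(1).index, inplace=True)  # last row has no successor
print(sample_frame)
#    Time  X  Y  XS  YS
# 0   0.5  0  2   1   2
# 1   0.7  1  2   1   0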

@@ -23,10 +23,6 @@ class JsonImporter(ai.AbstractImporter):
:time_key: the key used to identify the timestamps in each trajectory
:variables_key: the key used to identify the names of the variables in the net
:df_samples_list: a Dataframe list in which every df contains a trajectory
:df_structure: Dataframe containing the structure of the network (edges)
:df_variables: Dataframe containing the nodes cardinalities
:df_concatenated_samples: the concatenation and processing of all the trajectories present in the list df_samples list
:sorter: the columns header(excluding the time column) of the Dataframe concatenated_samples
"""
def __init__(self, file_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
@@ -37,10 +33,6 @@ class JsonImporter(ai.AbstractImporter):
self.time_key = time_key
self.variables_key = variables_key
self.df_samples_list = []
self._df_structure = pd.DataFrame()
self._df_variables = pd.DataFrame()
self._concatenated_samples = None
self.sorter = None
super(JsonImporter, self).__init__(file_path)
def import_data(self):
@@ -53,10 +45,10 @@ class JsonImporter(ai.AbstractImporter):
"""
raw_data = self.read_json_file()
self.import_trajectories(raw_data)
self.compute_row_delta_in_all_samples_frames(self.time_key)
self.compute_row_delta_in_all_samples_frames(self.df_samples_list)
self.clear_data_frame_list()
self.import_structure(raw_data)
self.import_variables(raw_data, self.sorter)
self.import_variables(raw_data, self._sorter)
def import_trajectories(self, raw_data: typing.List):
"""
@@ -79,7 +71,6 @@
"""
self._df_structure = self.one_level_normalizing(raw_data, 0, self.structure_label)
def import_variables(self, raw_data: typing.List, sorter: typing.List):
"""
Imports the data in raw_data at the key variables_label.
@@ -92,13 +83,13 @@
void
"""
self._df_variables = self.one_level_normalizing(raw_data, 0, self.variables_label)
#self.sorter = self._df_variables[self.variables_key].to_list()
#self.sorter.sort()
#print("Sorter:", self.sorter)
#TODO with the precondition that the _df_variables frame is ordered the same way as
#TODO self._sorter, this code becomes useless
self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
self._df_variables[self.variables_key] = self._df_variables[self.variables_key].cat.set_categories(sorter)
self._df_variables = self._df_variables.sort_values([self.variables_key])
self._df_variables.reset_index(inplace=True)
self._df_variables.drop('index', axis=1, inplace=True)
print("Var Frame", self._df_variables)
def read_json_file(self) -> typing.List:
@@ -111,15 +102,9 @@ class JsonImporter(ai.AbstractImporter):
data: the contents of the json file
"""
#try:
#read_files = glob.glob(os.path.join(self.files_path, "*.json"))
#if not read_files:
#raise ValueError('No .json file found in the entered path!')
with open(self.file_path) as f:
data = json.load(f)
return data
#except ValueError as err:
#print(err.args)
def one_level_normalizing(self, raw_data: typing.List, indx: int, key: str) -> pd.DataFrame:
"""
@@ -152,82 +137,15 @@ class JsonImporter(ai.AbstractImporter):
smps = raw_data[indx][trajectories_key]
self.df_samples_list = [dataframe(sample) for sample in smps]
columns_header = list(self.df_samples_list[0].columns.values)
#print("COLUMNs HEADER", columns_header)
columns_header.remove(self.time_key)
self.sorter = columns_header
def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame, time_header_label: str,
columns_header: typing.List, shifted_cols_header: typing.List) \
-> pd.DataFrame:
"""
Computes the difference between each value present in the time column.
Copies and shifts up by one position all the values present in the remaining columns.
Parameters:
sample_frame: the trajectory to be processed
time_header_label: the label for the times
columns_header: the original header of sample_frame
shifted_cols_header: a copy of columns_header with changed names of the contents
Returns:
sample_frame: the processed dataframe
"""
sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
#print(shifted_cols)
shifted_cols.columns = shifted_cols_header
sample_frame = sample_frame.assign(**shifted_cols)
sample_frame.drop(sample_frame.tail(1).index, inplace=True)
return sample_frame
def compute_row_delta_in_all_samples_frames(self, time_header_label: str):
"""
Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list self.df_samples_list.
Concatenates the results into the dataframe concatenated_samples
Parameters:
time_header_label: the label of the time column
Returns:
void
"""
shifted_cols_header = [s + "S" for s in self.sorter]
compute_row_delta = self.compute_row_delta_sigle_samples_frame
self.df_samples_list = [compute_row_delta(sample, time_header_label, self.sorter, shifted_cols_header)
for sample in self.df_samples_list]
self._concatenated_samples = pd.concat(self.df_samples_list)
complete_header = self.sorter[:]
complete_header.insert(0,'Time')
complete_header.extend(shifted_cols_header)
#print("Complete Header", complete_header)
self._concatenated_samples = self._concatenated_samples[complete_header]
#print("Concat Samples",self._concatenated_samples)
def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
"""
Builds a List containing the columns of the dataframe, each converted to a numpy array.
Parameters:
:data_frame: the dataframe from which the columns have to be extracted and converted
Returns:
:columns_list: the resulting list of numpy arrays
"""
columns_list = [data_frame[column].to_numpy() for column in data_frame]
#for column in data_frame:
#columns_list.append(data_frame[column].to_numpy())
return columns_list
def clear_concatenated_frame(self):
"""
Removes all values in the dataframe concatenated_samples
Parameters:
void
Returns:
void
"""
self._concatenated_samples = self._concatenated_samples.iloc[0:0]
self._sorter = columns_header
def clear_data_frame_list(self):
"""
Removes all values present in the dataframes in the list df_samples_list
"""
for indx in range(len(self.df_samples_list)): # the single trajectories are no longer needed #TODO use a list comprehension
for indx in range(len(self.df_samples_list)):
self.df_samples_list[indx] = self.df_samples_list[indx].iloc[0:0]
def import_sampled_cims(self, raw_data: typing.List, indx: int, cims_key: str) -> typing.Dict:
@@ -239,17 +157,6 @@ class JsonImporter(ai.AbstractImporter):
cims_for_all_vars[var].append(pd.DataFrame(raw_data[indx][cims_key][var][p_comb]).to_numpy())
return cims_for_all_vars
@property
def concatenated_samples(self):
return self._concatenated_samples
@property
def variables(self):
return self._df_variables
@property
def structure(self):
return self._df_structure
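A hedged usage sketch of the refactored importer; the glob pattern and constructor arguments mirror the ones used in the tests further below and assume a suitable .json dataset in ../data:

import glob
import os
import json_importer as ji

read_files = glob.glob(os.path.join('../data', "*.json"))
j1 = ji.JsonImporter(read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
j1.import_data()  # postcondition: _df_variables and _df_structure are built
print(j1.variables)             # node labels and cardinalities
print(j1.structure)             # edges of the net
print(j1.concatenated_samples)  # [Time|original values|shifted values]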

@@ -1,29 +1,28 @@
import abstract_sample_path as asam
import json_importer as imp
import abstract_importer as imp
import structure as st
import trajectory as tr
class SamplePath(asam.AbstractSamplePath):
class SamplePath:
"""
Aggregates all the information about the trajectories, the real structure of the sampled net and the variables'
cardinalities.
Has the task of creating the objects that will contain the mentioned data.
:importer: the Importer object that will import and process data
:trajectories: the Trajectory object that will contain all the concatenated trajectories
:structure: the Structure object that will contain all the structural info about the net
:total_variables_count: the number of variables in the net
"""
#def __init__(self, files_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
#variables_key: str):
def __init__(self, importer: imp.JsonImporter):
#self.importer =importer
super().__init__(importer)
#self._trajectories = None
#self._structure = None
def __init__(self, importer: imp.AbstractImporter):
"""
:importer: the Importer object that will import and process data
"""
self.importer = importer
self._trajectories = None
self._structure = None
self.total_variables_count = None
def build_trajectories(self):
@@ -51,11 +50,15 @@ class SamplePath(asam.AbstractSamplePath):
Returns:
void
"""
if self.importer.sorter != self.importer.variables.iloc[:, 0].to_list():
raise RuntimeError("The Dataset columns order has to match the order of labels in the variables Frame!")
self.total_variables_count = len(self.importer.sorter)
labels = self.importer.variables[self.importer.variables_key].to_list()
#labels = self.importer.variables[self.importer.variables_key].to_list()
#print("SAMPLE PATH LABELS",labels)
#print(self.importer.variables)
labels = self.importer.variables.iloc[:, 0].to_list()
indxs = self.importer.variables.index.to_numpy()
vals = self.importer.variables['Value'].to_numpy()
vals = self.importer.variables.iloc[:, 1].to_numpy()
edges = list(self.importer.structure.to_records(index=False))
self._structure = st.Structure(labels, indxs, vals, edges,
self.total_variables_count)
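build_structure now checks that the importer's sorter agrees with the ordering of the variables frame; a small sketch of the failure mode, with illustrative labels:

import pandas as pd

sorter = ['X', 'Y', 'Z']  # column order of the trajectories
variables = pd.DataFrame({'Name': ['Y', 'X', 'Z'], 'Value': [3, 3, 3]})  # mismatched order
if sorter != variables.iloc[:, 0].to_list():
    raise RuntimeError("The Dataset columns order has to match the order of labels in the variables Frame!")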

@@ -79,8 +79,8 @@ class SetOfCims:
if mask_arr.size <= 1:
return self.actual_cims
else:
tmp_parents_comb_from_ids = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
return self.actual_cims[tmp_parents_comb_from_ids]
flat_indxs = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
return self.actual_cims[flat_indxs]
@property
def get_cims(self):
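The renamed flat_indxs makes the intent of this lookup clearer: it selects the rows of p_combs whose masked columns equal the requested parents' combination. A self-contained numpy sketch with toy values:

import numpy as np

p_combs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # one row per parents' combination
mask_arr = np.array([1])  # consider only the second parent
comb = np.array([1])      # requested value for that parent
flat_indxs = np.argwhere(np.all(p_combs[:, mask_arr] == comb, axis=1)).ravel()
print(flat_indxs)  # [1 3]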

@@ -0,0 +1,53 @@
import pandas as pd
import glob
import os
import abstract_importer as ai
import sample_path as sp
class CSVImporter(ai.AbstractImporter):
def __init__(self, file_path):
self._df_samples_list = None
super(CSVImporter, self).__init__(file_path)
def import_data(self):
self.read_csv_file()
self.import_variables()
self.import_structure()
self.compute_row_delta_in_all_samples_frames(self._df_samples_list)
def read_csv_file(self):
df = pd.read_csv(self.file_path)
df.drop(df.columns[[0]], axis=1, inplace=True)
self._df_samples_list = [df]
def import_variables(self):
variables_list = list(self._df_samples_list[0].columns)[1:]
#wrong_vars_labels = ['Y','Z','X']
self._sorter = variables_list
values_list = [3 for var in variables_list]
# initialize list of lists
data = {'Name':variables_list, 'Value':values_list}
# Create the pandas DataFrame
self._df_variables = pd.DataFrame(data)
def import_structure(self):
data = {'From':['X','Y','Z'], 'To':['Z','Z','Y']}
self._df_structure = pd.DataFrame(data)
read_files = glob.glob(os.path.join('../data', "*.csv"))
print(read_files[0])
csvimp = CSVImporter(read_files[0])
#csvimp.import_data()
s1 = sp.SamplePath(csvimp)
s1.build_trajectories()
s1.build_structure()
print(s1.structure)
print(s1.trajectories)
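CSVImporter doubles as a template for new importers. Below is a condensed, hypothetical sketch of the contract a subclass has to honour, inferred from the import_data postcondition and from the members SamplePath reads:

import pandas as pd

import abstract_importer as ai

class MinimalImporter(ai.AbstractImporter):
    """Hypothetical importer with hard-coded frames, only to show the contract."""
    def import_data(self):
        self._sorter = ['X', 'Y']  # samples column order, Time excluded
        self._df_variables = pd.DataFrame({'Name': ['X', 'Y'], 'Value': [2, 2]})
        self._df_structure = pd.DataFrame({'From': ['X'], 'To': ['Y']})
        samples = [pd.DataFrame({'Time': [0.0, 0.4, 1.0], 'X': [0, 1, 1], 'Y': [0, 0, 1]})]
        self.compute_row_delta_in_all_samples_frames(samples)

imp = MinimalImporter(file_path='')  # file_path is unused in this toy
imp.import_data()
print(imp.concatenated_samples)  # columns: Time, X, Y, XS, YS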

@@ -6,11 +6,9 @@ import glob
import numpy as np
import pandas as pd
import json_importer as ji
import json
class TestJsonImporter(unittest.TestCase):
@classmethod
@@ -26,10 +24,10 @@ class TestJsonImporter(unittest.TestCase):
self.assertEqual(j1.variables_key, 'Name')
self.assertEqual(j1.file_path, self.read_files[0])
self.assertFalse(j1.df_samples_list)
self.assertTrue(j1.variables.empty)
self.assertTrue(j1.structure.empty)
self.assertFalse(j1.concatenated_samples)
self.assertFalse(j1.sorter)
self.assertIsNone(j1.variables)
self.assertIsNone(j1.structure)
self.assertIsNone(j1.concatenated_samples)
self.assertIsNone(j1.sorter)
def test_read_json_file_found(self):
data_set = {"key1": [1, 2, 3], "key2": [4, 5, 6]}
@@ -73,7 +71,7 @@ class TestJsonImporter(unittest.TestCase):
sample_frame = j1.df_samples_list[0]
columns_header = list(sample_frame.columns.values)
shifted_cols_header = [s + "S" for s in columns_header[1:]]
new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, j1.time_key, columns_header[1:],
new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, columns_header[1:],
shifted_cols_header)
self.assertEqual(len(list(sample_frame.columns.values)) + len(shifted_cols_header),
len(list(new_sample_frame.columns.values)))
@@ -83,15 +81,16 @@ class TestJsonImporter(unittest.TestCase):
j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
raw_data = j1.read_json_file()
j1.import_trajectories(raw_data)
j1.compute_row_delta_in_all_samples_frames(j1.time_key)
self.assertEqual(list(j1.df_samples_list[0].columns.values), list(j1.concatenated_samples.columns.values))
j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
self.assertEqual(list(j1.df_samples_list[0].columns.values),
list(j1.concatenated_samples.columns.values)[:len(list(j1.df_samples_list[0].columns.values))])
self.assertEqual(list(j1.concatenated_samples.columns.values)[0], j1.time_key)
def test_clear_data_frame_list(self):
j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
raw_data = j1.read_json_file()
j1.import_trajectories(raw_data)
j1.compute_row_delta_in_all_samples_frames(j1.time_key)
j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
j1.clear_data_frame_list()
for df in j1.df_samples_list:
self.assertTrue(df.empty)
@@ -137,7 +136,7 @@ class TestJsonImporter(unittest.TestCase):
j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
raw_data = j1.read_json_file()
cims = j1.import_sampled_cims(raw_data, 0, 'dyn.cims')
j1.import_variables(raw_data, ['X','Y','Z'])
j1.import_variables(raw_data, ['X','Y','Z']) #TODO it CANNOT depend directly on this sorter
self.assertEqual(list(cims.keys()), j1.variables['Name'].tolist())
def test_import_data(self):