diff --git a/main_package/classes/abstract_importer.py b/main_package/classes/abstract_importer.py
index 707f0bd..cf34738 100644
--- a/main_package/classes/abstract_importer.py
+++ b/main_package/classes/abstract_importer.py
@@ -1,4 +1,6 @@
 from abc import ABC, abstractmethod
+import pandas as pd
+import typing
 
 
 class AbstractImporter(ABC):
@@ -6,13 +8,22 @@ class AbstractImporter(ABC):
     Interface that exposes all the necessary methods to import the trajectories and the net structure.
 
     :file_path: the file path
-
+    :_concatenated_samples: the concatenation and processing of all the trajectories present in df_samples_list
+    :df_structure: Dataframe containing the structure of the network (edges)
+    :df_variables: Dataframe containing the nodes cardinalities
+    :sorter: the column header (excluding the time column) of the Dataframe concatenated_samples
     """
 
     def __init__(self, file_path: str):
         self.file_path = file_path
+        self._df_variables = None
+        self._df_structure = None
+        self._concatenated_samples = None
+        self._sorter = None
         super().__init__()
 
+    """
     @abstractmethod
     def import_trajectories(self, raw_data):
         pass
@@ -20,4 +31,105 @@ class AbstractImporter(ABC):
     @abstractmethod
     def import_structure(self, raw_data):
         pass
+    """
+
+    @abstractmethod
+    def import_data(self):
+        """
+        Imports and prepares all the data needed for subsequent computation.
+        Parameters:
+            void
+        Returns:
+            void
+        POSTCONDITION: the class members self._df_variables and self._df_structure HAVE to be properly constructed
+        as Pandas Dataframes
+        """
+        pass
+
+    def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame,
+                                              columns_header: typing.List, shifted_cols_header: typing.List) \
+            -> pd.DataFrame:
+        """
+        Computes the difference between each value in the time column.
+        Copies and shifts up by one position all the values present in the remaining columns.
+        PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
+        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
+        Parameters:
+            sample_frame: the trajectory to be processed
+            columns_header: the original header of sample_frame
+            shifted_cols_header: a copy of columns_header with the column names changed
+        Returns:
+            sample_frame: the processed dataframe
+        """
+        #sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
+        sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
+        shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
+        shifted_cols.columns = shifted_cols_header
+        sample_frame = sample_frame.assign(**shifted_cols)
+        sample_frame.drop(sample_frame.tail(1).index, inplace=True)
+        return sample_frame
+
+    def compute_row_delta_in_all_samples_frames(self, df_samples_list: typing.List):
+        """
+        Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list df_samples_list.
+        Concatenates the results into the dataframe concatenated_samples.
+        PREREQUISITE: the Dataframes in input have to follow the column structure of this header:
+        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
+        The class member self._sorter HAS to be properly INITIALIZED.
+        Parameters:
+            df_samples_list: the list of dataframes to be processed and concatenated
+        Returns:
+            void
+        """
+        shifted_cols_header = [s + "S" for s in self._sorter]
+        compute_row_delta = self.compute_row_delta_sigle_samples_frame
+        proc_samples_list = [compute_row_delta(sample, self._sorter, shifted_cols_header)
+                             for sample in df_samples_list]
+        self._concatenated_samples = pd.concat(proc_samples_list)
+        complete_header = self._sorter[:]
+        complete_header.insert(0, 'Time')
+        complete_header.extend(shifted_cols_header)
+        #print("Complete Header", complete_header)
+        self._concatenated_samples = self._concatenated_samples[complete_header]
+        #print("Concat Samples", self._concatenated_samples)
+
+    def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
+        """
+        Builds a list containing the columns of the dataframe, each converted to a numpy array.
+        Parameters:
+            :data_frame: the dataframe from which the columns have to be extracted and converted
+        Returns:
+            :columns_list: the resulting list of numpy arrays
+        """
+        columns_list = [data_frame[column].to_numpy() for column in data_frame]
+        return columns_list
+
+    def clear_concatenated_frame(self):
+        """
+        Removes all values in the dataframe concatenated_samples.
+        Parameters:
+            void
+        Returns:
+            void
+        """
+        self._concatenated_samples = self._concatenated_samples.iloc[0:0]
+
+    @property
+    def concatenated_samples(self) -> pd.DataFrame:
+        return self._concatenated_samples
+
+    @property
+    def variables(self) -> pd.DataFrame:
+        return self._df_variables
+
+    @property
+    def structure(self) -> pd.DataFrame:
+        return self._df_structure
+
+    @property
+    def sorter(self):
+        return self._sorter
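This refactor moves the shared preprocessing into AbstractImporter, so a concrete importer only has to implement import_data(). A minimal sketch of the expected contract, with a hypothetical in-memory importer; ListImporter and its constructor arguments are illustrative and not part of this patch:

    import pandas as pd
    import abstract_importer as ai

    class ListImporter(ai.AbstractImporter):
        # Hypothetical importer fed with in-memory data.
        def __init__(self, trajectories, variables, edges):
            self._raw_trajectories = trajectories  # list of dicts: [Time|Variable values]
            self._raw_variables = variables        # e.g. [('X', 3), ('Y', 3)]
            self._raw_edges = edges                # e.g. [('X', 'Y')]
            super().__init__(file_path='')

        def import_data(self):
            # POSTCONDITION: _df_variables and _df_structure built as DataFrames
            self._df_variables = pd.DataFrame(self._raw_variables, columns=['Name', 'Value'])
            self._df_structure = pd.DataFrame(self._raw_edges, columns=['From', 'To'])
            samples_list = [pd.DataFrame(t) for t in self._raw_trajectories]
            # _sorter HAS to be initialized before the row-delta computation
            self._sorter = list(samples_list[0].columns)[1:]
            self.compute_row_delta_in_all_samples_frames(samples_list)
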
diff --git a/main_package/classes/json_importer.py b/main_package/classes/json_importer.py
index 708c6ad..675ee06 100644
--- a/main_package/classes/json_importer.py
+++ b/main_package/classes/json_importer.py
@@ -23,10 +23,6 @@ class JsonImporter(ai.AbstractImporter):
     :time_key: the key used to identify the timestamps in each trajectory
     :variables_key: the key used to identify the names of the variables in the net
     :df_samples_list: a Dataframe list in which every df contains a trajectory
-    :df_structure: Dataframe containing the structure of the network (edges)
-    :df_variables: Dataframe containing the nodes cardinalities
-    :df_concatenated_samples: the concatenation and processing of all the trajectories present in the list df_samples list
-    :sorter: the columns header(excluding the time column) of the Dataframe concatenated_samples
     """
 
     def __init__(self, file_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
@@ -37,10 +33,6 @@ class JsonImporter(ai.AbstractImporter):
         self.time_key = time_key
         self.variables_key = variables_key
         self.df_samples_list = []
-        self._df_structure = pd.DataFrame()
-        self._df_variables = pd.DataFrame()
-        self._concatenated_samples = None
-        self.sorter = None
         super(JsonImporter, self).__init__(file_path)
 
     def import_data(self):
@@ -53,10 +45,10 @@ class JsonImporter(ai.AbstractImporter):
         """
         raw_data = self.read_json_file()
         self.import_trajectories(raw_data)
-        self.compute_row_delta_in_all_samples_frames(self.time_key)
+        self.compute_row_delta_in_all_samples_frames(self.df_samples_list)
        self.clear_data_frame_list()
         self.import_structure(raw_data)
-        self.import_variables(raw_data, self.sorter)
+        self.import_variables(raw_data, self._sorter)
 
     def import_trajectories(self,
                             raw_data: typing.List):
         """
@@ -79,7 +71,6 @@ class JsonImporter(ai.AbstractImporter):
         """
         self._df_structure = self.one_level_normalizing(raw_data, 0, self.structure_label)
 
-
     def import_variables(self, raw_data: typing.List, sorter: typing.List):
         """
         Imports the data in raw_data at the key variables_label.
@@ -92,13 +83,13 @@ class JsonImporter(ai.AbstractImporter):
             void
         """
         self._df_variables = self.one_level_normalizing(raw_data, 0, self.variables_label)
-        #self.sorter = self._df_variables[self.variables_key].to_list()
-        #self.sorter.sort()
-        #print("Sorter:", self.sorter)
+        #TODO: assuming, as a precondition, that the ordering of the frame _df_variables matches
+        #TODO: the one in self._sorter, this code is redundant
         self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
         self._df_variables[self.variables_key] = self._df_variables[self.variables_key].cat.set_categories(sorter)
         self._df_variables = self._df_variables.sort_values([self.variables_key])
         self._df_variables.reset_index(inplace=True)
+        self._df_variables.drop('index', axis=1, inplace=True)
         print("Var Frame", self._df_variables)
 
     def read_json_file(self) -> typing.List:
@@ -111,15 +102,9 @@ class JsonImporter(ai.AbstractImporter):
             data: the contents of the json file
 
         """
-        #try:
-            #read_files = glob.glob(os.path.join(self.files_path, "*.json"))
-            #if not read_files:
-                #raise ValueError('No .json file found in the entered path!')
         with open(self.file_path) as f:
             data = json.load(f)
             return data
-        #except ValueError as err:
-            #print(err.args)
 
     def one_level_normalizing(self, raw_data: typing.List, indx: int, key: str) -> pd.DataFrame:
         """
@@ -152,82 +137,15 @@ class JsonImporter(ai.AbstractImporter):
         smps = raw_data[indx][trajectories_key]
         self.df_samples_list = [dataframe(sample) for sample in smps]
         columns_header = list(self.df_samples_list[0].columns.values)
+        #print("COLUMNs HEADER", columns_header)
         columns_header.remove(self.time_key)
-        self.sorter = columns_header
-
-    def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame, time_header_label: str,
-                                              columns_header: typing.List, shifted_cols_header: typing.List) \
-            -> pd.DataFrame:
-        """
-        Computes the difference between each value present in th time column.
-        Copies and shift by one position up all the values present in the remaining columns.
-        Parameters:
-            sample_frame: the traj to be processed
-            time_header_label: the label for the times
-            columns_header: the original header of sample_frame
-            shifted_cols_header: a copy of columns_header with changed names of the contents
-        Returns:
-            sample_frame: the processed dataframe
-
-        """
-        sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
-        shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
-        #print(shifted_cols)
-        shifted_cols.columns = shifted_cols_header
-        sample_frame = sample_frame.assign(**shifted_cols)
-        sample_frame.drop(sample_frame.tail(1).index, inplace=True)
-        return sample_frame
-
-    def compute_row_delta_in_all_samples_frames(self, time_header_label: str):
-        """
-        Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list self.df_samples_list.
-        Concatenates the result in the dataframe concatanated_samples
-
-        Parameters:
-            time_header_label: the label of the time column
-        Returns:
-            void
-        """
-        shifted_cols_header = [s + "S" for s in self.sorter]
-        compute_row_delta = self.compute_row_delta_sigle_samples_frame
-        self.df_samples_list = [compute_row_delta(sample, time_header_label, self.sorter, shifted_cols_header)
-                                for sample in self.df_samples_list]
-        self._concatenated_samples = pd.concat(self.df_samples_list)
-        complete_header = self.sorter[:]
-        complete_header.insert(0,'Time')
-        complete_header.extend(shifted_cols_header)
-        #print("Complete Header", complete_header)
-        self._concatenated_samples = self._concatenated_samples[complete_header]
-        #print("Concat Samples",self._concatenated_samples)
-
-    def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
-        """
-        Builds a List containing the columns of dataframe and converts them to a numpy array.
-        Parameters:
-            :data_frame: the dataframe from which the columns have to be extracted and converted
-        Returns:
-            :columns_list: the resulting list of numpy arrays
-        """
-        columns_list = [data_frame[column].to_numpy() for column in data_frame]
-        #for column in data_frame:
-            #columns_list.append(data_frame[column].to_numpy())
-        return columns_list
-
-    def clear_concatenated_frame(self):
-        """
-        Removes all values in the dataframe concatenated_samples
-        Parameters:
-            void
-        Returns:
-            void
-        """
-        self._concatenated_samples = self._concatenated_samples.iloc[0:0]
+        self._sorter = columns_header
 
     def clear_data_frame_list(self):
         """
         Removes all values present in the dataframes in the list df_samples_list
         """
-        for indx in range(len(self.df_samples_list)):  # Le singole traj non servono più #TODO usare list comprens
+        for indx in range(len(self.df_samples_list)):
             self.df_samples_list[indx] = self.df_samples_list[indx].iloc[0:0]
 
     def import_sampled_cims(self, raw_data: typing.List, indx: int, cims_key: str) -> typing.Dict:
@@ -239,17 +157,6 @@ class JsonImporter(ai.AbstractImporter):
             cims_for_all_vars[var].append(pd.DataFrame(raw_data[indx][cims_key][var][p_comb]).to_numpy())
         return cims_for_all_vars
 
-    @property
-    def concatenated_samples(self):
-        return self._concatenated_samples
-
-    @property
-    def variables(self):
-        return self._df_variables
-
-    @property
-    def structure(self):
-        return self._df_structure
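With the preprocessing and the properties now inherited from AbstractImporter, the public entry point of JsonImporter reduces to a single import_data() call. A hedged usage sketch; the file path is hypothetical, while the labels and keys mirror the ones used in the tests below:

    import json_importer as ji

    importer = ji.JsonImporter('../data/example.json', 'samples', 'dyn.str',
                               'variables', 'Time', 'Name')
    importer.import_data()
    # concatenated_samples layout: [Time | original columns | shifted "S" columns]
    print(importer.concatenated_samples.head())
    # variables is ordered like sorter, with the stale 'index' column now dropped
    print(importer.variables)
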
diff --git a/main_package/classes/sample_path.py b/main_package/classes/sample_path.py
index e365e85..a104b7e 100644
--- a/main_package/classes/sample_path.py
+++ b/main_package/classes/sample_path.py
@@ -1,29 +1,28 @@
-import abstract_sample_path as asam
-import json_importer as imp
+
+import abstract_importer as imp
 import structure as st
 import trajectory as tr
 
 
-class SamplePath(asam.AbstractSamplePath):
+class SamplePath:
     """
     Aggregates all the informations about the trajectories, the real structure of the sampled net and variables
     cardinalites.
     Has the task of creating the objects that will contain the mentioned data.
 
-    :importer: the Importer objects that will import ad process data
+    :trajectories: the Trajectory object that will contain all the concatenated trajectories
     :structure: the Structure Object that will contain all the structurral infos about the net
     :total_variables_count: the number of variables in the net
     """
-
-    #def __init__(self, files_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
-                 #variables_key: str):
-    def __init__(self, importer: imp.JsonImporter):
-        #self.importer =importer
-        super().__init__(importer)
-        #self._trajectories = None
-        #self._structure = None
+    def __init__(self, importer: imp.AbstractImporter):
+        """
+        :importer: the Importer object that will import and process data
+        """
+        self.importer = importer
+        self._trajectories = None
+        self._structure = None
         self.total_variables_count = None
 
     def build_trajectories(self):
@@ -51,11 +50,15 @@ class SamplePath(asam.AbstractSamplePath):
         Returns:
             void
         """
+        if self.importer.sorter != self.importer.variables.iloc[:, 0].to_list():
+            raise RuntimeError("The dataset column order has to match the order of the labels in the variables frame!")
         self.total_variables_count = len(self.importer.sorter)
-        labels = self.importer.variables[self.importer.variables_key].to_list()
+        #labels = self.importer.variables[self.importer.variables_key].to_list()
         #print("SAMPLE PATH LABELS",labels)
+        #print(self.importer.variables)
+        labels = self.importer.variables.iloc[:, 0].to_list()
         indxs = self.importer.variables.index.to_numpy()
-        vals = self.importer.variables['Value'].to_numpy()
+        vals = self.importer.variables.iloc[:, 1].to_numpy()
         edges = list(self.importer.structure.to_records(index=False))
         self._structure = st.Structure(labels, indxs, vals, edges,
                                        self.total_variables_count)
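The new guard makes the column-order precondition fail fast instead of silently misaligning labels and cardinalities. A sketch of the intended flow, mirroring the demo script added later in this patch and assuming importer is any AbstractImporter subclass whose data satisfies the precondition:

    import sample_path as sp

    s1 = sp.SamplePath(importer)
    s1.build_trajectories()
    # build_structure() now raises RuntimeError unless importer.sorter equals
    # importer.variables.iloc[:, 0].to_list(), e.g. ['X', 'Y', 'Z'] on both sides
    s1.build_structure()
    print(s1.structure)
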
diff --git a/main_package/classes/set_of_cims.py b/main_package/classes/set_of_cims.py
index d45b1f5..dc83d02 100644
--- a/main_package/classes/set_of_cims.py
+++ b/main_package/classes/set_of_cims.py
@@ -79,8 +79,8 @@ class SetOfCims:
         if mask_arr.size <= 1:
             return self.actual_cims
         else:
-            tmp_parents_comb_from_ids = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
-            return self.actual_cims[tmp_parents_comb_from_ids]
+            flat_indxs = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
+            return self.actual_cims[flat_indxs]
 
     @property
     def get_cims(self):
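The rename only clarifies intent: the expression computes the flat indices of the parent-value combinations that match comb on the masked columns, and those indices select the corresponding CIMs. A standalone toy illustration, all values invented:

    import numpy as np

    p_combs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # parent value combinations
    actual_cims = np.array(['cim0', 'cim1', 'cim2', 'cim3'])
    mask_arr = np.array([1])  # keep only the second parent column
    comb = np.array([0])      # require its value to be 0

    flat_indxs = np.argwhere(np.all(p_combs[:, mask_arr] == comb, axis=1)).ravel()
    print(actual_cims[flat_indxs])  # ['cim0' 'cim2']
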
diff --git a/main_package/classes/simple_cvs_importer.py b/main_package/classes/simple_cvs_importer.py
new file mode 100644
index 0000000..257ba90
--- /dev/null
+++ b/main_package/classes/simple_cvs_importer.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import glob
+import os
+
+import abstract_importer as ai
+import sample_path as sp
+
+
+class CSVImporter(ai.AbstractImporter):
+
+    def __init__(self, file_path):
+        self._df_samples_list = None
+        super(CSVImporter, self).__init__(file_path)
+
+    def import_data(self):
+        self.read_csv_file()
+        self.import_variables()
+        self.import_structure()
+        self.compute_row_delta_in_all_samples_frames(self._df_samples_list)
+
+    def read_csv_file(self):
+        df = pd.read_csv(self.file_path)
+        df.drop(df.columns[[0]], axis=1, inplace=True)
+        self._df_samples_list = [df]
+
+    def import_variables(self):
+        variables_list = list(self._df_samples_list[0].columns)[1:]
+        #wrong_vars_labels = ['Y','Z','X']
+        self._sorter = variables_list
+        values_list = [3 for var in variables_list]
+        # build the Name/Value data; every variable is assumed to have cardinality 3
+        data = {'Name': variables_list, 'Value': values_list}
+        # Create the pandas DataFrame
+        self._df_variables = pd.DataFrame(data)
+
+    def import_structure(self):
+        data = {'From': ['X', 'Y', 'Z'], 'To': ['Z', 'Z', 'Y']}
+        self._df_structure = pd.DataFrame(data)
+
+
+read_files = glob.glob(os.path.join('../data', "*.csv"))
+print(read_files[0])
+csvimp = CSVImporter(read_files[0])
+
+#csvimp.import_data()
+
+
+s1 = sp.SamplePath(csvimp)
+s1.build_trajectories()
+s1.build_structure()
+print(s1.structure)
+print(s1.trajectories)
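The demo script assumes a CSV whose first column is a pandas row index, dropped by read_csv_file, followed by the time column and the variable columns, as required by the row-delta PREREQUISITE. A sketch that produces such a file; the values and the path are hypothetical:

    import pandas as pd

    df = pd.DataFrame({'Time': [0.0, 0.4, 0.9],
                       'X': [0, 1, 1], 'Y': [1, 1, 0], 'Z': [2, 2, 2]})
    df.to_csv('../data/toy.csv')  # to_csv writes the index as the extra first column
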
diff --git a/main_package/tests/test_json_importer.py b/main_package/tests/test_json_importer.py
index f940ae6..026f229 100644
--- a/main_package/tests/test_json_importer.py
+++ b/main_package/tests/test_json_importer.py
@@ -6,11 +6,9 @@
 import glob
 import numpy as np
 import pandas as pd
 import json_importer as ji
-
 import json
-
 
 class TestJsonImporter(unittest.TestCase):
 
     @classmethod
@@ -26,10 +24,10 @@ class TestJsonImporter(unittest.TestCase):
         self.assertEqual(j1.variables_key, 'Name')
         self.assertEqual(j1.file_path, self.read_files[0])
         self.assertFalse(j1.df_samples_list)
-        self.assertTrue(j1.variables.empty)
-        self.assertTrue(j1.structure.empty)
-        self.assertFalse(j1.concatenated_samples)
-        self.assertFalse(j1.sorter)
+        self.assertIsNone(j1.variables)
+        self.assertIsNone(j1.structure)
+        self.assertIsNone(j1.concatenated_samples)
+        self.assertIsNone(j1.sorter)
 
     def test_read_json_file_found(self):
         data_set = {"key1": [1, 2, 3], "key2": [4, 5, 6]}
@@ -73,7 +71,7 @@ class TestJsonImporter(unittest.TestCase):
         sample_frame = j1.df_samples_list[0]
         columns_header = list(sample_frame.columns.values)
         shifted_cols_header = [s + "S" for s in columns_header[1:]]
-        new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, j1.time_key, columns_header[1:],
+        new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, columns_header[1:],
                                                                     shifted_cols_header)
         self.assertEqual(len(list(sample_frame.columns.values)) + len(shifted_cols_header),
                          len(list(new_sample_frame.columns.values)))
@@ -83,15 +81,16 @@ class TestJsonImporter(unittest.TestCase):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         j1.import_trajectories(raw_data)
-        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
-        self.assertEqual(list(j1.df_samples_list[0].columns.values), list(j1.concatenated_samples.columns.values))
+        j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
+        self.assertEqual(list(j1.df_samples_list[0].columns.values),
+                         list(j1.concatenated_samples.columns.values)[:len(list(j1.df_samples_list[0].columns.values))])
         self.assertEqual(list(j1.concatenated_samples.columns.values)[0], j1.time_key)
 
     def test_clear_data_frame_list(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         j1.import_trajectories(raw_data)
-        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
+        j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
         j1.clear_data_frame_list()
         for df in j1.df_samples_list:
             self.assertTrue(df.empty)
@@ -137,7 +136,7 @@ class TestJsonImporter(unittest.TestCase):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         cims = j1.import_sampled_cims(raw_data, 0, 'dyn.cims')
-        j1.import_variables(raw_data, ['X','Y','Z'])
+        j1.import_variables(raw_data, ['X','Y','Z'])  #TODO: this must not depend directly on a hard-coded sorter
         self.assertEqual(list(cims.keys()), j1.variables['Name'].tolist())
 
     def test_import_data(self):