
Refactor AbstractImporter class to obtain better decoupling from SamplePath

parallel_struct_est
philpMartin 4 years ago
parent 8e1666a607
commit f32b79590f
  1. main_package/classes/abstract_importer.py (114 changes)
  2. main_package/classes/json_importer.py (109 changes)
  3. main_package/classes/sample_path.py (31 changes)
  4. main_package/classes/set_of_cims.py (4 changes)
  5. main_package/classes/simple_cvs_importer.py (53 changes)
  6. main_package/tests/test_json_importer.py (21 changes)

main_package/classes/abstract_importer.py

@@ -1,4 +1,6 @@
 from abc import ABC, abstractmethod
+import pandas as pd
+import typing


 class AbstractImporter(ABC):
@@ -6,13 +8,22 @@ class AbstractImporter(ABC):
     Interface that exposes all the necessary methods to import the trajectories and the net structure.
     :file_path: the file path
+    :_concatenated_samples: the concatenation and processing of all the trajectories present in df_samples_list
+    :df_structure: Dataframe containing the structure of the network (edges)
+    :df_variables: Dataframe containing the nodes cardinalities
+    :sorter: the columns header (excluding the time column) of the Dataframe concatenated_samples
     """

     def __init__(self, file_path: str):
         self.file_path = file_path
+        self._df_variables = None
+        self._df_structure = None
+        self._concatenated_samples = None
+        self._sorter = None
         super().__init__()

+    """
     @abstractmethod
     def import_trajectories(self, raw_data):
         pass
@@ -20,4 +31,105 @@ class AbstractImporter(ABC):
     @abstractmethod
     def import_structure(self, raw_data):
         pass
+    """

+    @abstractmethod
+    def import_data(self):
+        """
+        Imports and prepares all the data needed for subsequent computation.
+        Parameters:
+            void
+        Returns:
+            void
+        POSTCONDITION: the class members self._df_variables and self._df_structure HAVE to be properly constructed
+        as Pandas Dataframes
+        """
+        pass

+    def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame,
+                                              columns_header: typing.List, shifted_cols_header: typing.List) \
+            -> pd.DataFrame:
+        """
+        Computes the difference between each value present in the time column.
+        Copies and shifts up by one position all the values present in the remaining columns.
+        PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
+        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
+        Parameters:
+            sample_frame: the trajectory to be processed
+            columns_header: the original header of sample_frame
+            shifted_cols_header: a copy of columns_header with the names for the shifted columns
+        Returns:
+            sample_frame: the processed dataframe
+        """
+        sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
+        shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
+        shifted_cols.columns = shifted_cols_header
+        sample_frame = sample_frame.assign(**shifted_cols)
+        sample_frame.drop(sample_frame.tail(1).index, inplace=True)
+        return sample_frame

+    def compute_row_delta_in_all_samples_frames(self, df_samples_list: typing.List):
+        """
+        Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list df_samples_list.
+        Concatenates the results in the dataframe concatenated_samples.
+        PREREQUISITE: the Dataframes in input have to follow the column structure of this header:
+        [Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
+        The class member self._sorter HAS to be properly INITIALIZED.
+        Parameters:
+            df_samples_list: the list of dataframes to be processed and concatenated
+        Returns:
+            void
+        """
+        shifted_cols_header = [s + "S" for s in self._sorter]
+        compute_row_delta = self.compute_row_delta_sigle_samples_frame
+        proc_samples_list = [compute_row_delta(sample, self._sorter, shifted_cols_header)
+                             for sample in df_samples_list]
+        self._concatenated_samples = pd.concat(proc_samples_list)
+        complete_header = self._sorter[:]
+        complete_header.insert(0, 'Time')
+        complete_header.extend(shifted_cols_header)
+        self._concatenated_samples = self._concatenated_samples[complete_header]

+    def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
+        """
+        Builds a List containing the columns of the dataframe, each converted to a numpy array.
+        Parameters:
+            :data_frame: the dataframe from which the columns have to be extracted and converted
+        Returns:
+            :columns_list: the resulting list of numpy arrays
+        """
+        columns_list = [data_frame[column].to_numpy() for column in data_frame]
+        return columns_list

+    def clear_concatenated_frame(self):
+        """
+        Removes all values in the dataframe concatenated_samples.
+        Parameters:
+            void
+        Returns:
+            void
+        """
+        self._concatenated_samples = self._concatenated_samples.iloc[0:0]

+    @property
+    def concatenated_samples(self) -> pd.DataFrame:
+        return self._concatenated_samples

+    @property
+    def variables(self) -> pd.DataFrame:
+        return self._df_variables

+    @property
+    def structure(self) -> pd.DataFrame:
+        return self._df_structure

+    @property
+    def sorter(self):
+        return self._sorter
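The row-delta transformation pulled up into this class is the core of the refactor, so here is a minimal standalone sketch of what compute_row_delta_sigle_samples_frame does to a single trajectory (the two-variable frame and its values are made up for illustration):

    import pandas as pd

    # A toy trajectory: Time first, then two binary variables (made-up data).
    sample = pd.DataFrame({'Time': [0.0, 1.5, 2.0], 'X': [0, 1, 1], 'Y': [1, 1, 0]})

    # The time column becomes the delta to the next row; each variable column is
    # copied, shifted up by one row, and appended under an 'S'-suffixed name, so
    # every row pairs a state with its successor and the sojourn time between them.
    sample.iloc[:, 0] = sample.iloc[:, 0].diff().shift(-1)
    shifted = sample[['X', 'Y']].shift(-1).fillna(0).astype('int32')
    shifted.columns = ['XS', 'YS']
    sample = sample.assign(**shifted)
    sample.drop(sample.tail(1).index, inplace=True)
    print(sample)
    #    Time  X  Y  XS  YS
    # 0   1.5  0  1   1   1
    # 1   0.5  1  1   1   0

compute_row_delta_in_all_samples_frames then simply maps this over every trajectory, concatenates the results, and reorders the columns as [Time | sorter | shifted headers].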

main_package/classes/json_importer.py

@@ -23,10 +23,6 @@ class JsonImporter(ai.AbstractImporter):
     :time_key: the key used to identify the timestamps in each trajectory
     :variables_key: the key used to identify the names of the variables in the net
     :df_samples_list: a Dataframe list in which every df contains a trajectory
-    :df_structure: Dataframe containing the structure of the network (edges)
-    :df_variables: Dataframe containing the nodes cardinalities
-    :df_concatenated_samples: the concatenation and processing of all the trajectories present in the list df_samples list
-    :sorter: the columns header(excluding the time column) of the Dataframe concatenated_samples
     """

     def __init__(self, file_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
@@ -37,10 +33,6 @@ class JsonImporter(ai.AbstractImporter):
         self.time_key = time_key
         self.variables_key = variables_key
         self.df_samples_list = []
-        self._df_structure = pd.DataFrame()
-        self._df_variables = pd.DataFrame()
-        self._concatenated_samples = None
-        self.sorter = None
         super(JsonImporter, self).__init__(file_path)

     def import_data(self):
@@ -53,10 +45,10 @@ class JsonImporter(ai.AbstractImporter):
         """
         raw_data = self.read_json_file()
         self.import_trajectories(raw_data)
-        self.compute_row_delta_in_all_samples_frames(self.time_key)
+        self.compute_row_delta_in_all_samples_frames(self.df_samples_list)
         self.clear_data_frame_list()
         self.import_structure(raw_data)
-        self.import_variables(raw_data, self.sorter)
+        self.import_variables(raw_data, self._sorter)

     def import_trajectories(self, raw_data: typing.List):
         """
@@ -79,7 +71,6 @@ class JsonImporter(ai.AbstractImporter):
         """
         self._df_structure = self.one_level_normalizing(raw_data, 0, self.structure_label)

     def import_variables(self, raw_data: typing.List, sorter: typing.List):
         """
         Imports the data in raw_data at the key variables_label.
@@ -92,13 +83,13 @@ class JsonImporter(ai.AbstractImporter):
            void
         """
         self._df_variables = self.one_level_normalizing(raw_data, 0, self.variables_label)
-        #self.sorter = self._df_variables[self.variables_key].to_list()
-        #self.sorter.sort()
-        #print("Sorter:", self.sorter)
+        #TODO: given the precondition that the _df_variables frame is ordered like self._sorter, this code is useless
         self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
         self._df_variables[self.variables_key] = self._df_variables[self.variables_key].cat.set_categories(sorter)
         self._df_variables = self._df_variables.sort_values([self.variables_key])
         self._df_variables.reset_index(inplace=True)
+        self._df_variables.drop('index', axis=1, inplace=True)
         print("Var Frame", self._df_variables)

     def read_json_file(self) -> typing.List:
@@ -111,15 +102,9 @@ class JsonImporter(ai.AbstractImporter):
            data: the contents of the json file
         """
-        #try:
-            #read_files = glob.glob(os.path.join(self.files_path, "*.json"))
-            #if not read_files:
-                #raise ValueError('No .json file found in the entered path!')
         with open(self.file_path) as f:
             data = json.load(f)
             return data
-        #except ValueError as err:
-            #print(err.args)

     def one_level_normalizing(self, raw_data: typing.List, indx: int, key: str) -> pd.DataFrame:
         """
""" """
@@ -152,82 +137,15 @@ class JsonImporter(ai.AbstractImporter):
         smps = raw_data[indx][trajectories_key]
         self.df_samples_list = [dataframe(sample) for sample in smps]
         columns_header = list(self.df_samples_list[0].columns.values)
-        #print("COLUMNs HEADER", columns_header)
         columns_header.remove(self.time_key)
-        self.sorter = columns_header
+        self._sorter = columns_header

-    def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame, time_header_label: str,
-                                              columns_header: typing.List, shifted_cols_header: typing.List) \
-            -> pd.DataFrame:
-        """
-        Computes the difference between each value present in th time column.
-        Copies and shift by one position up all the values present in the remaining columns.
-        Parameters:
-            sample_frame: the traj to be processed
-            time_header_label: the label for the times
-            columns_header: the original header of sample_frame
-            shifted_cols_header: a copy of columns_header with changed names of the contents
-        Returns:
-            sample_frame: the processed dataframe
-        """
-        sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
-        shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
-        #print(shifted_cols)
-        shifted_cols.columns = shifted_cols_header
-        sample_frame = sample_frame.assign(**shifted_cols)
-        sample_frame.drop(sample_frame.tail(1).index, inplace=True)
-        return sample_frame

-    def compute_row_delta_in_all_samples_frames(self, time_header_label: str):
-        """
-        Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list self.df_samples_list.
-        Concatenates the result in the dataframe concatanated_samples
-        Parameters:
-            time_header_label: the label of the time column
-        Returns:
-            void
-        """
-        shifted_cols_header = [s + "S" for s in self.sorter]
-        compute_row_delta = self.compute_row_delta_sigle_samples_frame
-        self.df_samples_list = [compute_row_delta(sample, time_header_label, self.sorter, shifted_cols_header)
-                                for sample in self.df_samples_list]
-        self._concatenated_samples = pd.concat(self.df_samples_list)
-        complete_header = self.sorter[:]
-        complete_header.insert(0,'Time')
-        complete_header.extend(shifted_cols_header)
-        #print("Complete Header", complete_header)
-        self._concatenated_samples = self._concatenated_samples[complete_header]
-        #print("Concat Samples",self._concatenated_samples)

-    def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
-        """
-        Builds a List containing the columns of dataframe and converts them to a numpy array.
-        Parameters:
-            :data_frame: the dataframe from which the columns have to be extracted and converted
-        Returns:
-            :columns_list: the resulting list of numpy arrays
-        """
-        columns_list = [data_frame[column].to_numpy() for column in data_frame]
-        #for column in data_frame:
-            #columns_list.append(data_frame[column].to_numpy())
-        return columns_list

-    def clear_concatenated_frame(self):
-        """
-        Removes all values in the dataframe concatenated_samples
-        Parameters:
-            void
-        Returns:
-            void
-        """
-        self._concatenated_samples = self._concatenated_samples.iloc[0:0]

     def clear_data_frame_list(self):
         """
         Removes all values present in the dataframes in the list df_samples_list
         """
-        for indx in range(len(self.df_samples_list)):  # the individual trajectories are no longer needed  #TODO: use a list comprehension
+        for indx in range(len(self.df_samples_list)):
             self.df_samples_list[indx] = self.df_samples_list[indx].iloc[0:0]

     def import_sampled_cims(self, raw_data: typing.List, indx: int, cims_key: str) -> typing.Dict:
@@ -239,17 +157,6 @@ class JsonImporter(ai.AbstractImporter):
                 cims_for_all_vars[var].append(pd.DataFrame(raw_data[indx][cims_key][var][p_comb]).to_numpy())
         return cims_for_all_vars

-    @property
-    def concatenated_samples(self):
-        return self._concatenated_samples

-    @property
-    def variables(self):
-        return self._df_variables

-    @property
-    def structure(self):
-        return self._df_structure
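One detail of import_variables worth spelling out: casting the name column to a pandas categorical whose categories are the sorter list makes sort_values reorder the variables frame to match the column order of the trajectories. A standalone sketch (the frame contents are illustrative, not taken from the real data):

    import pandas as pd

    sorter = ['X', 'Y', 'Z']  # dataset column order, time column excluded
    df_variables = pd.DataFrame({'Name': ['Z', 'X', 'Y'], 'Value': [3, 3, 3]})

    # With the categories set to sorter, sort_values follows the dataset
    # column order instead of alphabetical order.
    df_variables['Name'] = df_variables['Name'].astype('category')
    df_variables['Name'] = df_variables['Name'].cat.set_categories(sorter)
    df_variables = df_variables.sort_values(['Name'])
    df_variables.reset_index(inplace=True)
    df_variables.drop('index', axis=1, inplace=True)  # reset_index leaves an 'index' column behind
    print(df_variables['Name'].to_list())  # ['X', 'Y', 'Z']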

main_package/classes/sample_path.py

@@ -1,29 +1,28 @@
-import abstract_sample_path as asam
-import json_importer as imp
+import abstract_importer as imp
 import structure as st
 import trajectory as tr


-class SamplePath(asam.AbstractSamplePath):
+class SamplePath:
     """
     Aggregates all the information about the trajectories, the real structure of the sampled net and the variables
     cardinalities.
     Has the task of creating the objects that will contain the mentioned data.
-    :importer: the Importer objects that will import ad process data
     :trajectories: the Trajectory object that will contain all the concatenated trajectories
     :structure: the Structure object that will contain all the structural info about the net
     :total_variables_count: the number of variables in the net
     """

-    #def __init__(self, files_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
-                 #variables_key: str):
-    def __init__(self, importer: imp.JsonImporter):
-        #self.importer =importer
-        super().__init__(importer)
-        #self._trajectories = None
-        #self._structure = None
+    def __init__(self, importer: imp.AbstractImporter):
+        """
+        :importer: the Importer object that will import and process data
+        """
+        self.importer = importer
+        self._trajectories = None
+        self._structure = None
         self.total_variables_count = None

     def build_trajectories(self):
@@ -51,11 +50,15 @@ class SamplePath:
        Returns:
            void
        """
+        if self.importer.sorter != self.importer.variables.iloc[:, 0].to_list():
+            raise RuntimeError("The order of the Dataset columns has to match the order of the labels in the variables Frame!")
         self.total_variables_count = len(self.importer.sorter)
-        labels = self.importer.variables[self.importer.variables_key].to_list()
+        labels = self.importer.variables.iloc[:, 0].to_list()
         indxs = self.importer.variables.index.to_numpy()
-        vals = self.importer.variables['Value'].to_numpy()
+        vals = self.importer.variables.iloc[:, 1].to_numpy()
         edges = list(self.importer.structure.to_records(index=False))
         self._structure = st.Structure(labels, indxs, vals, edges,
                                        self.total_variables_count)
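With the RuntimeError guard and the positional iloc accesses, build_structure now relies only on the AbstractImporter contract (sorter, variables, structure) rather than on JsonImporter internals, so any importer subclass can drive a SamplePath. A usage sketch, assuming the data directory layout used by the test suite:

    import glob
    import os
    import json_importer as ji
    import sample_path as sp

    # locate the input file the same way the tests do ('../data' is an assumption)
    read_files = glob.glob(os.path.join('../data', "*.json"))
    importer = ji.JsonImporter(read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
    s1 = sp.SamplePath(importer)  # any AbstractImporter subclass fits here
    s1.build_trajectories()
    s1.build_structure()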

main_package/classes/set_of_cims.py

@@ -79,8 +79,8 @@ class SetOfCims:
         if mask_arr.size <= 1:
             return self.actual_cims
         else:
-            tmp_parents_comb_from_ids = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
-            return self.actual_cims[tmp_parents_comb_from_ids]
+            flat_indxs = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
+            return self.actual_cims[flat_indxs]

     @property
     def get_cims(self):
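The rename to flat_indxs also makes the filtering logic easier to read: the masked columns of p_combs are compared row-wise against comb, and the flattened argwhere result indexes the matching conditional intensity matrices. A small numpy sketch with made-up parent combinations:

    import numpy as np

    # Made-up combinations for two parents, each with cardinality 2.
    p_combs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    mask_arr = np.array([1])  # look only at the second parent's column
    comb = np.array([1])      # and require its value to be 1

    # Rows 1 and 3 match, so these positions would select the CIMs in actual_cims.
    flat_indxs = np.argwhere(np.all(p_combs[:, mask_arr] == comb, axis=1)).ravel()
    print(flat_indxs)  # [1 3]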

main_package/classes/simple_cvs_importer.py

@@ -0,0 +1,53 @@
+import pandas as pd
+import glob
+import os
+
+import abstract_importer as ai
+import sample_path as sp
+
+
+class CSVImporter(ai.AbstractImporter):
+
+    def __init__(self, file_path):
+        self._df_samples_list = None
+        super(CSVImporter, self).__init__(file_path)
+
+    def import_data(self):
+        self.read_csv_file()
+        self.import_variables()
+        self.import_structure()
+        self.compute_row_delta_in_all_samples_frames(self._df_samples_list)
+
+    def read_csv_file(self):
+        df = pd.read_csv(self.file_path)
+        df.drop(df.columns[[0]], axis=1, inplace=True)
+        self._df_samples_list = [df]
+
+    def import_variables(self):
+        variables_list = list(self._df_samples_list[0].columns)[1:]
+        self._sorter = variables_list
+        # every variable is assumed to have cardinality 3
+        values_list = [3 for var in variables_list]
+        # build the variables frame expected by SamplePath
+        data = {'Name': variables_list, 'Value': values_list}
+        self._df_variables = pd.DataFrame(data)
+
+    def import_structure(self):
+        data = {'From': ['X', 'Y', 'Z'], 'To': ['Z', 'Z', 'Y']}
+        self._df_structure = pd.DataFrame(data)
+
+
+read_files = glob.glob(os.path.join('../data', "*.csv"))
+print(read_files[0])
+csvimp = CSVImporter(read_files[0])
+#csvimp.import_data()
+s1 = sp.SamplePath(csvimp)
+s1.build_trajectories()
+s1.build_structure()
+print(s1.structure)
+print(s1.trajectories)

main_package/tests/test_json_importer.py

@@ -6,11 +6,9 @@ import glob
 import numpy as np
 import pandas as pd
 import json_importer as ji
 import json


 class TestJsonImporter(unittest.TestCase):
     @classmethod
@@ -26,10 +24,10 @@ class TestJsonImporter(unittest.TestCase):
         self.assertEqual(j1.variables_key, 'Name')
         self.assertEqual(j1.file_path, self.read_files[0])
         self.assertFalse(j1.df_samples_list)
-        self.assertTrue(j1.variables.empty)
-        self.assertTrue(j1.structure.empty)
-        self.assertFalse(j1.concatenated_samples)
-        self.assertFalse(j1.sorter)
+        self.assertIsNone(j1.variables)
+        self.assertIsNone(j1.structure)
+        self.assertIsNone(j1.concatenated_samples)
+        self.assertIsNone(j1.sorter)

     def test_read_json_file_found(self):
         data_set = {"key1": [1, 2, 3], "key2": [4, 5, 6]}
@@ -73,7 +71,7 @@ class TestJsonImporter(unittest.TestCase):
         sample_frame = j1.df_samples_list[0]
         columns_header = list(sample_frame.columns.values)
         shifted_cols_header = [s + "S" for s in columns_header[1:]]
-        new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, j1.time_key, columns_header[1:],
+        new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, columns_header[1:],
                                                                     shifted_cols_header)
         self.assertEqual(len(list(sample_frame.columns.values)) + len(shifted_cols_header),
                          len(list(new_sample_frame.columns.values)))
@@ -83,15 +81,16 @@ class TestJsonImporter(unittest.TestCase):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         j1.import_trajectories(raw_data)
-        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
-        self.assertEqual(list(j1.df_samples_list[0].columns.values), list(j1.concatenated_samples.columns.values))
+        j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
+        self.assertEqual(list(j1.df_samples_list[0].columns.values),
+                         list(j1.concatenated_samples.columns.values)[:len(list(j1.df_samples_list[0].columns.values))])
         self.assertEqual(list(j1.concatenated_samples.columns.values)[0], j1.time_key)

     def test_clear_data_frame_list(self):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         j1.import_trajectories(raw_data)
-        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
+        j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
         j1.clear_data_frame_list()
         for df in j1.df_samples_list:
             self.assertTrue(df.empty)
@@ -137,7 +136,7 @@ class TestJsonImporter(unittest.TestCase):
         j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
         raw_data = j1.read_json_file()
         cims = j1.import_sampled_cims(raw_data, 0, 'dyn.cims')
-        j1.import_variables(raw_data, ['X','Y','Z'])
+        j1.import_variables(raw_data, ['X','Y','Z'])  #TODO: this must not depend directly on this sorter
         self.assertEqual(list(cims.keys()), j1.variables['Name'].tolist())

     def test_import_data(self):