
Refactor the AbstractImporter class to achieve better decoupling from SamplePath

parallel_struct_est
philpMartin 4 years ago
parent 8e1666a607
commit f32b79590f
  1. main_package/classes/abstract_importer.py (114 changed lines)
  2. main_package/classes/json_importer.py (109 changed lines)
  3. main_package/classes/sample_path.py (31 changed lines)
  4. main_package/classes/set_of_cims.py (4 changed lines)
  5. main_package/classes/simple_cvs_importer.py (53 changed lines)
  6. main_package/tests/test_json_importer.py (21 changed lines)

@@ -1,4 +1,6 @@
from abc import ABC, abstractmethod
import pandas as pd
import typing
class AbstractImporter(ABC):
@@ -6,13 +8,22 @@ class AbstractImporter(ABC):
Interface that exposes all the necessary methods to import the trajectories and the net structure.
:file_path: the file path
:_concatenated_samples: the concatenation of all the processed trajectories
:df_structure: Dataframe containing the structure of the network (edges)
:df_variables: Dataframe containing the nodes cardinalities
:df_concatenated_samples: the concatenation and processing of all the trajectories present in the df_samples list
:sorter: the columns header (excluding the time column) of the Dataframe concatenated_samples
"""
def __init__(self, file_path: str):
self.file_path = file_path
self._df_variables = None
self._df_structure = None
self._concatenated_samples = None
self._sorter = None
super().__init__()
"""
@abstractmethod
def import_trajectories(self, raw_data):
pass
@@ -20,4 +31,105 @@ class AbstractImporter(ABC):
@abstractmethod
def import_structure(self, raw_data):
pass
"""
@abstractmethod
def import_data(self):
"""
Imports and prepares all the data needed for subsequent computation.
Parameters:
void
Returns:
void
POSTCONDITION: the class members self._df_variables and self._df_structure HAVE to be properly constructed
as Pandas Dataframes
"""
pass
def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame,
columns_header: typing.List, shifted_cols_header: typing.List) \
-> pd.DataFrame:
"""
Computes the difference between each value present in the time column.
Copies and shifts up by one position all the values present in the remaining columns.
PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
[Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
Parameters:
sample_frame: the trajectory to be processed
columns_header: the original header of sample_frame
shifted_cols_header: a copy of columns_header with changed names of the contents
Returns:
sample_frame: the processed dataframe
"""
#sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
shifted_cols.columns = shifted_cols_header
sample_frame = sample_frame.assign(**shifted_cols)
sample_frame.drop(sample_frame.tail(1).index, inplace=True)
return sample_frame
def compute_row_delta_in_all_samples_frames(self, df_samples_list: typing.List):
"""
Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list df_samples_list.
Concatenates the results into the dataframe concatenated_samples
PREREQUISITE: the Dataframe in input has to follow the column structure of this header:
[Time|Variable values], so it is assumed TIME is ALWAYS the FIRST column.
The class member self._sorter HAS to be properly INITIALIZED
Parameters:
df_samples_list: the list of dataframes to be processed and concatenated
Returns:
void
"""
shifted_cols_header = [s + "S" for s in self._sorter]
compute_row_delta = self.compute_row_delta_sigle_samples_frame
proc_samples_list = [compute_row_delta(sample, self._sorter, shifted_cols_header)
for sample in df_samples_list]
self._concatenated_samples = pd.concat(proc_samples_list)
complete_header = self._sorter[:]
complete_header.insert(0,'Time')
complete_header.extend(shifted_cols_header)
#print("Complete Header", complete_header)
self._concatenated_samples = self._concatenated_samples[complete_header]
#print("Concat Samples",self._concatenated_samples)
def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
"""
Builds a List containing the columns of the dataframe, each converted to a numpy array.
Parameters:
:data_frame: the dataframe from which the columns have to be extracted and converted
Returns:
:columns_list: the resulting list of numpy arrays
"""
columns_list = [data_frame[column].to_numpy() for column in data_frame]
return columns_list
def clear_concatenated_frame(self):
"""
Removes all values in the dataframe concatenated_samples
Parameters:
void
Returns:
void
"""
self._concatenated_samples = self._concatenated_samples.iloc[0:0]
@property
def concatenated_samples(self) -> pd.DataFrame:
return self._concatenated_samples
@property
def variables(self) -> pd.DataFrame:
return self._df_variables
@property
def structure(self) -> pd.DataFrame:
return self._df_structure
@property
def sorter(self):
return self._sorter
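As a reading aid, here is a minimal standalone sketch (not part of the commit) of the transform compute_row_delta_sigle_samples_frame applies; the toy trajectory and the X/Y column names are illustrative:

import pandas as pd

sample_frame = pd.DataFrame({'Time': [0.0, 0.5, 1.2], 'X': [0, 1, 1], 'Y': [2, 2, 0]})
columns_header = ['X', 'Y']
shifted_cols_header = ['XS', 'YS']
# Time becomes the delta spent in each state instead of an absolute timestamp
sample_frame.iloc[:, 0] = sample_frame.iloc[:, 0].diff().shift(-1)
# the value columns are copied and shifted up by one position ("next state")
shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
shifted_cols.columns = shifted_cols_header
sample_frame = sample_frame.assign(**shifted_cols)
sample_frame.drop(sample_frame.tail(1).index, inplace=True)  # last row has no successor
print(sample_frame)
#    Time  X  Y  XS  YS
# 0   0.5  0  2   1   2
# 1   0.7  1  2   1   0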

@@ -23,10 +23,6 @@ class JsonImporter(ai.AbstractImporter):
:time_key: the key used to identify the timestamps in each trajectory
:variables_key: the key used to identify the names of the variables in the net
:df_samples_list: a Dataframe list in which every df contains a trajectory
:df_structure: Dataframe containing the structure of the network (edges)
:df_variables: Dataframe containing the nodes cardinalities
:df_concatenated_samples: the concatenation and processing of all the trajectories present in the list df_samples list
:sorter: the columns header(excluding the time column) of the Dataframe concatenated_samples
"""
def __init__(self, file_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
@@ -37,10 +33,6 @@ class JsonImporter(ai.AbstractImporter):
self.time_key = time_key
self.variables_key = variables_key
self.df_samples_list = []
self._df_structure = pd.DataFrame()
self._df_variables = pd.DataFrame()
self._concatenated_samples = None
self.sorter = None
super(JsonImporter, self).__init__(file_path)
def import_data(self):
@@ -53,10 +45,10 @@ class JsonImporter(ai.AbstractImporter):
"""
raw_data = self.read_json_file()
self.import_trajectories(raw_data)
self.compute_row_delta_in_all_samples_frames(self.time_key)
self.compute_row_delta_in_all_samples_frames(self.df_samples_list)
self.clear_data_frame_list()
self.import_structure(raw_data)
self.import_variables(raw_data, self.sorter)
self.import_variables(raw_data, self._sorter)
def import_trajectories(self, raw_data: typing.List):
"""
@@ -79,7 +71,6 @@
"""
self._df_structure = self.one_level_normalizing(raw_data, 0, self.structure_label)
def import_variables(self, raw_data: typing.List, sorter: typing.List):
"""
Imports the data in raw_data at the key variables_label.
@@ -92,13 +83,13 @@
void
"""
self._df_variables = self.one_level_normalizing(raw_data, 0, self.variables_label)
#self.sorter = self._df_variables[self.variables_key].to_list()
#self.sorter.sort()
#print("Sorter:", self.sorter)
#TODO with the precondition that the _df_variables frame is ordered the same way as
#TODO self._sorter, this code becomes useless
self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
self._df_variables[self.variables_key] = self._df_variables[self.variables_key].cat.set_categories(sorter)
self._df_variables = self._df_variables.sort_values([self.variables_key])
self._df_variables.reset_index(inplace=True)
self._df_variables.drop('index', axis=1, inplace=True)
print("Var Frame", self._df_variables)
def read_json_file(self) -> typing.List:
@@ -111,15 +102,9 @@ class JsonImporter(ai.AbstractImporter):
data: the contents of the json file
"""
#try:
#read_files = glob.glob(os.path.join(self.files_path, "*.json"))
#if not read_files:
#raise ValueError('No .json file found in the entered path!')
with open(self.file_path) as f:
data = json.load(f)
return data
#except ValueError as err:
#print(err.args)
def one_level_normalizing(self, raw_data: typing.List, indx: int, key: str) -> pd.DataFrame:
"""
@@ -152,82 +137,15 @@ class JsonImporter(ai.AbstractImporter):
smps = raw_data[indx][trajectories_key]
self.df_samples_list = [dataframe(sample) for sample in smps]
columns_header = list(self.df_samples_list[0].columns.values)
#print("COLUMNs HEADER", columns_header)
columns_header.remove(self.time_key)
self.sorter = columns_header
def compute_row_delta_sigle_samples_frame(self, sample_frame: pd.DataFrame, time_header_label: str,
columns_header: typing.List, shifted_cols_header: typing.List) \
-> pd.DataFrame:
"""
Computes the difference between each value present in the time column.
Copies and shifts up by one position all the values present in the remaining columns.
Parameters:
sample_frame: the trajectory to be processed
time_header_label: the label for the times
columns_header: the original header of sample_frame
shifted_cols_header: a copy of columns_header with changed names of the contents
Returns:
sample_frame: the processed dataframe
"""
sample_frame[time_header_label] = sample_frame[time_header_label].diff().shift(-1)
shifted_cols = sample_frame[columns_header].shift(-1).fillna(0).astype('int32')
#print(shifted_cols)
shifted_cols.columns = shifted_cols_header
sample_frame = sample_frame.assign(**shifted_cols)
sample_frame.drop(sample_frame.tail(1).index, inplace=True)
return sample_frame
def compute_row_delta_in_all_samples_frames(self, time_header_label: str):
"""
Calls the method compute_row_delta_sigle_samples_frame on every dataframe present in the list self.df_samples_list.
Concatenates the results into the dataframe concatenated_samples
Parameters:
time_header_label: the label of the time column
Returns:
void
"""
shifted_cols_header = [s + "S" for s in self.sorter]
compute_row_delta = self.compute_row_delta_sigle_samples_frame
self.df_samples_list = [compute_row_delta(sample, time_header_label, self.sorter, shifted_cols_header)
for sample in self.df_samples_list]
self._concatenated_samples = pd.concat(self.df_samples_list)
complete_header = self.sorter[:]
complete_header.insert(0,'Time')
complete_header.extend(shifted_cols_header)
#print("Complete Header", complete_header)
self._concatenated_samples = self._concatenated_samples[complete_header]
#print("Concat Samples",self._concatenated_samples)
def build_list_of_samples_array(self, data_frame: pd.DataFrame) -> typing.List:
"""
Builds a List containing the columns of the dataframe, each converted to a numpy array.
Parameters:
:data_frame: the dataframe from which the columns have to be extracted and converted
Returns:
:columns_list: the resulting list of numpy arrays
"""
columns_list = [data_frame[column].to_numpy() for column in data_frame]
#for column in data_frame:
#columns_list.append(data_frame[column].to_numpy())
return columns_list
def clear_concatenated_frame(self):
"""
Removes all values in the dataframe concatenated_samples
Parameters:
void
Returns:
void
"""
self._concatenated_samples = self._concatenated_samples.iloc[0:0]
self._sorter = columns_header
def clear_data_frame_list(self):
"""
Removes all values present in the dataframes in the list df_samples_list
"""
for indx in range(len(self.df_samples_list)): # the single trajectories are no longer needed #TODO use a list comprehension
for indx in range(len(self.df_samples_list)):
self.df_samples_list[indx] = self.df_samples_list[indx].iloc[0:0]
def import_sampled_cims(self, raw_data: typing.List, indx: int, cims_key: str) -> typing.Dict:
@@ -239,17 +157,6 @@ class JsonImporter(ai.AbstractImporter):
cims_for_all_vars[var].append(pd.DataFrame(raw_data[indx][cims_key][var][p_comb]).to_numpy())
return cims_for_all_vars
@property
def concatenated_samples(self):
return self._concatenated_samples
@property
def variables(self):
return self._df_variables
@property
def structure(self):
return self._df_structure
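A hedged usage sketch of the refactored importer; the glob pattern and constructor arguments mirror the ones used in the tests further below and assume a suitable .json dataset in ../data:

import glob
import os
import json_importer as ji

read_files = glob.glob(os.path.join('../data', "*.json"))
j1 = ji.JsonImporter(read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
j1.import_data()  # postcondition: _df_variables and _df_structure are built
print(j1.variables)             # node labels and cardinalities
print(j1.structure)             # edges of the net
print(j1.concatenated_samples)  # [Time|original values|shifted values]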

@@ -1,29 +1,28 @@
import abstract_sample_path as asam
import json_importer as imp
import abstract_importer as imp
import structure as st
import trajectory as tr
class SamplePath(asam.AbstractSamplePath):
class SamplePath:
"""
Aggregates all the information about the trajectories, the real structure of the sampled net and the variables'
cardinalities.
Has the task of creating the objects that will contain the mentioned data.
:importer: the Importer object that will import and process data
:trajectories: the Trajectory object that will contain all the concatenated trajectories
:structure: the Structure object that will contain all the structural info about the net
:total_variables_count: the number of variables in the net
"""
#def __init__(self, files_path: str, samples_label: str, structure_label: str, variables_label: str, time_key: str,
#variables_key: str):
def __init__(self, importer: imp.JsonImporter):
#self.importer =importer
super().__init__(importer)
#self._trajectories = None
#self._structure = None
def __init__(self, importer: imp.AbstractImporter):
"""
:importer: the Importer object that will import and process data
"""
self.importer = importer
self._trajectories = None
self._structure = None
self.total_variables_count = None
def build_trajectories(self):
@@ -51,11 +50,15 @@ class SamplePath(asam.AbstractSamplePath):
Returns:
void
"""
if self.importer.sorter != self.importer.variables.iloc[:, 0].to_list():
raise RuntimeError("The Dataset columns order has to match the order of labels in the variables Frame!")
self.total_variables_count = len(self.importer.sorter)
labels = self.importer.variables[self.importer.variables_key].to_list()
#labels = self.importer.variables[self.importer.variables_key].to_list()
#print("SAMPLE PATH LABELS",labels)
#print(self.importer.variables)
labels = self.importer.variables.iloc[:, 0].to_list()
indxs = self.importer.variables.index.to_numpy()
vals = self.importer.variables['Value'].to_numpy()
vals = self.importer.variables.iloc[:, 1].to_numpy()
edges = list(self.importer.structure.to_records(index=False))
self._structure = st.Structure(labels, indxs, vals, edges,
self.total_variables_count)
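build_structure now checks that the importer's sorter agrees with the ordering of the variables frame; a small sketch of the failure mode, with illustrative labels:

import pandas as pd

sorter = ['X', 'Y', 'Z']  # column order of the trajectories
variables = pd.DataFrame({'Name': ['Y', 'X', 'Z'], 'Value': [3, 3, 3]})  # mismatched order
if sorter != variables.iloc[:, 0].to_list():
    raise RuntimeError("The Dataset columns order has to match the order of labels in the variables Frame!")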

@@ -79,8 +79,8 @@ class SetOfCims:
if mask_arr.size <= 1:
return self.actual_cims
else:
tmp_parents_comb_from_ids = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
return self.actual_cims[tmp_parents_comb_from_ids]
flat_indxs = np.argwhere(np.all(self.p_combs[:, mask_arr] == comb, axis=1)).ravel()
return self.actual_cims[flat_indxs]
@property
def get_cims(self):
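The renamed flat_indxs makes the intent of this lookup clearer: it selects the rows of p_combs whose masked columns equal the requested parents' combination. A self-contained numpy sketch with toy values:

import numpy as np

p_combs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # one row per parents' combination
mask_arr = np.array([1])  # consider only the second parent
comb = np.array([1])      # requested value for that parent
flat_indxs = np.argwhere(np.all(p_combs[:, mask_arr] == comb, axis=1)).ravel()
print(flat_indxs)  # [1 3]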

@@ -0,0 +1,53 @@
import pandas as pd
import glob
import os
import abstract_importer as ai
import sample_path as sp
class CSVImporter(ai.AbstractImporter):
def __init__(self, file_path):
self._df_samples_list = None
super(CSVImporter, self).__init__(file_path)
def import_data(self):
self.read_csv_file()
self.import_variables()
self.import_structure()
self.compute_row_delta_in_all_samples_frames(self._df_samples_list)
def read_csv_file(self):
df = pd.read_csv(self.file_path)
df.drop(df.columns[[0]], axis=1, inplace=True)
self._df_samples_list = [df]
def import_variables(self):
variables_list = list(self._df_samples_list[0].columns)[1:]
#wrong_vars_labels = ['Y','Z','X']
self._sorter = variables_list
values_list = [3 for var in variables_list]
# initialize list of lists
data = {'Name':variables_list, 'Value':values_list}
# Create the pandas DataFrame
self._df_variables = pd.DataFrame(data)
def import_structure(self):
data = {'From':['X','Y','Z'], 'To':['Z','Z','Y']}
self._df_structure = pd.DataFrame(data)
read_files = glob.glob(os.path.join('../data', "*.csv"))
print(read_files[0])
csvimp = CSVImporter(read_files[0])
#csvimp.import_data()
s1 = sp.SamplePath(csvimp)
s1.build_trajectories()
s1.build_structure()
print(s1.structure)
print(s1.trajectories)
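CSVImporter doubles as a template for new importers. Below is a condensed, hypothetical sketch of the contract a subclass has to honour, inferred from the import_data postcondition and from the members SamplePath reads:

import pandas as pd

import abstract_importer as ai

class MinimalImporter(ai.AbstractImporter):
    """Hypothetical importer with hard-coded frames, only to show the contract."""
    def import_data(self):
        self._sorter = ['X', 'Y']  # samples column order, Time excluded
        self._df_variables = pd.DataFrame({'Name': ['X', 'Y'], 'Value': [2, 2]})
        self._df_structure = pd.DataFrame({'From': ['X'], 'To': ['Y']})
        samples = [pd.DataFrame({'Time': [0.0, 0.4, 1.0], 'X': [0, 1, 1], 'Y': [0, 0, 1]})]
        self.compute_row_delta_in_all_samples_frames(samples)

imp = MinimalImporter(file_path='')  # file_path is unused in this toy
imp.import_data()
print(imp.concatenated_samples)  # columns: Time, X, Y, XS, YS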

@@ -6,11 +6,9 @@ import glob
import numpy as np
import pandas as pd
import json_importer as ji
import json
class TestJsonImporter(unittest.TestCase):
@classmethod
@@ -26,10 +24,10 @@ class TestJsonImporter(unittest.TestCase):
self.assertEqual(j1.variables_key, 'Name')
self.assertEqual(j1.file_path, self.read_files[0])
self.assertFalse(j1.df_samples_list)
self.assertTrue(j1.variables.empty)
self.assertTrue(j1.structure.empty)
self.assertFalse(j1.concatenated_samples)
self.assertFalse(j1.sorter)
self.assertIsNone(j1.variables)
self.assertIsNone(j1.structure)
self.assertIsNone(j1.concatenated_samples)
self.assertIsNone(j1.sorter)
def test_read_json_file_found(self):
data_set = {"key1": [1, 2, 3], "key2": [4, 5, 6]}
@@ -73,7 +71,7 @@ class TestJsonImporter(unittest.TestCase):
sample_frame = j1.df_samples_list[0]
columns_header = list(sample_frame.columns.values)
shifted_cols_header = [s + "S" for s in columns_header[1:]]
new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, j1.time_key, columns_header[1:],
new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, columns_header[1:],
shifted_cols_header)
self.assertEqual(len(list(sample_frame.columns.values)) + len(shifted_cols_header),
len(list(new_sample_frame.columns.values)))
@@ -83,15 +81,16 @@ class TestJsonImporter(unittest.TestCase):
j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
raw_data = j1.read_json_file()
j1.import_trajectories(raw_data)
j1.compute_row_delta_in_all_samples_frames(j1.time_key)
self.assertEqual(list(j1.df_samples_list[0].columns.values), list(j1.concatenated_samples.columns.values))
j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
self.assertEqual(list(j1.df_samples_list[0].columns.values),
list(j1.concatenated_samples.columns.values)[:len(list(j1.df_samples_list[0].columns.values))])
self.assertEqual(list(j1.concatenated_samples.columns.values)[0], j1.time_key)
def test_clear_data_frame_list(self):
j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
raw_data = j1.read_json_file()
j1.import_trajectories(raw_data)
j1.compute_row_delta_in_all_samples_frames(j1.time_key)
j1.compute_row_delta_in_all_samples_frames(j1.df_samples_list)
j1.clear_data_frame_list()
for df in j1.df_samples_list:
self.assertTrue(df.empty)
@@ -137,7 +136,7 @@ class TestJsonImporter(unittest.TestCase):
j1 = ji.JsonImporter(self.read_files[0], 'samples', 'dyn.str', 'variables', 'Time', 'Name')
raw_data = j1.read_json_file()
cims = j1.import_sampled_cims(raw_data, 0, 'dyn.cims')
j1.import_variables(raw_data, ['X','Y','Z'])
j1.import_variables(raw_data, ['X','Y','Z']) #TODO it CANNOT depend directly on this sorter
self.assertEqual(list(cims.keys()), j1.variables['Name'].tolist())
def test_import_data(self):