1
0
Fork 0

Refactored and Tested JsonImporter class

parallel_struct_est
philpMartin 4 years ago
parent 6617ddaa4f
commit 3c91af18ca
  1. 83
      main_package/classes/json_importer.py
  2. 2
      main_package/classes/sample_path.py
  3. 0
      main_package/tests/__init__.py
  4. 126
      main_package/tests/test_json_importer.py

@ -22,36 +22,42 @@ class JsonImporter(AbstractImporter):
""" """
def __init__(self, files_path): def __init__(self, files_path, samples_label, structure_label, variables_label, time_key, variables_key):
self.samples_label = samples_label
self.structure_label = structure_label
self.variables_label = variables_label
self.time_key = time_key
self.variables_key = variables_key
self.df_samples_list = [] self.df_samples_list = []
self._df_structure = pd.DataFrame() self._df_structure = pd.DataFrame()
self._df_variables = pd.DataFrame() self._df_variables = pd.DataFrame()
self._concatenated_samples = None self._concatenated_samples = None
self.sorter = None
super(JsonImporter, self).__init__(files_path) super(JsonImporter, self).__init__(files_path)
def import_data(self): def import_data(self):
raw_data = self.read_json_file() raw_data = self.read_json_file()
self.import_trajectories(raw_data) self.import_trajectories(raw_data)
self.compute_row_delta_in_all_samples_frames('Time') self.compute_row_delta_in_all_samples_frames(self.time_key)
self.clear_data_frame_list()
self.import_structure(raw_data) self.import_structure(raw_data)
self.import_variables(raw_data) self.import_variables(raw_data, self.sorter)
#Le variabili DEVONO essere ordinate come le Colonne del dataset
assert list(self._df_variables['Name']) == \
(list(self._concatenated_samples.columns.values[1:len(self.variables['Name']) + 1]))
def import_trajectories(self, raw_data): def import_trajectories(self, raw_data):
self.normalize_trajectories(raw_data, 0, 'samples') self.normalize_trajectories(raw_data, 0, self.samples_label)
def import_structure(self, raw_data): def import_structure(self, raw_data):
self._df_structure = self.one_level_normalizing(raw_data, 0, 'dyn.str') self._df_structure = self.one_level_normalizing(raw_data, 0, self.structure_label)
def import_variables(self, raw_data): def import_variables(self, raw_data, sorter):
self._df_variables = self.one_level_normalizing(raw_data, 0, 'variables') self._df_variables = self.one_level_normalizing(raw_data, 0, self.variables_label)
self._df_variables[self.variables_key] = self._df_variables[self.variables_key].astype("category")
self._df_variables[self.variables_key] = self._df_variables[self.variables_key].cat.set_categories(sorter)
self._df_variables = self._df_variables.sort_values([self.variables_key])
def read_json_file(self): def read_json_file(self):
""" """
Legge 'tutti' i file .json presenti nel path self.filepath Legge il primo file .json nel path self.filepath
Parameters: Parameters:
void void
@ -61,10 +67,11 @@ class JsonImporter(AbstractImporter):
""" """
try: try:
read_files = glob.glob(os.path.join(self.files_path, "*.json")) read_files = glob.glob(os.path.join(self.files_path, "*.json"))
for file_name in read_files: if not read_files:
with open(file_name) as f: raise ValueError('No .json file found in the entered path!')
data = json.load(f) with open(read_files[0]) as f:
return data data = json.load(f)
return data
except ValueError as err: except ValueError as err:
print(err.args) print(err.args)
@ -81,7 +88,7 @@ class JsonImporter(AbstractImporter):
Il dataframe contenente i dati normalizzati Il dataframe contenente i dati normalizzati
""" """
return pd.json_normalize(raw_data[indx][key]) return pd.DataFrame(raw_data[indx][key])
def normalize_trajectories(self, raw_data, indx, trajectories_key): def normalize_trajectories(self, raw_data, indx, trajectories_key):
""" """
@ -106,35 +113,13 @@ class JsonImporter(AbstractImporter):
def compute_row_delta_in_all_samples_frames(self, time_header_label): def compute_row_delta_in_all_samples_frames(self, time_header_label):
columns_header = list(self.df_samples_list[0].columns.values) columns_header = list(self.df_samples_list[0].columns.values)
self.sorter = columns_header[1:]
shifted_cols_header = [s + "S" for s in columns_header[1:]] shifted_cols_header = [s + "S" for s in columns_header[1:]]
for indx, sample in enumerate(self.df_samples_list): for indx, sample in enumerate(self.df_samples_list):
self.df_samples_list[indx] = self.compute_row_delta_sigle_samples_frame(sample, self.df_samples_list[indx] = self.compute_row_delta_sigle_samples_frame(sample,
time_header_label, columns_header, shifted_cols_header) time_header_label, columns_header, shifted_cols_header)
#print(self.df_samples_list[indx]) #print(self.df_samples_list[indx])
self._concatenated_samples = pd.concat(self.df_samples_list) self._concatenated_samples = pd.concat(self.df_samples_list)
#print("Concatenated", self._concatenated_samples)
for indx in range(len(self.df_samples_list)): # Le singole traj non servono più
self.df_samples_list[indx] = self.df_samples_list[indx].iloc[0:0]
def compute_row_delta_sigle_samples_frame(self, sample_frame):
columns_header = list(sample_frame.columns.values)
#print(columns_header)
for col_name in columns_header:
if col_name == 'Time':
sample_frame[col_name + 'Delta'] = sample_frame[col_name].diff()
#else:
#sample_frame[col_name + 'Delta'] = (sample_frame[col_name].diff().bfill() != 0).astype(int)
#sample_frame['Delta'] = sample_frame['Time'].diff()
#print(sample_frame)
def compute_row_delta_in_all_samples_frames(self):
for sample in self.df_samples_list:
self.compute_row_delta_sigle_samples_frame(sample)
self.concatenated_samples = pd.concat(self.df_samples_list)
self.concatenated_samples['Time'] = self.concatenated_samples['TimeDelta']
del self.concatenated_samples['TimeDelta']
self.concatenated_samples['Time'] = self.concatenated_samples['Time'].fillna(0)
def build_list_of_samples_array(self, data_frame): def build_list_of_samples_array(self, data_frame):
""" """
@ -160,6 +145,10 @@ class JsonImporter(AbstractImporter):
""" """
self._concatenated_samples = self._concatenated_samples.iloc[0:0] self._concatenated_samples = self._concatenated_samples.iloc[0:0]
def clear_data_frame_list(self):
    """
    Empties every data frame stored in self.df_samples_list.
    Each entry is replaced, at the same position of the same list object, by its
    zero-row slice, so the list keeps its length and every frame keeps its columns.

    Parameters:
        void
    Returns:
        void
    """
    # The single trajectories are no longer needed
    for indx, frame in enumerate(self.df_samples_list):
        self.df_samples_list[indx] = frame.iloc[0:0]
@property @property
def concatenated_samples(self): def concatenated_samples(self):
return self._concatenated_samples return self._concatenated_samples
@ -168,24 +157,10 @@ class JsonImporter(AbstractImporter):
def variables(self): def variables(self):
return self._df_variables return self._df_variables
@property @property
def structure(self): def structure(self):
return self._df_structure return self._df_structure
"""ij = JsonImporter("../data")
#raw_data = ij.read_json_file()
lp = LineProfiler()
lp_wrapper = lp(ij.import_data)
lp_wrapper()
lp.print_stats()
ij.import_data()
#print(ij.df_samples_list[7])
print(ij.df_structure)
print(ij.df_variables)
print(ij.concatenated_samples)"""

@ -45,7 +45,7 @@ os.chdir('..')
path = os.getcwd() + '/data' path = os.getcwd() + '/data'
"""os.getcwd() os.getcwd()
os.chdir('..') os.chdir('..')
path = os.getcwd() + '/data' path = os.getcwd() + '/data'

@ -0,0 +1,126 @@
import json
import os
import tempfile
import unittest

import numpy as np
import pandas as pd

import json_importer as ji
class TestJsonImporter(unittest.TestCase):
    """Unit tests for ``json_importer.JsonImporter``.

    Tests that read the real dataset look for it in ``../data`` relative to the
    working directory. Tests that need a throw-away JSON file create it inside a
    temporary directory, so the file is removed even when an assertion fails and
    the outcome never depends on what the working directory happens to contain.
    """

    @staticmethod
    def _data_importer(samples_label='samples'):
        """Return an importer for ``../data`` configured with the labels used by these tests."""
        return ji.JsonImporter('../data', samples_label, 'dyn.str', 'variables', 'Time', 'Name')

    def test_init(self):
        """The constructor stores its arguments and starts with empty containers."""
        path = os.getcwd()
        j1 = ji.JsonImporter(path, 'samples', 'dyn.str', 'variables', 'Time', 'Name')
        self.assertEqual(j1.samples_label, 'samples')
        self.assertEqual(j1.structure_label, 'dyn.str')
        self.assertEqual(j1.variables_label, 'variables')
        self.assertEqual(j1.time_key, 'Time')
        self.assertEqual(j1.variables_key, 'Name')
        self.assertEqual(j1.files_path, path)
        self.assertFalse(j1.df_samples_list)
        self.assertTrue(j1.variables.empty)
        self.assertTrue(j1.structure.empty)
        self.assertIsNone(j1.concatenated_samples)

    def test_read_json_file_found(self):
        """A .json file present in the given directory is loaded and returned."""
        data_set = {"key1": [1, 2, 3], "key2": [4, 5, 6]}
        # A private directory guarantees data.json is the only candidate file and
        # that it is deleted even if the importer raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            with open(os.path.join(tmp_dir, 'data.json'), 'w') as f:
                json.dump(data_set, f)
            j1 = ji.JsonImporter(tmp_dir, '', '', '', '', '')
            imported_data = j1.read_json_file()
        self.assertEqual(self.ordered(data_set), self.ordered(imported_data))

    def test_read_json_file_not_found(self):
        """With no .json file in the directory, read_json_file returns None."""
        # An empty temporary directory makes the "no file" precondition explicit.
        with tempfile.TemporaryDirectory() as tmp_dir:
            j1 = ji.JsonImporter(tmp_dir, '', '', '', '', '')
            self.assertIsNone(j1.read_json_file())

    def test_normalize_trajectories(self):
        """One data frame is produced for every trajectory in the raw data."""
        j1 = self._data_importer()
        raw_data = j1.read_json_file()
        j1.normalize_trajectories(raw_data, 0, j1.samples_label)
        self.assertEqual(len(j1.df_samples_list), len(raw_data[0][j1.samples_label]))

    def test_normalize_trajectories_wrong_indx(self):
        """An out-of-range dataset index raises IndexError."""
        j1 = self._data_importer()
        raw_data = j1.read_json_file()
        with self.assertRaises(IndexError):
            j1.normalize_trajectories(raw_data, 1, j1.samples_label)

    def test_normalize_trajectories_wrong_key(self):
        """A samples label that is not a key of the raw data raises KeyError."""
        j1 = self._data_importer(samples_label='sample')
        raw_data = j1.read_json_file()
        with self.assertRaises(KeyError):
            j1.normalize_trajectories(raw_data, 0, j1.samples_label)

    def test_compute_row_delta_single_samples_frame(self):
        """The processed frame gains one shifted column per variable and loses one row."""
        j1 = self._data_importer()
        raw_data = j1.read_json_file()
        j1.normalize_trajectories(raw_data, 0, j1.samples_label)
        sample_frame = j1.df_samples_list[0]
        columns_header = list(sample_frame.columns.values)
        shifted_cols_header = [s + "S" for s in columns_header[1:]]
        new_sample_frame = j1.compute_row_delta_sigle_samples_frame(sample_frame, j1.time_key, columns_header,
                                                                    shifted_cols_header)
        self.assertEqual(len(list(sample_frame.columns.values)) + len(shifted_cols_header),
                         len(list(new_sample_frame.columns.values)))
        self.assertEqual(sample_frame.shape[0] - 1, new_sample_frame.shape[0])

    def test_compute_row_delta_in_all_frames(self):
        """The concatenated frame has the same columns as each processed trajectory."""
        j1 = self._data_importer()
        raw_data = j1.read_json_file()
        j1.import_trajectories(raw_data)
        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
        self.assertEqual(list(j1.df_samples_list[0].columns.values),
                         list(j1.concatenated_samples.columns.values))

    def test_clear_data_frame_list(self):
        """After clearing, every per-trajectory frame is empty."""
        j1 = self._data_importer()
        raw_data = j1.read_json_file()
        j1.import_trajectories(raw_data)
        j1.compute_row_delta_in_all_samples_frames(j1.time_key)
        j1.clear_data_frame_list()
        for df in j1.df_samples_list:
            self.assertTrue(df.empty)

    def test_build_list_of_samples_array(self):
        """Each column of the frame is returned as an array with the column's values."""
        data_set = {"key1": [1, 2, 3], "key2": [4.1, 5.2, 6.3]}
        with tempfile.TemporaryDirectory() as tmp_dir:
            with open(os.path.join(tmp_dir, 'data.json'), 'w') as f:
                json.dump(data_set, f)
            j1 = ji.JsonImporter(tmp_dir, '', '', '', '', '')
            raw_data = j1.read_json_file()
        frame = pd.DataFrame(raw_data)
        col_list = j1.build_list_of_samples_array(frame)
        forced_list = [np.array(values) for values in data_set.values()]
        # zip() stops at the shorter input, so check the lengths first; otherwise
        # an empty result would pass the element-wise comparison vacuously.
        self.assertEqual(len(col_list), len(forced_list))
        for a1, a2 in zip(col_list, forced_list):
            self.assertTrue(np.array_equal(a1, a2))

    def test_import_variables(self):
        """Variables are reordered to follow the order given by the sorter."""
        j1 = self._data_importer()
        raw_data = [{'variables': {"Name": ['Z', 'Y', 'X'], "value": [3, 3, 3]}}]
        sorter = ['X', 'Y', 'Z']
        j1.import_variables(raw_data, sorter)
        self.assertEqual(list(j1.variables[j1.variables_key]), sorter)

    def test_import_data(self):
        """After a full import the variables are listed in dataset-column order."""
        j1 = self._data_importer()
        j1.import_data()
        n_variables = len(j1.variables)
        # Column 0 of the concatenated samples is the time column; the variable
        # columns follow it, and the variables frame must list them in that order.
        self.assertEqual(list(j1.variables[j1.variables_key]),
                         list(j1.concatenated_samples.columns.values[1:n_variables + 1]))

    def ordered(self, obj):
        """Return a canonical form of ``obj`` for order-insensitive comparison.

        Dicts become sorted lists of ``(key, ordered(value))`` pairs, lists become
        sorted lists of their canonicalised items, anything else is returned as is.
        """
        if isinstance(obj, dict):
            return sorted((k, self.ordered(v)) for k, v in obj.items())
        if isinstance(obj, list):
            return sorted(self.ordered(x) for x in obj)
        return obj
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()