From adb0f99419fb6dd2920d385f069e61f61052a8ed Mon Sep 17 00:00:00 2001
From: AlessandroBregoli
Date: Tue, 7 Feb 2023 14:34:58 +0100
Subject: [PATCH] Added automatic stopping for MonteCarloReward

---
 reCTBN/src/parameter_learning.rs       |  8 +-
 reCTBN/src/reward.rs                   |  2 +-
 reCTBN/src/reward/reward_evaluation.rs | 97 +++++++++++++------
 .../score_based_algorithm.rs           |  2 +-
 reCTBN/tests/reward_evaluation.rs      |  6 +-
 5 files changed, 73 insertions(+), 42 deletions(-)

diff --git a/reCTBN/src/parameter_learning.rs b/reCTBN/src/parameter_learning.rs
index 7e45e58..3c34d06 100644
--- a/reCTBN/src/parameter_learning.rs
+++ b/reCTBN/src/parameter_learning.rs
@@ -144,11 +144,9 @@ impl ParameterLearning for BayesianApproach {
             .zip(M.mapv(|x| x as f64).axis_iter(Axis(2)))
             .for_each(|(mut C, m)| C.assign(&(&m.mapv(|y| y + alpha) / &T.mapv(|y| y + tau))));
 
-
-        CIM.outer_iter_mut()
-            .for_each(|mut C| {
-                C.diag_mut().fill(0.0);
-            });
+        CIM.outer_iter_mut().for_each(|mut C| {
+            C.diag_mut().fill(0.0);
+        });
 
         //Set the diagonal of the inner matrices to the the row sum multiplied by -1
         let tmp_diag_sum: Array2<f64> = CIM.sum_axis(Axis(2)).mapv(|x| x * -1.0);
diff --git a/reCTBN/src/reward.rs b/reCTBN/src/reward.rs
index f0edf2f..910954c 100644
--- a/reCTBN/src/reward.rs
+++ b/reCTBN/src/reward.rs
@@ -1,5 +1,5 @@
-pub mod reward_function;
 pub mod reward_evaluation;
+pub mod reward_function;
 
 use std::collections::HashMap;
 
diff --git a/reCTBN/src/reward/reward_evaluation.rs b/reCTBN/src/reward/reward_evaluation.rs
index 431efde..3802489 100644
--- a/reCTBN/src/reward/reward_evaluation.rs
+++ b/reCTBN/src/reward/reward_evaluation.rs
@@ -1,11 +1,11 @@
 use std::collections::HashMap;
 
 use rayon::prelude::{IntoParallelIterator, ParallelIterator};
+use statrs::distribution::ContinuousCDF;
 
 use crate::params::{self, ParamsTrait};
 use crate::process;
 
-
 use crate::{
     process::NetworkProcessState,
     reward::RewardEvaluation,
@@ -14,11 +14,13 @@
 pub enum RewardCriteria {
     FiniteHorizon,
-    InfiniteHorizon {discount_factor: f64},
+    InfiniteHorizon { discount_factor: f64 },
 }
 
 pub struct MonteCarloReward {
-    n_iterations: usize,
+    max_iterations: usize,
+    max_err_stop: f64,
+    alpha_stop: f64,
     end_time: f64,
     reward_criteria: RewardCriteria,
     seed: Option<u64>,
 }
@@ -26,13 +28,17 @@
 impl MonteCarloReward {
     pub fn new(
-        n_iterations: usize,
+        max_iterations: usize,
+        max_err_stop: f64,
+        alpha_stop: f64,
         end_time: f64,
         reward_criteria: RewardCriteria,
         seed: Option<u64>,
     ) -> MonteCarloReward {
         MonteCarloReward {
-            n_iterations,
+            max_iterations,
+            max_err_stop,
+            alpha_stop,
             end_time,
             reward_criteria,
             seed,
         }
@@ -58,7 +64,8 @@ impl RewardEvaluation for MonteCarloReward {
 
         let n_states: usize = variables_domain.iter().map(|x| x.len()).product();
 
-        (0..n_states).into_par_iter()
+        (0..n_states)
+            .into_par_iter()
             .map(|s| {
                 let state: process::NetworkProcessState = variables_domain
                     .iter()
@@ -85,10 +92,13 @@ impl RewardEvaluation for MonteCarloReward {
     ) -> f64 {
         let mut sampler =
             ForwardSampler::new(network_process, self.seed.clone(), Some(state.clone()));
-        let mut ret = 0.0;
+        let mut expected_value = 0.0;
+        let mut squared_expected_value = 0.0;
+        let normal = statrs::distribution::Normal::new(0.0, 1.0).unwrap();
 
-        for _i in 0..self.n_iterations {
+        for i in 0..self.max_iterations {
             sampler.reset();
+            let mut ret = 0.0;
             let mut previous = sampler.next().unwrap();
             while previous.t < self.end_time {
                 let current = sampler.next().unwrap();
@@ -96,7 +106,7 @@
                     let r = reward_function.call(&previous.state, None);
                     let discount = match self.reward_criteria {
                         RewardCriteria::FiniteHorizon => self.end_time - previous.t,
-                        RewardCriteria::InfiniteHorizon {discount_factor} => {
+                        RewardCriteria::InfiniteHorizon { discount_factor } => {
                             std::f64::consts::E.powf(-discount_factor * previous.t)
                                 - std::f64::consts::E.powf(-discount_factor * self.end_time)
                         }
@@ -105,8 +115,8 @@
                 } else {
                     let r = reward_function.call(&previous.state, Some(&current.state));
                     let discount = match self.reward_criteria {
-                        RewardCriteria::FiniteHorizon => current.t-previous.t,
-                        RewardCriteria::InfiniteHorizon {discount_factor} => {
+                        RewardCriteria::FiniteHorizon => current.t - previous.t,
+                        RewardCriteria::InfiniteHorizon { discount_factor } => {
                             std::f64::consts::E.powf(-discount_factor * previous.t)
                                 - std::f64::consts::E.powf(-discount_factor * current.t)
                         }
@@ -114,51 +124,74 @@
                     ret += discount * r.instantaneous_reward;
                     ret += match self.reward_criteria {
                         RewardCriteria::FiniteHorizon => 1.0,
-                        RewardCriteria::InfiniteHorizon {discount_factor} => {
+                        RewardCriteria::InfiniteHorizon { discount_factor } => {
                             std::f64::consts::E.powf(-discount_factor * current.t)
                         }
                     } * r.transition_reward;
                 }
                 previous = current;
             }
+
+            let float_i = i as f64;
+            expected_value =
+                expected_value * float_i as f64 / (float_i + 1.0) + ret / (float_i + 1.0);
+            squared_expected_value = squared_expected_value * float_i as f64 / (float_i + 1.0)
+                + ret.powi(2) / (float_i + 1.0);
+
+            if i > 2 {
+                let var =
+                    (float_i + 1.0) / float_i * (squared_expected_value - expected_value.powi(2));
+                if self.alpha_stop
+                    - 2.0 * normal.cdf(-(float_i + 1.0).sqrt() * self.max_err_stop / var.sqrt())
+                    > 0.0
+                {
+                    return expected_value;
+                }
+            }
         }
 
-        ret / self.n_iterations as f64
+        expected_value
     }
 }
 
 pub struct NeighborhoodRelativeReward<RE: RewardEvaluation> {
-    inner_reward: RE
+    inner_reward: RE,
 }
 
-impl<RE: RewardEvaluation> NeighborhoodRelativeReward<RE>{
-    pub fn new(inner_reward: RE) -> NeighborhoodRelativeReward<RE>{
-        NeighborhoodRelativeReward {inner_reward}
+impl<RE: RewardEvaluation> NeighborhoodRelativeReward<RE> {
+    pub fn new(inner_reward: RE) -> NeighborhoodRelativeReward<RE> {
+        NeighborhoodRelativeReward { inner_reward }
     }
 }
 
-impl<RE:RewardEvaluation> RewardEvaluation for NeighborhoodRelativeReward<RE> {
+impl<RE: RewardEvaluation> RewardEvaluation for NeighborhoodRelativeReward<RE> {
     fn evaluate_state_space<N: process::NetworkProcess, R: RewardFunction>(
         &self,
         network_process: &N,
         reward_function: &R,
     ) -> HashMap<process::NetworkProcessState, f64> {
+        let absolute_reward = self
+            .inner_reward
+            .evaluate_state_space(network_process, reward_function);
 
-        let absolute_reward = self.inner_reward.evaluate_state_space(network_process, reward_function);
-
         //This approach optimize memory. Maybe optimizing execution time can be better.
-        absolute_reward.iter().map(|(k1, v1)| {
-            let mut max_val:f64 = 1.0;
-            absolute_reward.iter().for_each(|(k2,v2)| {
-                let count_diff:usize = k1.iter().zip(k2.iter()).map(|(s1, s2)| if s1 == s2 {0} else {1}).sum();
-                if count_diff < 2 {
-                    max_val = max_val.max(v1/v2);
-                }
-
-            });
-            (k1.clone(), max_val)
-        }).collect()
-
+        absolute_reward
+            .iter()
+            .map(|(k1, v1)| {
+                let mut max_val: f64 = 1.0;
+                absolute_reward.iter().for_each(|(k2, v2)| {
+                    let count_diff: usize = k1
+                        .iter()
+                        .zip(k2.iter())
+                        .map(|(s1, s2)| if s1 == s2 { 0 } else { 1 })
+                        .sum();
+                    if count_diff < 2 {
+                        max_val = max_val.max(v1 / v2);
+                    }
+                });
+                (k1.clone(), max_val)
+            })
+            .collect()
     }
 
     fn evaluate_state(
diff --git a/reCTBN/src/structure_learning/score_based_algorithm.rs b/reCTBN/src/structure_learning/score_based_algorithm.rs
index 6850027..9173b86 100644
--- a/reCTBN/src/structure_learning/score_based_algorithm.rs
+++ b/reCTBN/src/structure_learning/score_based_algorithm.rs
@@ -82,7 +82,7 @@ impl StructureLearningAlgorithm for HillClimbing {
             }
             (node, parent_set)
         }));
-        
+
         for (child_node, candidate_parent_set) in learned_parent_sets {
             for parent_node in candidate_parent_set.iter() {
                 net.add_edge(*parent_node, child_node);
diff --git a/reCTBN/tests/reward_evaluation.rs b/reCTBN/tests/reward_evaluation.rs
index 63e9c98..c0372b1 100644
--- a/reCTBN/tests/reward_evaluation.rs
+++ b/reCTBN/tests/reward_evaluation.rs
@@ -33,7 +33,7 @@ fn simple_factored_reward_function_binary_node_MC() {
     let s0: NetworkProcessState = vec![params::StateType::Discrete(0)];
     let s1: NetworkProcessState = vec![params::StateType::Discrete(1)];
 
-    let mc = MonteCarloReward::new(100, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215));
+    let mc = MonteCarloReward::new(10000, 1e-1, 1e-1, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215));
 
     assert_abs_diff_eq!(3.0, mc.evaluate_state(&net, &rf, &s0), epsilon = 1e-2);
     assert_abs_diff_eq!(3.0, mc.evaluate_state(&net, &rf, &s1), epsilon = 1e-2);
@@ -42,7 +42,7 @@ fn simple_factored_reward_function_binary_node_MC() {
     assert_abs_diff_eq!(3.0, rst[&s1], epsilon = 1e-2);
 
 
-    let mc = MonteCarloReward::new(100, 10.0, RewardCriteria::FiniteHorizon, Some(215));
+    let mc = MonteCarloReward::new(10000, 1e-1, 1e-1, 10.0, RewardCriteria::FiniteHorizon, Some(215));
 
     assert_abs_diff_eq!(30.0, mc.evaluate_state(&net, &rf, &s0), epsilon = 1e-2);
     assert_abs_diff_eq!(30.0, mc.evaluate_state(&net, &rf, &s1), epsilon = 1e-2);
@@ -113,7 +113,7 @@ fn simple_factored_reward_function_chain_MC() {
         params::StateType::Discrete(0),
     ];
 
-    let mc = MonteCarloReward::new(1000, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215));
+    let mc = MonteCarloReward::new(10000, 1e-1, 1e-1, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215));
     assert_abs_diff_eq!(2.447, mc.evaluate_state(&net, &rf, &s000), epsilon = 1e-1);
 
     let rst = mc.evaluate_state_space(&net, &rf);
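
Note on the stopping rule above (illustrative sketch, not part of the diff): evaluate_state now keeps a running mean and a running mean of squares of the per-trajectory return ret. From iteration i > 2 it forms the sample variance var = (i + 1) / i * (E[ret^2] - E[ret]^2) and stops as soon as the two-sided normal tail bound 2 * Phi(-sqrt(i + 1) * max_err_stop / sqrt(var)) falls below alpha_stop, i.e. once the estimated probability that the Monte Carlo error exceeds max_err_stop is smaller than alpha_stop; otherwise it runs up to max_iterations, as in the updated tests (MonteCarloReward::new(10000, 1e-1, 1e-1, 10.0, ...)). The standalone sketch below isolates that criterion; monte_carlo_mean and sample_run are hypothetical names used only for illustration, and only statrs (already used by the patch) is assumed.

    use statrs::distribution::{ContinuousCDF, Normal};

    /// Averages `sample_run()` until the CLT-based confidence check passes
    /// or `max_iterations` is reached (mirrors MonteCarloReward::evaluate_state).
    fn monte_carlo_mean<F: FnMut() -> f64>(
        mut sample_run: F,
        max_iterations: usize,
        max_err_stop: f64,
        alpha_stop: f64,
    ) -> f64 {
        let normal = Normal::new(0.0, 1.0).unwrap();
        let mut expected_value = 0.0;
        let mut squared_expected_value = 0.0;
        for i in 0..max_iterations {
            let ret = sample_run();
            let n = i as f64;
            // Incremental update of E[X] and E[X^2] over the first i + 1 runs.
            expected_value = expected_value * n / (n + 1.0) + ret / (n + 1.0);
            squared_expected_value =
                squared_expected_value * n / (n + 1.0) + ret.powi(2) / (n + 1.0);
            if i > 2 {
                // Sample variance recovered from the two running moments.
                let var = (n + 1.0) / n * (squared_expected_value - expected_value.powi(2));
                // Stop once 2 * Phi(-sqrt(i + 1) * max_err_stop / sigma) < alpha_stop.
                if alpha_stop - 2.0 * normal.cdf(-(n + 1.0).sqrt() * max_err_stop / var.sqrt())
                    > 0.0
                {
                    return expected_value;
                }
            }
        }
        expected_value
    }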