Added automatic stopping for MonteCarloReward

pull/87/head
AlessandroBregoli 2 years ago
parent 5d676be180
commit adb0f99419
  1. 8
      reCTBN/src/parameter_learning.rs
  2. 2
      reCTBN/src/reward.rs
  3. 97
      reCTBN/src/reward/reward_evaluation.rs
  4. 6
      reCTBN/tests/reward_evaluation.rs

@ -144,11 +144,9 @@ impl ParameterLearning for BayesianApproach {
.zip(M.mapv(|x| x as f64).axis_iter(Axis(2))) .zip(M.mapv(|x| x as f64).axis_iter(Axis(2)))
.for_each(|(mut C, m)| C.assign(&(&m.mapv(|y| y + alpha) / &T.mapv(|y| y + tau)))); .for_each(|(mut C, m)| C.assign(&(&m.mapv(|y| y + alpha) / &T.mapv(|y| y + tau))));
CIM.outer_iter_mut().for_each(|mut C| {
CIM.outer_iter_mut() C.diag_mut().fill(0.0);
.for_each(|mut C| { });
C.diag_mut().fill(0.0);
});
//Set the diagonal of the inner matrices to the the row sum multiplied by -1 //Set the diagonal of the inner matrices to the the row sum multiplied by -1
let tmp_diag_sum: Array2<f64> = CIM.sum_axis(Axis(2)).mapv(|x| x * -1.0); let tmp_diag_sum: Array2<f64> = CIM.sum_axis(Axis(2)).mapv(|x| x * -1.0);

@ -1,5 +1,5 @@
pub mod reward_function;
pub mod reward_evaluation; pub mod reward_evaluation;
pub mod reward_function;
use std::collections::HashMap; use std::collections::HashMap;

@ -1,11 +1,11 @@
use std::collections::HashMap; use std::collections::HashMap;
use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use rayon::prelude::{IntoParallelIterator, ParallelIterator};
use statrs::distribution::ContinuousCDF;
use crate::params::{self, ParamsTrait}; use crate::params::{self, ParamsTrait};
use crate::process; use crate::process;
use crate::{ use crate::{
process::NetworkProcessState, process::NetworkProcessState,
reward::RewardEvaluation, reward::RewardEvaluation,
@ -14,11 +14,13 @@ use crate::{
pub enum RewardCriteria { pub enum RewardCriteria {
FiniteHorizon, FiniteHorizon,
InfiniteHorizon {discount_factor: f64}, InfiniteHorizon { discount_factor: f64 },
} }
pub struct MonteCarloReward { pub struct MonteCarloReward {
n_iterations: usize, max_iterations: usize,
max_err_stop: f64,
alpha_stop: f64,
end_time: f64, end_time: f64,
reward_criteria: RewardCriteria, reward_criteria: RewardCriteria,
seed: Option<u64>, seed: Option<u64>,
@ -26,13 +28,17 @@ pub struct MonteCarloReward {
impl MonteCarloReward { impl MonteCarloReward {
pub fn new( pub fn new(
n_iterations: usize, max_iterations: usize,
max_err_stop: f64,
alpha_stop: f64,
end_time: f64, end_time: f64,
reward_criteria: RewardCriteria, reward_criteria: RewardCriteria,
seed: Option<u64>, seed: Option<u64>,
) -> MonteCarloReward { ) -> MonteCarloReward {
MonteCarloReward { MonteCarloReward {
n_iterations, max_iterations,
max_err_stop,
alpha_stop,
end_time, end_time,
reward_criteria, reward_criteria,
seed, seed,
@ -58,7 +64,8 @@ impl RewardEvaluation for MonteCarloReward {
let n_states: usize = variables_domain.iter().map(|x| x.len()).product(); let n_states: usize = variables_domain.iter().map(|x| x.len()).product();
(0..n_states).into_par_iter() (0..n_states)
.into_par_iter()
.map(|s| { .map(|s| {
let state: process::NetworkProcessState = variables_domain let state: process::NetworkProcessState = variables_domain
.iter() .iter()
@ -85,10 +92,13 @@ impl RewardEvaluation for MonteCarloReward {
) -> f64 { ) -> f64 {
let mut sampler = let mut sampler =
ForwardSampler::new(network_process, self.seed.clone(), Some(state.clone())); ForwardSampler::new(network_process, self.seed.clone(), Some(state.clone()));
let mut ret = 0.0; let mut expected_value = 0.0;
let mut squared_expected_value = 0.0;
let normal = statrs::distribution::Normal::new(0.0, 1.0).unwrap();
for _i in 0..self.n_iterations { for i in 0..self.max_iterations {
sampler.reset(); sampler.reset();
let mut ret = 0.0;
let mut previous = sampler.next().unwrap(); let mut previous = sampler.next().unwrap();
while previous.t < self.end_time { while previous.t < self.end_time {
let current = sampler.next().unwrap(); let current = sampler.next().unwrap();
@ -96,7 +106,7 @@ impl RewardEvaluation for MonteCarloReward {
let r = reward_function.call(&previous.state, None); let r = reward_function.call(&previous.state, None);
let discount = match self.reward_criteria { let discount = match self.reward_criteria {
RewardCriteria::FiniteHorizon => self.end_time - previous.t, RewardCriteria::FiniteHorizon => self.end_time - previous.t,
RewardCriteria::InfiniteHorizon {discount_factor} => { RewardCriteria::InfiniteHorizon { discount_factor } => {
std::f64::consts::E.powf(-discount_factor * previous.t) std::f64::consts::E.powf(-discount_factor * previous.t)
- std::f64::consts::E.powf(-discount_factor * self.end_time) - std::f64::consts::E.powf(-discount_factor * self.end_time)
} }
@ -105,8 +115,8 @@ impl RewardEvaluation for MonteCarloReward {
} else { } else {
let r = reward_function.call(&previous.state, Some(&current.state)); let r = reward_function.call(&previous.state, Some(&current.state));
let discount = match self.reward_criteria { let discount = match self.reward_criteria {
RewardCriteria::FiniteHorizon => current.t-previous.t, RewardCriteria::FiniteHorizon => current.t - previous.t,
RewardCriteria::InfiniteHorizon {discount_factor} => { RewardCriteria::InfiniteHorizon { discount_factor } => {
std::f64::consts::E.powf(-discount_factor * previous.t) std::f64::consts::E.powf(-discount_factor * previous.t)
- std::f64::consts::E.powf(-discount_factor * current.t) - std::f64::consts::E.powf(-discount_factor * current.t)
} }
@ -114,51 +124,74 @@ impl RewardEvaluation for MonteCarloReward {
ret += discount * r.instantaneous_reward; ret += discount * r.instantaneous_reward;
ret += match self.reward_criteria { ret += match self.reward_criteria {
RewardCriteria::FiniteHorizon => 1.0, RewardCriteria::FiniteHorizon => 1.0,
RewardCriteria::InfiniteHorizon {discount_factor} => { RewardCriteria::InfiniteHorizon { discount_factor } => {
std::f64::consts::E.powf(-discount_factor * current.t) std::f64::consts::E.powf(-discount_factor * current.t)
} }
} * r.transition_reward; } * r.transition_reward;
} }
previous = current; previous = current;
} }
let float_i = i as f64;
expected_value =
expected_value * float_i as f64 / (float_i + 1.0) + ret / (float_i + 1.0);
squared_expected_value = squared_expected_value * float_i as f64 / (float_i + 1.0)
+ ret.powi(2) / (float_i + 1.0);
if i > 2 {
let var =
(float_i + 1.0) / float_i * (squared_expected_value - expected_value.powi(2));
if self.alpha_stop
- 2.0 * normal.cdf(-(float_i + 1.0).sqrt() * self.max_err_stop / var.sqrt())
> 0.0
{
return expected_value;
}
}
} }
ret / self.n_iterations as f64 expected_value
} }
} }
pub struct NeighborhoodRelativeReward<RE: RewardEvaluation> { pub struct NeighborhoodRelativeReward<RE: RewardEvaluation> {
inner_reward: RE inner_reward: RE,
} }
impl<RE: RewardEvaluation> NeighborhoodRelativeReward<RE>{ impl<RE: RewardEvaluation> NeighborhoodRelativeReward<RE> {
pub fn new(inner_reward: RE) -> NeighborhoodRelativeReward<RE>{ pub fn new(inner_reward: RE) -> NeighborhoodRelativeReward<RE> {
NeighborhoodRelativeReward {inner_reward} NeighborhoodRelativeReward { inner_reward }
} }
} }
impl<RE:RewardEvaluation> RewardEvaluation for NeighborhoodRelativeReward<RE> { impl<RE: RewardEvaluation> RewardEvaluation for NeighborhoodRelativeReward<RE> {
fn evaluate_state_space<N: process::NetworkProcess, R: super::RewardFunction>( fn evaluate_state_space<N: process::NetworkProcess, R: super::RewardFunction>(
&self, &self,
network_process: &N, network_process: &N,
reward_function: &R, reward_function: &R,
) -> HashMap<process::NetworkProcessState, f64> { ) -> HashMap<process::NetworkProcessState, f64> {
let absolute_reward = self
let absolute_reward = self.inner_reward.evaluate_state_space(network_process, reward_function); .inner_reward
.evaluate_state_space(network_process, reward_function);
//This approach optimize memory. Maybe optimizing execution time can be better. //This approach optimize memory. Maybe optimizing execution time can be better.
absolute_reward.iter().map(|(k1, v1)| { absolute_reward
let mut max_val:f64 = 1.0; .iter()
absolute_reward.iter().for_each(|(k2,v2)| { .map(|(k1, v1)| {
let count_diff:usize = k1.iter().zip(k2.iter()).map(|(s1, s2)| if s1 == s2 {0} else {1}).sum(); let mut max_val: f64 = 1.0;
if count_diff < 2 { absolute_reward.iter().for_each(|(k2, v2)| {
max_val = max_val.max(v1/v2); let count_diff: usize = k1
} .iter()
.zip(k2.iter())
}); .map(|(s1, s2)| if s1 == s2 { 0 } else { 1 })
(k1.clone(), max_val) .sum();
}).collect() if count_diff < 2 {
max_val = max_val.max(v1 / v2);
}
});
(k1.clone(), max_val)
})
.collect()
} }
fn evaluate_state<N: process::NetworkProcess, R: super::RewardFunction>( fn evaluate_state<N: process::NetworkProcess, R: super::RewardFunction>(

@ -33,7 +33,7 @@ fn simple_factored_reward_function_binary_node_MC() {
let s0: NetworkProcessState = vec![params::StateType::Discrete(0)]; let s0: NetworkProcessState = vec![params::StateType::Discrete(0)];
let s1: NetworkProcessState = vec![params::StateType::Discrete(1)]; let s1: NetworkProcessState = vec![params::StateType::Discrete(1)];
let mc = MonteCarloReward::new(100, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215)); let mc = MonteCarloReward::new(10000, 1e-1, 1e-1, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215));
assert_abs_diff_eq!(3.0, mc.evaluate_state(&net, &rf, &s0), epsilon = 1e-2); assert_abs_diff_eq!(3.0, mc.evaluate_state(&net, &rf, &s0), epsilon = 1e-2);
assert_abs_diff_eq!(3.0, mc.evaluate_state(&net, &rf, &s1), epsilon = 1e-2); assert_abs_diff_eq!(3.0, mc.evaluate_state(&net, &rf, &s1), epsilon = 1e-2);
@ -42,7 +42,7 @@ fn simple_factored_reward_function_binary_node_MC() {
assert_abs_diff_eq!(3.0, rst[&s1], epsilon = 1e-2); assert_abs_diff_eq!(3.0, rst[&s1], epsilon = 1e-2);
let mc = MonteCarloReward::new(100, 10.0, RewardCriteria::FiniteHorizon, Some(215)); let mc = MonteCarloReward::new(10000, 1e-1, 1e-1, 10.0, RewardCriteria::FiniteHorizon, Some(215));
assert_abs_diff_eq!(30.0, mc.evaluate_state(&net, &rf, &s0), epsilon = 1e-2); assert_abs_diff_eq!(30.0, mc.evaluate_state(&net, &rf, &s0), epsilon = 1e-2);
assert_abs_diff_eq!(30.0, mc.evaluate_state(&net, &rf, &s1), epsilon = 1e-2); assert_abs_diff_eq!(30.0, mc.evaluate_state(&net, &rf, &s1), epsilon = 1e-2);
@ -113,7 +113,7 @@ fn simple_factored_reward_function_chain_MC() {
params::StateType::Discrete(0), params::StateType::Discrete(0),
]; ];
let mc = MonteCarloReward::new(1000, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215)); let mc = MonteCarloReward::new(10000, 1e-1, 1e-1, 10.0, RewardCriteria::InfiniteHorizon { discount_factor: 1.0 }, Some(215));
assert_abs_diff_eq!(2.447, mc.evaluate_state(&net, &rf, &s000), epsilon = 1e-1); assert_abs_diff_eq!(2.447, mc.evaluate_state(&net, &rf, &s000), epsilon = 1e-1);
let rst = mc.evaluate_state_space(&net, &rf); let rst = mc.evaluate_state_space(&net, &rf);

Loading…
Cancel
Save