@@ -1,11 +1,11 @@
 use std::collections::HashMap;
 
 use rayon::prelude::{IntoParallelIterator, ParallelIterator};
+use statrs::distribution::ContinuousCDF;
 
 use crate::params::{self, ParamsTrait};
 use crate::process;
 
 use crate::{
     process::NetworkProcessState,
     reward::RewardEvaluation,
@@ -14,11 +14,13 @@ use crate::{
 
 pub enum RewardCriteria {
     FiniteHorizon,
-    InfiniteHorizon {discount_factor: f64},
+    InfiniteHorizon { discount_factor: f64 },
 }
 
 pub struct MonteCarloReward {
-    n_iterations: usize,
+    max_iterations: usize,
+    max_err_stop: f64,
+    alpha_stop: f64,
     end_time: f64,
     reward_criteria: RewardCriteria,
     seed: Option<u64>,
@@ -26,13 +28,17 @@
 
 impl MonteCarloReward {
     pub fn new(
-        n_iterations: usize,
+        max_iterations: usize,
+        max_err_stop: f64,
+        alpha_stop: f64,
         end_time: f64,
         reward_criteria: RewardCriteria,
         seed: Option<u64>,
     ) -> MonteCarloReward {
        MonteCarloReward {
-            n_iterations,
+            max_iterations,
+            max_err_stop,
+            alpha_stop,
             end_time,
             reward_criteria,
             seed,
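For review context, a minimal usage sketch of the revised constructor. The concrete values are hypothetical, not taken from this patch; the comments read `max_err_stop` as the tolerated estimation error and `alpha_stop` as the significance level of the early-stopping test introduced further down.

    // Hypothetical values, for illustration only.
    let evaluator = MonteCarloReward::new(
        10_000,     // max_iterations: hard cap on the number of sampled trajectories
        0.01,       // max_err_stop: tolerated error of the Monte Carlo estimate
        0.05,       // alpha_stop: significance level of the early-stopping test
        10.0,       // end_time: horizon of each sampled trajectory
        RewardCriteria::InfiniteHorizon { discount_factor: 1.0 },
        Some(215),  // seed passed to the ForwardSampler
    );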
@@ -58,7 +64,8 @@ impl RewardEvaluation for MonteCarloReward {
 
         let n_states: usize = variables_domain.iter().map(|x| x.len()).product();
 
-        (0..n_states).into_par_iter()
+        (0..n_states)
+            .into_par_iter()
             .map(|s| {
                 let state: process::NetworkProcessState = variables_domain
                     .iter()
@@ -85,10 +92,13 @@
     ) -> f64 {
         let mut sampler =
             ForwardSampler::new(network_process, self.seed.clone(), Some(state.clone()));
-        let mut ret = 0.0;
+        let mut expected_value = 0.0;
+        let mut squared_expected_value = 0.0;
+        let normal = statrs::distribution::Normal::new(0.0, 1.0).unwrap();
 
-        for _i in 0..self.n_iterations {
+        for i in 0..self.max_iterations {
             sampler.reset();
+            let mut ret = 0.0;
             let mut previous = sampler.next().unwrap();
             while previous.t < self.end_time {
                 let current = sampler.next().unwrap();
@@ -96,7 +106,7 @@
                     let r = reward_function.call(&previous.state, None);
                     let discount = match self.reward_criteria {
                         RewardCriteria::FiniteHorizon => self.end_time - previous.t,
-                        RewardCriteria::InfiniteHorizon {discount_factor} => {
+                        RewardCriteria::InfiniteHorizon { discount_factor } => {
                             std::f64::consts::E.powf(-discount_factor * previous.t)
                                 - std::f64::consts::E.powf(-discount_factor * self.end_time)
                         }
@@ -105,8 +115,8 @@
                 } else {
                     let r = reward_function.call(&previous.state, Some(&current.state));
                     let discount = match self.reward_criteria {
-                        RewardCriteria::FiniteHorizon => current.t-previous.t,
-                        RewardCriteria::InfiniteHorizon {discount_factor} => {
+                        RewardCriteria::FiniteHorizon => current.t - previous.t,
+                        RewardCriteria::InfiniteHorizon { discount_factor } => {
                             std::f64::consts::E.powf(-discount_factor * previous.t)
                                 - std::f64::consts::E.powf(-discount_factor * current.t)
                         }
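Reading aid for the `InfiniteHorizon` arms in this hunk and the previous one: the weight applied to `r.instantaneous_reward` for a sojourn from `previous.t` to `current.t` (or to `self.end_time` for the final, truncated sojourn) is

    exp(-discount_factor * previous.t) - exp(-discount_factor * current.t)
        = discount_factor * integral from previous.t to current.t of exp(-discount_factor * s) ds,

that is, the exponentially discounted length of the sojourn scaled by `discount_factor`, while the `FiniteHorizon` arm uses the plain, undiscounted sojourn length.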
@@ -114,51 +124,74 @@
                     ret += discount * r.instantaneous_reward;
                     ret += match self.reward_criteria {
                         RewardCriteria::FiniteHorizon => 1.0,
-                        RewardCriteria::InfiniteHorizon {discount_factor} => {
+                        RewardCriteria::InfiniteHorizon { discount_factor } => {
                             std::f64::consts::E.powf(-discount_factor * current.t)
                         }
                     } * r.transition_reward;
                 }
                 previous = current;
             }
-        }
 
-        ret / self.n_iterations as f64
+            let float_i = i as f64;
+            expected_value =
+                expected_value * float_i as f64 / (float_i + 1.0) + ret / (float_i + 1.0);
+            squared_expected_value = squared_expected_value * float_i as f64 / (float_i + 1.0)
+                + ret.powi(2) / (float_i + 1.0);
+
+            if i > 2 {
+                let var =
+                    (float_i + 1.0) / float_i * (squared_expected_value - expected_value.powi(2));
+                if self.alpha_stop
+                    - 2.0 * normal.cdf(-(float_i + 1.0).sqrt() * self.max_err_stop / var.sqrt())
+                    > 0.0
+                {
+                    return expected_value;
+                }
+            }
+        }
+
+        expected_value
     }
 }
 
 pub struct NeighborhoodRelativeReward<RE: RewardEvaluation> {
-    inner_reward: RE
+    inner_reward: RE,
 }
 
-impl<RE: RewardEvaluation> NeighborhoodRelativeReward<RE>{
-    pub fn new(inner_reward: RE) -> NeighborhoodRelativeReward<RE>{
-        NeighborhoodRelativeReward {inner_reward}
+impl<RE: RewardEvaluation> NeighborhoodRelativeReward<RE> {
+    pub fn new(inner_reward: RE) -> NeighborhoodRelativeReward<RE> {
+        NeighborhoodRelativeReward { inner_reward }
     }
 }
 
-impl<RE:RewardEvaluation> RewardEvaluation for NeighborhoodRelativeReward<RE> {
+impl<RE: RewardEvaluation> RewardEvaluation for NeighborhoodRelativeReward<RE> {
     fn evaluate_state_space<N: process::NetworkProcess, R: super::RewardFunction>(
         &self,
         network_process: &N,
         reward_function: &R,
     ) -> HashMap<process::NetworkProcessState, f64> {
-        let absolute_reward = self.inner_reward.evaluate_state_space(network_process, reward_function);
+        let absolute_reward = self
+            .inner_reward
+            .evaluate_state_space(network_process, reward_function);
 
         //This approach optimize memory. Maybe optimizing execution time can be better.
-        absolute_reward.iter().map(|(k1, v1)| {
-            let mut max_val:f64 = 1.0;
-            absolute_reward.iter().for_each(|(k2,v2)| {
-                let count_diff:usize = k1.iter().zip(k2.iter()).map(|(s1, s2)| if s1 == s2 {0} else {1}).sum();
-                if count_diff < 2 {
-                    max_val = max_val.max(v1/v2);
-                }
-
-            });
-            (k1.clone(), max_val)
-        }).collect()
+        absolute_reward
+            .iter()
+            .map(|(k1, v1)| {
+                let mut max_val: f64 = 1.0;
+                absolute_reward.iter().for_each(|(k2, v2)| {
+                    let count_diff: usize = k1
+                        .iter()
+                        .zip(k2.iter())
+                        .map(|(s1, s2)| if s1 == s2 { 0 } else { 1 })
+                        .sum();
+                    if count_diff < 2 {
+                        max_val = max_val.max(v1 / v2);
+                    }
+                });
+
+                (k1.clone(), max_val)
+            })
+            .collect()
     }
 
     fn evaluate_state<N: process::NetworkProcess, R: super::RewardFunction>(
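The early-stopping rule added in the hunk above can be exercised in isolation. The following is a sketch under stated assumptions, not code from this patch: `sample` is a hypothetical stand-in for the per-trajectory reward `ret`, the thresholds are illustrative, and only the `statrs` standard normal CDF (already a dependency of this change) is reused. It keeps running estimates of E[X] and E[X^2], derives the sample variance, and stops once the CLT-based estimate of P(|mean error| > max_err_stop) drops below alpha_stop, which is the test `self.alpha_stop - 2.0 * normal.cdf(...) > 0.0` written the other way around.

    // Sketch: CLT-based early stopping on a toy stream (assumes the `statrs` crate).
    use statrs::distribution::{ContinuousCDF, Normal};

    // Hypothetical stand-in for one Monte Carlo trajectory reward.
    fn sample(i: usize) -> f64 {
        1.0 + 0.1 * ((i as f64) * 12.9898).sin()
    }

    fn main() {
        let max_iterations = 10_000;
        let max_err_stop = 0.01; // tolerated error on the running mean
        let alpha_stop = 0.05; // significance level of the stopping test
        let normal = Normal::new(0.0, 1.0).unwrap();

        let mut expected_value = 0.0;
        let mut squared_expected_value = 0.0;

        for i in 0..max_iterations {
            let ret = sample(i);
            let float_i = i as f64;
            // Running estimates of E[X] and E[X^2], mirroring the update in the patch.
            expected_value = expected_value * float_i / (float_i + 1.0) + ret / (float_i + 1.0);
            squared_expected_value =
                squared_expected_value * float_i / (float_i + 1.0) + ret.powi(2) / (float_i + 1.0);

            if i > 2 {
                // Sample variance recovered from the two running moments.
                let var =
                    (float_i + 1.0) / float_i * (squared_expected_value - expected_value.powi(2));
                // Estimated probability that the running mean is off by more than max_err_stop.
                let p_err = 2.0 * normal.cdf(-(float_i + 1.0).sqrt() * max_err_stop / var.sqrt());
                if p_err < alpha_stop {
                    println!("stopped after {} samples, estimate = {}", i + 1, expected_value);
                    return;
                }
            }
        }
        println!("reached max_iterations, estimate = {}", expected_value);
    }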