+ from typing import List, Dict, Tuple
import random
import numpy as np
- import copy
- import math
from collections import Counter
from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
import csle_agents.constants.constants as agents_constants


- def monte_carlo_most_frequent_particle(particles, N):
+ def monte_carlo_most_frequent_particle(particles: List[CyborgWrapperState], N: int) -> CyborgWrapperState:
    """
    Samples N particles and returns the most frequently sampled particle
+
+     :param particles: the list of particles
+     :param N: the number of samples
+     :return: the most frequently sampled particle
    """
    samples = [random.choice(particles) for _ in range(N)]
    counter = Counter(samples)
    most_frequent_particle = counter.most_common(1)[0][0]
    return most_frequent_particle


- def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
+ def particle_filter(particles: List[CyborgWrapperState], max_num_particles: int, train_env: CyborgScenarioTwoWrapper,
+                     u: int, obs: int, x_s: CyborgWrapperState) -> List[CyborgWrapperState]:
    """
    Implements a particle filter
+
+     :param particles: the list of particles
+     :param max_num_particles: the maximum number of particles
+     :param train_env: the environment used for sampling
+     :param u: the latest control
+     :param obs: the latest observation
+     :param x_s: the true cyborg state
+     :return: the list of updated particles
    """
    new_particles = []
    failed_samples = 0
    while len(new_particles) < max_num_particles:
-         # print(f"{len(new_particles)}/{max_num_particles}")
        x = random.choice(particles)
        train_env.set_state(state=x)
        _, _, _, _, info = train_env.step(u)
@@ -46,74 +57,96 @@ def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
    return new_particles
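The body of the sampling loop is elided by the hunk header above. As a rough illustration only (not the code in this commit), a rejection-sampling particle update consistent with the shown signature and the new_particles/failed_samples bookkeeping could look as follows; the cutoff of 100 failed samples is an arbitrary assumption.

# Illustrative sketch (not part of the commit): reuses the module's imports of `random`
# and `agents_constants`, and the info-dict keys used elsewhere in this file.
while len(new_particles) < max_num_particles:
    x = random.choice(particles)                      # sample a particle from the current belief
    train_env.set_state(state=x)
    _, _, _, _, info = train_env.step(u)              # simulate the latest control
    if info[agents_constants.COMMON.OBSERVATION] == obs:
        # keep successor states whose simulated observation matches the real one
        new_particles.append(info[agents_constants.COMMON.STATE])
    else:
        failed_samples += 1
        if failed_samples > 100:                      # hypothetical cutoff
            # rejection keeps failing; fall back to the true state x_s
            new_particles.append(x_s)
            failed_samples = 0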


- def restore_policy(x: CyborgWrapperState):
+ def restore_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, particles: List[CyborgWrapperState]) \
+         -> int:
    """
    Implements a heuristic restore policy for Cage2
+
+     :param x: the certainty-equivalence state
+     :param train_env: the environment used for simulation
+     :param particles: the current list of particles
+     :return: the control
    """
    u = -1
-     if x.s[1][2] == 2:
-         u = 0  # Ent0
-     if x.s[2][2] == 2:
-         u = 1  # Ent1
-     if x.s[3][2] == 2:
-         u = 2  # Ent2
-     if x.s[7][2] == 2:
-         u = 3  # Opserver
-
-     if x.s[1][2] == 1:
-         u = 8  # Ent0
-     if x.s[2][2] == 1:
-         u = 9  # Ent1
-     if x.s[3][2] == 1:
-         u = 10  # Ent2
-     if x.s[3][2] == 1:
-         u = 11  # Opserver
-     if x.s[9][2] == 1:
-         u = 22  # User1
-     if x.s[10][2] == 1:
-         u = 23  # User2
-     if x.s[11][2] == 1:
-         u = 24  # User3
-     if x.s[12][2] == 1:
-         u = 25  # User4
+     restore_actions = [0, 1, 2, 3]
+     remove_actions = [8, 9, 10, 11, 22, 23, 24, 25]
+     remove_hosts = [1, 2, 3, 7, 9, 10, 11, 12]
+     restore_hosts = [1, 2, 3, 7]
+     outcomes = {}
+     for h in remove_hosts:
+         outcomes[h] = []
+     for i, host in enumerate(remove_hosts):
+         for p in particles:
+             if p.s[host][2] == 1:
+                 train_env.set_state(p)
+                 train_env.step(action=remove_actions[i])
+                 if train_env.s[host][2] == 0:
+                     outcomes[host].append(1)
+                 else:
+                     outcomes[host].append(0)
+     for i, h in enumerate(remove_hosts):
+         if len(outcomes[h]) > 0:
+             remove_p = np.mean(outcomes[h])
+             if remove_p >= 0.9:
+                 return remove_actions[i]
+     for i, host in enumerate(restore_hosts):
+         if x.s[host][2] > 0:
+             return restore_actions[i]
    return u
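In words, the rewritten heuristic estimates, for each potentially compromised host, how often the corresponding Remove action actually clears the host when simulated from the current particles, issues Remove only when that empirical success rate reaches 0.9, and otherwise falls back to Restore for the enterprise and operational hosts. A small numerical illustration of the threshold check (the particle counts are hypothetical):

# Hypothetical example: host 1 (Ent0) looks compromised in 50 particles and the simulated
# Remove (action 8) clears it in 46 of them.
outcomes_ent0 = [1] * 46 + [0] * 4
remove_p = np.mean(outcomes_ent0)   # 0.92
print(remove_p >= 0.9)              # True -> the policy returns remove action 8 rather than Restore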


- def rollout_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, J, state_to_id, mu, l, id_to_state,
-                    gamma=0.99, mc_samples=10):
+ def rollout_policy(train_env: CyborgScenarioTwoWrapper, J: List[float], state_to_id: Dict[str, int],
+                    mu: List[List[float]], l: int, id_to_state: Dict[int, List[int]],
+                    particles: List[CyborgWrapperState], gamma=0.99, mc_samples=10) -> Tuple[int, float]:
    """
    A rollout policy for cage-2
+
+     :param train_env: the environment to use for sampling
+     :param J: the cost-to-go function of the base policy
+     :param state_to_id: the aggregate state to aggregate state id map
+     :param mu: the base policy
+     :param l: the lookahead horizon
+     :param id_to_state: the aggregate state id to aggregate state map
+     :param particles: the current list of particles
+     :param gamma: the discount factor
+     :param mc_samples: the number of Monte-Carlo samples to use
+     :return: the next control and its estimated value
    """
    U = [27, 28, 29, 30, 31, 32, 35]
    Q_n = []
-     u_r = restore_policy(x=x)
    for u in U:
        returns = []
        for i in range(mc_samples):
-             train_env.set_state(x)
+             particle = random.choice(particles)
+             train_env.set_state(particle)
            _, _, _, _, info = train_env.step(action=u)
            x_prime = info[agents_constants.COMMON.STATE]
            aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x_prime, state_to_id=state_to_id)
-             # c = -info[agents_constants.COMMON.REWARD]
-             c = Cage2AggregateMDP.cost_function(x=aggregate_state, u=U.index(u), id_to_state=id_to_state)
+             c = -info[agents_constants.COMMON.REWARD]
            if l == 1:
                returns.append(c + gamma * J[aggregate_state])
            else:
-                 returns.append(c + gamma * rollout_policy(copy.deepcopy(x_prime), train_env=train_env, J=J,
+                 returns.append(c + gamma * rollout_policy(train_env=train_env, J=J,
                                                            state_to_id=state_to_id, id_to_state=id_to_state,
                                                            mu=mu, l=l - 1)[1])
        Q_n.append(np.mean(returns))
    u_star = int(np.argmin(Q_n))
-     J_star = Q_n[u_star]
+     J_star = float(Q_n[u_star])
    u_star = U[u_star]
+     u_r = restore_policy(x=x, train_env=train_env, particles=particles)
    if u_r != -1:
        u_star = u_r
    return u_star, J_star
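Each entry of Q_n is thus a Monte-Carlo estimate of the one-step lookahead value c + gamma * J(aggregate(x')) averaged over mc_samples particles drawn from the belief. A usage example consistent with the new particle-based signature (it mirrors the call in the main loop further down; the variable names are the ones defined there):

# Example call with the updated signature, mirroring the main loop below
u_star, q_estimate = rollout_policy(train_env=train_env, J=J, state_to_id=state_to_id, mu=mu, l=1,
                                    id_to_state=id_to_state, particles=particles, gamma=0.99, mc_samples=20)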


- def base_policy(x, mu, id_to_state):
+ def base_policy(x: CyborgWrapperState, mu: List[List[float]], id_to_state: Dict[int, List[int]]) -> int:
    """
    Implements the base policy mu
+
+     :param x: the current state
+     :param mu: the base policy
+     :param id_to_state: the aggregate state id to aggregate state map
+     :return: the next control
    """
    aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)
    return Cage2AggregateMDP.get_aggregate_control(mu=mu, aggregate_state=aggregate_state, id_to_state=id_to_state)
@@ -130,8 +163,8 @@ def base_policy(x, mu, id_to_state):
        decoy_optimization=False)
    N = 10000
    max_env_steps = 100
-     mu = np.loadtxt("./mu.txt")
-     J = np.loadtxt("./J.txt")
+     mu = np.loadtxt("test/mu2.txt")
+     J = np.loadtxt("test/J2.txt")
    X, state_to_id, id_to_state = Cage2AggregateMDP.X()
    gamma = 0.99
    l = 1
@@ -145,21 +178,24 @@ def base_policy(x, mu, id_to_state):
        particles = env.initial_particles
        while not done and t < max_env_steps:
            monte_carlo_state = monte_carlo_most_frequent_particle(particles=particles, N=100)
-             # u = restore_policy(x=x)
-             u = restore_policy(x=monte_carlo_state)
+             u = restore_policy(x=monte_carlo_state, train_env=train_env, particles=particles)
+             if t <= 2:
+                 u = 31
            if u == -1:
                # u = base_policy(x=monte_carlo_state, mu=mu, id_to_state=id_to_state)
-                 # u = base_policy(x=x, mu=mu, id_to_state=id_to_state)
-                 u = rollout_policy(x=x, state_to_id=state_to_id, id_to_state=id_to_state,
-                                    train_env=train_env, J=J, mu=mu, gamma=gamma, l=l)[0]
+                 u = rollout_policy(state_to_id=state_to_id, id_to_state=id_to_state, train_env=train_env, J=J, mu=mu,
+                                    gamma=gamma, l=l, particles=particles, mc_samples=20)[0]
            _, _, _, _, info = env.step(u)
            particles = particle_filter(particles=particles, max_num_particles=50,
                                        train_env=train_env, u=u, obs=info[agents_constants.COMMON.OBSERVATION],
                                        x_s=info[agents_constants.COMMON.STATE])
            c = -info[agents_constants.COMMON.REWARD]
-             C += math.pow(gamma, t - 1) * c
-             print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
-                   f"aggstate: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
+             C += c
+             # aggstate = id_to_state[Cage2AggregateMDP.get_aggregate_state(s=monte_carlo_state,
+             #                                                              state_to_id=state_to_id)]
+             # print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
+             #       f"aggstate: {aggstate},"
+             #       f"true state: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
            x = info[agents_constants.COMMON.STATE]
            t += 1
        returns.append(C)