Commit 68531fb

aggregate cage-2 mdp
1 parent 325f4d9 commit 68531fb

File tree

7 files changed: +296 -868 lines


examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py

+191 -168
Large diffs are not rendered by default.

examples/eval/cyborg_scenario_two/eval_aggregate_mdp_on_wrapper.py

+86 -50
@@ -1,7 +1,6 @@
+from typing import List, Dict, Tuple
 import random
 import numpy as np
-import copy
-import math
 from collections import Counter
 from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper
 from gym_csle_cyborg.dao.red_agent_type import RedAgentType
@@ -12,24 +11,36 @@
 import csle_agents.constants.constants as agents_constants


-def monte_carlo_most_frequent_particle(particles, N):
+def monte_carlo_most_frequent_particle(particles: List[CyborgWrapperState], N: int) -> CyborgWrapperState:
     """
     Samples N particles and returns the most frequently sampled particle
+
+    :param particles: the list of particles
+    :param N: the number of samples
+    :return: the most frequently sampled particle
     """
     samples = [random.choice(particles) for _ in range(N)]
     counter = Counter(samples)
     most_frequent_particle = counter.most_common(1)[0][0]
     return most_frequent_particle


-def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
+def particle_filter(particles: List[CyborgWrapperState], max_num_particles: int, train_env: CyborgScenarioTwoWrapper,
+                    u: int, obs: int, x_s: CyborgWrapperState) -> List[CyborgWrapperState]:
     """
     Implements a particle filter
+
+    :param particles: the list of particles
+    :param max_num_particles: the maximum number of particles
+    :param train_env: the environment used for sampling
+    :param u: the latest control
+    :param obs: the latest observation
+    :param x_s: the true cyborg state
+    :return: the list of updated particles
     """
     new_particles = []
     failed_samples = 0
     while len(new_particles) < max_num_particles:
-        # print(f"{len(new_particles)}/{max_num_particles}")
         x = random.choice(particles)
         train_env.set_state(state=x)
         _, _, _, _, info = train_env.step(u)
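
monte_carlo_most_frequent_particle estimates the mode of the belief by resampling the particle set. A minimal, self-contained sketch of the same idea on toy string particles (not the CyborgWrapperState objects used above) is:

import random
from collections import Counter

def most_frequent_by_sampling(items, n_samples):
    # Draw n_samples items uniformly at random and return the most common draw.
    samples = [random.choice(items) for _ in range(n_samples)]
    return Counter(samples).most_common(1)[0][0]

# Toy "particles": the majority value is the likely output.
particles = ["safe", "compromised", "compromised", "scanned", "compromised"]
print(most_frequent_by_sampling(particles, n_samples=100))  # most likely "compromised"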
@@ -46,74 +57,96 @@ def particle_filter(particles, max_num_particles, train_env, u, obs, x_s):
     return new_particles


-def restore_policy(x: CyborgWrapperState):
+def restore_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, particles: List[CyborgWrapperState]) \
+        -> int:
     """
     Implements a heuristic restore policy for Cage2
+
+    :param x: the certainty-equivalence state
+    :param train_env: the environment used for simulation
+    :param particles: the current list of particles
+    :return: the control
     """
     u = -1
-    if x.s[1][2] == 2:
-        u = 0  # Ent0
-    if x.s[2][2] == 2:
-        u = 1  # Ent 1
-    if x.s[3][2] == 2:
-        u = 2  # Ent 2
-    if x.s[7][2] == 2:
-        u = 3  # Opserver
-
-    if x.s[1][2] == 1:
-        u = 8  # Ent0
-    if x.s[2][2] == 1:
-        u = 9  # Ent1
-    if x.s[3][2] == 1:
-        u = 10  # Ent2
-    if x.s[3][2] == 1:
-        u = 11  # Opserver
-    if x.s[9][2] == 1:
-        u = 22  # User1
-    if x.s[10][2] == 1:
-        u = 23  # User2
-    if x.s[11][2] == 1:
-        u = 24  # User3
-    if x.s[12][2] == 1:
-        u = 25  # User4
+    restore_actions = [0, 1, 2, 3]
+    remove_actions = [8, 9, 10, 11, 22, 23, 24, 25]
+    remove_hosts = [1, 2, 3, 7, 9, 10, 11, 12]
+    restore_hosts = [1, 2, 3, 7]
+    outcomes = {}
+    for h in remove_hosts:
+        outcomes[h] = []
+    for i, host in enumerate(remove_hosts):
+        for p in particles:
+            if p.s[host][2] == 1:
+                train_env.set_state(p)
+                train_env.step(action=remove_actions[i])
+                if train_env.s[host][2] == 0:
+                    outcomes[host].append(1)
+                else:
+                    outcomes[host].append(0)
+    for i, h in enumerate(remove_hosts):
+        if len(outcomes[h]) > 0:
+            remove_p = np.mean(outcomes[h])
+            if remove_p >= 0.9:
+                return remove_actions[i]
+    for i, host in enumerate(restore_hosts):
+        if x.s[host][2] > 0:
+            return restore_actions[i]
     return u
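
The reworked restore_policy above simulates a Remove on every particle that marks a host as compromised and only issues the Remove when at least 90% of those simulations succeed. A standalone sketch of that voting rule, with a stubbed simulator in place of train_env (purely illustrative), might look like:

import numpy as np

def remove_succeeds(particle, host):
    # Stub for train_env.set_state(p); train_env.step(remove_action);
    # then checking train_env.s[host][2] == 0. Here the toy particle is a dict of flags.
    return particle.get(host, False)

def vote_remove(particles, host, threshold=0.9):
    outcomes = [1 if remove_succeeds(p, host) else 0 for p in particles]
    return len(outcomes) > 0 and float(np.mean(outcomes)) >= threshold

# Toy particles: 4 of 5 say the Remove would succeed on host 7 (0.8 < 0.9 -> no Remove).
particles = [{7: True}, {7: True}, {7: False}, {7: True}, {7: True}]
print(vote_remove(particles, host=7))  # False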


-def rollout_policy(x: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, J, state_to_id, mu, l, id_to_state,
-                   gamma=0.99, mc_samples=10):
+def rollout_policy(train_env: CyborgScenarioTwoWrapper, J: List[float], state_to_id: Dict[str, int],
+                   mu: List[List[float]], l: int, id_to_state: Dict[int, List[int]],
+                   particles: List[CyborgWrapperState], gamma=0.99, mc_samples=10) -> Tuple[int, float]:
     """
     A rollout policy for cage-2
+
+    :param train_env: the environment to use for sampling
+    :param J: the cost-to-go function of the base policy
+    :param state_to_id: the aggreate state to aggregate state id map
+    :param mu: the base policy
+    :param l: the lookahead horizon
+    :param id_to_state: the aggregate state id to aggregate state map
+    :param particles: the current particle state
+    :param gamma: the discount factor
+    :param mc_samples: the number of Monte-Carlo samples to use
+    :return: the next control and its estimated value
     """
     U = [27, 28, 29, 30, 31, 32, 35]
     Q_n = []
-    u_r = restore_policy(x=x)
     for u in U:
         returns = []
         for i in range(mc_samples):
-            train_env.set_state(x)
+            particle = random.choice(particles)
+            train_env.set_state(particle)
             _, _, _, _, info = train_env.step(action=u)
             x_prime = info[agents_constants.COMMON.STATE]
             aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x_prime, state_to_id=state_to_id)
-            # c = -info[agents_constants.COMMON.REWARD]
-            c = Cage2AggregateMDP.cost_function(x=aggregate_state, u=U.index(u), id_to_state=id_to_state)
+            c = -info[agents_constants.COMMON.REWARD]
             if l == 1:
                 returns.append(c + gamma * J[aggregate_state])
             else:
-                returns.append(c + gamma * rollout_policy(copy.deepcopy(x_prime), train_env=train_env, J=J,
+                returns.append(c + gamma * rollout_policy(train_env=train_env, J=J,
                                                           state_to_id=state_to_id, id_to_state=id_to_state,
                                                           mu=mu, l=l - 1)[1])
         Q_n.append(np.mean(returns))
     u_star = int(np.argmin(Q_n))
-    J_star = Q_n[u_star]
+    J_star = float(Q_n[u_star])
     u_star = U[u_star]
+    u_r = restore_policy(x=x, train_env=train_env, particles=particles)
     if u_r != -1:
         u_star = u_r
     return u_star, J_star


-def base_policy(x, mu, id_to_state):
+def base_policy(x: CyborgWrapperState, mu: List[List[float]], id_to_state: Dict[int, List[int]]) -> int:
     """
     Implements the base policy mu
+
+    :param x: the current state id
+    :param mu: the base policy
+    :param id_to_state: the aggregate state id to aggregate state map
+    :return: the next control
     """
     aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)
     return Cage2AggregateMDP.get_aggregate_control(mu=mu, aggregate_state=aggregate_state, id_to_state=id_to_state)
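
rollout_policy above scores each candidate control u by Monte-Carlo one-step lookahead, Q(u) ≈ mean(c + gamma * J[aggregate successor]), and picks the minimizer. A minimal sketch of that scoring loop with a hypothetical simulate() stub (not part of this repository) is:

import random
import numpy as np

def simulate(particle, u):
    # Hypothetical stand-in for train_env.set_state(...) + train_env.step(u):
    # returns (stage cost, aggregate successor state id) for a toy 3-state MDP.
    return random.uniform(0.0, 1.0) + 0.1 * u, random.randrange(3)

def lookahead_control(controls, particles, J, gamma=0.99, mc_samples=10):
    Q = []
    for u in controls:
        samples = []
        for _ in range(mc_samples):
            c, next_state = simulate(random.choice(particles), u)
            samples.append(c + gamma * J[next_state])  # stage cost plus cost-to-go
        Q.append(np.mean(samples))
    best = int(np.argmin(Q))  # minimize the estimated cost
    return controls[best], float(Q[best])

J = np.array([0.5, 2.0, 1.0])  # toy cost-to-go over three aggregate states
print(lookahead_control(controls=[0, 1, 2], particles=[None], J=J))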
@@ -130,8 +163,8 @@ def base_policy(x, mu, id_to_state):
                                           decoy_optimization=False)
     N = 10000
     max_env_steps = 100
-    mu = np.loadtxt("./mu.txt")
-    J = np.loadtxt("./J.txt")
+    mu = np.loadtxt("test/mu2.txt")
+    J = np.loadtxt("test/J2.txt")
     X, state_to_id, id_to_state = Cage2AggregateMDP.X()
     gamma = 0.99
     l = 1
@@ -145,21 +178,24 @@ def base_policy(x, mu, id_to_state):
         particles = env.initial_particles
         while not done and t < max_env_steps:
             monte_carlo_state = monte_carlo_most_frequent_particle(particles=particles, N=100)
-            # u = restore_policy(x=x)
-            u = restore_policy(x=monte_carlo_state)
+            u = restore_policy(x=monte_carlo_state, train_env=train_env, particles=particles)
+            if t <= 2:
+                u = 31
             if u == -1:
                 # u = base_policy(x=monte_carlo_state, mu=mu, id_to_state=id_to_state)
-                # u = base_policy(x=x, mu=mu, id_to_state=id_to_state)
-                u = rollout_policy(x=x, state_to_id=state_to_id, id_to_state=id_to_state,
-                                   train_env=train_env, J=J, mu=mu, gamma=gamma, l=l)[0]
+                u = rollout_policy(state_to_id=state_to_id, id_to_state=id_to_state, train_env=train_env, J=J, mu=mu,
+                                   gamma=gamma, l=l, particles=particles, mc_samples=20)[0]
             _, _, _, _, info = env.step(u)
             particles = particle_filter(particles=particles, max_num_particles=50,
                                         train_env=train_env, u=u, obs=info[agents_constants.COMMON.OBSERVATION],
                                         x_s=info[agents_constants.COMMON.STATE])
             c = -info[agents_constants.COMMON.REWARD]
-            C += math.pow(gamma, t - 1) * c
-            print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
-                  f"aggstate: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
+            C += c
+            # aggstate = id_to_state[Cage2AggregateMDP.get_aggregate_state(s=monte_carlo_state,
+            #                                                              state_to_id=state_to_id)]
+            # print(f"t:{t}, u: {u}, c: {c}, a: {action_id_to_type_and_host[u]}, C: {C}, "
+            #       f"aggstate: {aggstate},"
+            #       f"true state: {id_to_state[Cage2AggregateMDP.get_aggregate_state(s=x, state_to_id=state_to_id)]}")
             x = info[agents_constants.COMMON.STATE]
             t += 1
         returns.append(C)
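
The evaluation loop above accumulates the per-episode cost C (with c = -reward) and appends it to returns. How the N episode values are summarized is not shown in this hunk; a typical aggregation (an assumption, not part of the diff) would be:

import numpy as np

returns = [120.0, 95.5, 110.25]  # toy per-episode costs; the script collects N of these
mean_cost = float(np.mean(returns))
std_err = float(np.std(returns) / np.sqrt(len(returns)))
print(f"average episode cost: {mean_cost:.2f} +/- {std_err:.2f}")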

examples/eval/cyborg_scenario_two/test/.gitignore

-3
This file was deleted.

0 commit comments
