
Commit 53870c9

cage-2 agg mdp
1 parent 5346cac commit 53870c9

File tree: 9 files changed, +726 −73 lines changed


examples/eval/cyborg_scenario_two/cage2_aggregate_mdp.py

+480
Large diffs are not rendered by default.
@@ -0,0 +1,149 @@
import numpy as np
import copy
from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper
from gym_csle_cyborg.dao.red_agent_type import RedAgentType
from gym_csle_cyborg.dao.csle_cyborg_wrapper_config import CSLECyborgWrapperConfig
from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil
from gym_csle_cyborg.dao.cyborg_wrapper_state import CyborgWrapperState
from cyborg_agg_mdp import Cage2AggregateMDP


# def monte_carlo_most_frequent(elements, num_samples):
#     if not elements:
#         raise ValueError("The input list is empty.")
#
#     # Perform random sampling
#     samples = [random.choice(elements) for _ in range(num_samples)]
#
#     # Count occurrences of sampled elements
#     counter = Counter(samples)
#
#     # Find the most common element
#     most_frequent_element = counter.most_common(1)[0][0]
#     return most_frequent_element


# def particle_filter(particles, max_num_particles, train_env, action, obs):
#     new_particles = []
#     while len(particles) < max_num_particles:
#         x = random.choice(particles)
#         train_env.set_state(state=x)
#         _, r, _, _, info = train_env.step(action)
#         s_prime = info["s"]
#         o = info["o"]
#         if o == obs:
#             new_particles.append(s_prime)
#     return new_particles


def restore_policy(s: CyborgWrapperState):
    """Hard-coded response policy: returns a defensive action for a compromised host, or -1 if no host is flagged."""
    a = -1
    if s.s[1][2] == 2:
        a = 0  # Ent0
    if s.s[2][2] == 2:
        a = 1  # Ent1
    if s.s[3][2] == 2:
        a = 2  # Ent2
    if s.s[7][2] == 2:
        a = 3  # Opserver

    if s.s[1][2] == 1:
        a = 8  # Ent0
    if s.s[2][2] == 1:
        a = 9  # Ent1
    if s.s[3][2] == 1:
        a = 10  # Ent2
    if s.s[7][2] == 1:
        a = 11  # Opserver
    if s.s[9][2] == 1:
        a = 22  # User1
    if s.s[10][2] == 1:
        a = 23  # User2
    if s.s[11][2] == 1:
        a = 24  # User3
    if s.s[12][2] == 1:
        a = 25  # User4
    return a


def rollout(s: CyborgWrapperState, train_env: CyborgScenarioTwoWrapper, J, state_to_id, mu, l, gamma=0.99):
    """l-step rollout lookahead that uses the aggregate-MDP cost-to-go J as terminal cost."""
    # U = [0, 1, 2, 3, 8, 9, 10, 11, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 35]
    U = [27, 28, 29, 30, 31, 32, 35]
    U = [27, 28, 29, 30, 31, 32]
    Q_n = []
    for u in U:
        u_r = restore_policy(s=s)
        if u_r != -1:
            # The restore policy overrides the candidate control u
            train_env.set_state(s)
            o, c, done, _, info = train_env.step(action=u_r)
            s_prime = info["s"]
            aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s_prime, state_to_id=state_to_id)
            if l == 1:
                return u_r, J[aggregate_state]
            else:
                returns = []
                for i in range(2):
                    returns.append(rollout(copy.deepcopy(s_prime), train_env=train_env, J=J,
                                           state_to_id=state_to_id, mu=mu, l=l - 1)[1])
                cost_to_go = np.mean(returns)
        else:
            train_env.set_state(s)
            o, c, done, _, info = train_env.step(action=u)
            s_prime = info["s"]
            aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s_prime, state_to_id=state_to_id)
            if l == 1:
                cost_to_go = J[aggregate_state]
            else:
                returns = []
                for i in range(2):
                    returns.append(rollout(copy.deepcopy(s_prime), train_env=train_env, J=J,
                                           state_to_id=state_to_id, mu=mu, l=l - 1)[1])
                cost_to_go = np.mean(returns)
        Q_n.append(-c + gamma * cost_to_go)
    # print(Q_n)
    # print(U[int(np.argmin(Q_n))])
    u_star = int(np.argmin(Q_n))
    return U[u_star], Q_n[u_star]


if __name__ == '__main__':
    config = CSLECyborgWrapperConfig(maximum_steps=100, gym_env_name="",
                                     save_trace=False, reward_shaping=False, scenario=2,
                                     red_agent_type=RedAgentType.B_LINE_AGENT)
    env = CyborgScenarioTwoWrapper(config=config)
    train_env = CyborgScenarioTwoWrapper(config=config)
    action_id_to_type_and_host, type_and_host_to_action_id \
        = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True,
                                         decoy_optimization=False)
    N = 10000
    max_env_steps = 100
    mu = np.loadtxt("./mu1.txt")
    J = np.loadtxt("./J1.txt")
    X, state_to_id, id_to_state = Cage2AggregateMDP.X()
    gamma = 0.99
    l = 3
    returns = []
    for i in range(N):
        print(f"{i}/{N}")
        done = False
        _, info = env.reset()
        s = info["s"]
        t = 1
        R = 0
        particles = env.initial_particles
        while not done and t < max_env_steps:
            # monte_carlo_state = monte_carlo_most_frequent(elements=particles, num_samples=100)
            aggregate_state = Cage2AggregateMDP.get_aggregate_state(s=s, state_to_id=state_to_id)
            a = restore_policy(s=s)
            if t <= 1:
                a = 31
            if a == -1:
                a = Cage2AggregateMDP.get_aggregate_control(mu=mu, aggregate_state=aggregate_state,
                                                            id_to_state=id_to_state)
            # print(f"base: {a}")
            a = rollout(s=s, state_to_id=state_to_id, train_env=train_env, J=J, mu=mu, gamma=gamma, l=l)[0]
            # print(f"rollout: {a}")
            o, r, done, _, info = env.step(a)
            # particles = particle_filter(particles=particles, max_num_particles=1000,
            #                             train_env=train_env, action=a, obs=o)
            s = info["s"]
            t += 1
            R += r
            # print(f"t:{t}, r: {r}, a: {action_id_to_type_and_host[a]}, R: {R}, aggstate: {id_to_state[aggregate_state]}")
        returns.append(R)
    print(np.mean(returns))
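
The core of the script above is an l-step lookahead: for every candidate control, simulate a step, evaluate the precomputed aggregate-MDP cost-to-go J at the resulting aggregate state, and pick the control with the smallest total cost; for depths greater than one it averages two simulated continuations per branch. The sketch below illustrates only that recursion on a deterministic toy MDP. The names P, C, J, and lookahead are invented for illustration, and the toy uses stage costs directly, whereas the script above negates the wrapper's reward (-c) and simulates with the CybORG wrapper.

import numpy as np

# Toy deterministic MDP (3 states, 2 controls); transitions, costs, and J are made up.
P = {0: [1, 2], 1: [0, 2], 2: [2, 2]}               # P[s][u] = next state
C = {0: [1.0, 0.5], 1: [0.2, 1.0], 2: [0.0, 0.0]}   # C[s][u] = stage cost
J = np.array([0.6, 0.3, 0.0])                       # base value function (e.g., from value iteration)


def lookahead(s, l, gamma=0.99):
    """Return (best control, estimated cost-to-go) of an l-step lookahead with J as terminal cost."""
    q = []
    for u in (0, 1):
        s_prime = P[s][u]
        tail = J[s_prime] if l == 1 else lookahead(s_prime, l - 1, gamma)[1]
        q.append(C[s][u] + gamma * tail)
    u_star = int(np.argmin(q))
    return u_star, q[u_star]


print(lookahead(s=0, l=3))  # control 1 is cheapest from state 0 in this toy model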

examples/eval/cyborg_scenario_two/eval_on_base_env.py

+2-7
@@ -20,22 +20,17 @@
 random.seed(seed)
 np.random.seed(seed)
 torch.manual_seed(seed)
-# print(csle_cyborg_env.action_id_to_type_and_host)
-# import sys
-# sys.exit(0)
-# print("Starting policy evaluation")
 for i in range(num_evaluations):
     o, _ = csle_cyborg_env.reset()
     R = 0
     t = 0
     while t < max_horizon:
-        # a = ppo_policy.action(o=o)
-        a = 4
+        a = ppo_policy.action(o=o)
         o, r, done, _, info = csle_cyborg_env.step(a)
         table = csle_cyborg_env.get_true_table()
         print(table)
         print(r)
         R += r
         t += 1
     returns.append(R)
-    # print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}")
+    print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}")

examples/eval/cyborg_scenario_two/evaluate_on_wrapper_env.py

-1
@@ -19,7 +19,6 @@
 random.seed(seed)
 np.random.seed(seed)
 torch.manual_seed(seed)
-print("Starting policy evaluation")
 for i in range(num_evaluations):
     o, _ = env.reset()
     R = 0
@@ -1,94 +1,112 @@
+from typing import List
 import numpy as np
-import torch
-import random
-import json
-import io
-from gym_csle_cyborg.dao.csle_cyborg_config import CSLECyborgConfig
-from gym_csle_cyborg.dao.red_agent_type import RedAgentType
-from gym_csle_cyborg.envs.cyborg_scenario_two_defender import CyborgScenarioTwoDefender
+import time
 from gym_csle_cyborg.envs.cyborg_scenario_two_wrapper import CyborgScenarioTwoWrapper
 from gym_csle_cyborg.dao.csle_cyborg_wrapper_config import CSLECyborgWrapperConfig
 from csle_agents.agents.pomcp.pomcp import POMCP
 from csle_agents.agents.pomcp.pomcp_acquisition_function_type import POMCPAcquisitionFunctionType
 import csle_agents.constants.constants as agents_constants
 from csle_common.logging.log import Logger
+from gym_csle_cyborg.util.cyborg_env_util import CyborgEnvUtil
+from gym_csle_cyborg.dao.red_agent_type import RedAgentType
+
+
+def heuristic_value(o: List[List[int]]) -> float:
+    """
+    A heuristic value function
+
+    :param o: the observation vector
+    :return: the value
+    """
+    host_costs = CyborgEnvUtil.get_host_compromised_costs()
+    val = 0
+    for i in range(len(o)):
+        if o[i][2] > 0:
+            val += host_costs[i]
+    return val
+

 if __name__ == '__main__':
-    # ppo_policy = PPOPolicy(model=None, simulation_name="", save_path="")
-    config = CSLECyborgConfig(
-        gym_env_name="csle-cyborg-scenario-two-v1", scenario=2, baseline_red_agents=[RedAgentType.B_LINE_AGENT],
-        maximum_steps=100, red_agent_distribution=[1.0], reduced_action_space=True, decoy_state=True,
-        scanned_state=True, decoy_optimization=False, cache_visited_states=False)
-    eval_env = CyborgScenarioTwoDefender(config=config)
-    config = CSLECyborgWrapperConfig(maximum_steps=100, gym_env_name="",
-                                     save_trace=False, reward_shaping=False, scenario=2)
+    config = CSLECyborgWrapperConfig(
+        gym_env_name="csle-cyborg-scenario-two-wrapper-v1", maximum_steps=100, save_trace=False, scenario=2,
+        reward_shaping=True, red_agent_type=RedAgentType.B_LINE_AGENT)
+    eval_env = CyborgScenarioTwoWrapper(config=config)
     train_env = CyborgScenarioTwoWrapper(config=config)
+    action_id_to_type_and_host, type_and_host_to_action_id \
+        = CyborgEnvUtil.get_action_dicts(scenario=2, reduced_action_space=True, decoy_state=True,
+                                         decoy_optimization=False)

-    num_evaluations = 10
-    max_horizon = 100
-    returns = []
-    seed = 215125
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
+    N = 5000
+    rollout_policy = lambda x, deterministic: 35
+    value_function = heuristic_value
     A = train_env.get_action_space()
-    gamma = 0.75
-    c = 1
-    print("Starting policy evaluation")
-    for i in range(num_evaluations):
+    gamma = 0.99
+    reinvigoration = False
+    reinvigorated_particles_ratio = 0.0
+    initial_particles = train_env.initial_particles
+    planning_time = 3.75
+    prune_action_space = False
+    max_particles = 1000
+    max_planning_depth = 50
+    max_rollout_depth = 4
+    c = 0.5
+    c2 = 15000
+    use_rollout_policy = False
+    prior_weight = 5
+    prior_confidence = 0
+    acquisition_function_type = POMCPAcquisitionFunctionType.UCB
+    log_steps_frequency = 1
+    max_negative_samples = 20
+    default_node_value = 0
+    verbose = False
+    eval_batch_size = 100
+    max_env_steps = 100
+    prune_size = 3
+    start = time.time()
+
+    # Run N episodes
+    returns = []
+    for i in range(N):
+        done = False
+        action_sequence = []
         _, info = eval_env.reset()
         s = info[agents_constants.COMMON.STATE]
         train_env.reset()
-        initial_particles = train_env.initial_particles
-        max_particles = 1000
-        planning_time = 60
-        value_function = lambda x: 0
-        reinvigoration = False
-        rollout_policy = False
-        verbose = False
-        default_node_value = 0
-        prior_weight = 1
-        acquisition_function_type = POMCPAcquisitionFunctionType.UCB
-        use_rollout_policy = False
-        reinvigorated_particles_ratio = False
-        prune_action_space = False
-        prune_size = 3
-        prior_confidence = 0
         pomcp = POMCP(A=A, gamma=gamma, env=train_env, c=c, initial_particles=initial_particles,
                       planning_time=planning_time, max_particles=max_particles, rollout_policy=rollout_policy,
                       value_function=value_function, reinvigoration=reinvigoration, verbose=verbose,
                       default_node_value=default_node_value, prior_weight=prior_weight,
-                      acquisition_function_type=acquisition_function_type, c2=1500,
+                      acquisition_function_type=acquisition_function_type, c2=c2,
                       use_rollout_policy=use_rollout_policy, prior_confidence=prior_confidence,
                       reinvigorated_particles_ratio=reinvigorated_particles_ratio,
                       prune_action_space=prune_action_space, prune_size=prune_size)
-        rollout_depth = 4
-        planning_depth = 100
         R = 0
-        t = 0
-        action_sequence = []
-        while t < max_horizon:
-            pomcp.solve(max_rollout_depth=rollout_depth, max_planning_depth=planning_depth)
+        t = 1
+
+        # Run episode
+        while not done and t <= max_env_steps:
+            rollout_depth = max_rollout_depth
+            planning_depth = max_planning_depth
+            pomcp.solve(max_rollout_depth=rollout_depth, max_planning_depth=planning_depth, t=t)
             action = pomcp.get_action()
-            o, r, done, _, info = eval_env.step(action)
+            o, _, done, _, info = eval_env.step(action)
+            r = info[agents_constants.COMMON.REWARD]
             action_sequence.append(action)
             s_prime = info[agents_constants.COMMON.STATE]
             obs_id = info[agents_constants.COMMON.OBSERVATION]
-            pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=obs_id)
-            print(eval_env.get_true_table())
-            print(eval_env.get_table())
+            pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=obs_id, t=t)
             R += r
             t += 1
-            Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action}, r: {r}, o: {obs_id}, "
-                                                f"s_prime: {s_prime},"
-                                                f", action sequence: {action_sequence}, R: {R}")
+            if t % log_steps_frequency == 0:
+                Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action_id_to_type_and_host[action]}, r: {r}, "
+                                                    f"action sequence: {action_sequence}, R: {round(R, 2)}")
+
+        # Logging
         returns.append(R)
-        print(f"{i}/{num_evaluations}, avg R: {np.mean(returns)}, R: {R}")
-        results = {}
-        results["seed"] = seed
-        results["training_time"] = 0
-        results["returns"] = returns
-        results["planning_time"] = planning_time
-        json_str = json.dumps(results, indent=4, sort_keys=True)
-        with io.open(f"/Users/kim/pomcp_{0}_60s.json", 'w', encoding='utf-8') as f:
-            f.write(json_str)
+        progress = round((i + 1) / N, 2)
+        time_elapsed_minutes = round((time.time() - start) / 60, 3)
+        Logger.__call__().get_logger().info(
+            f"[POMCP] episode: {i}, J:{R}, "
+            f"J_avg: {np.mean(returns)}, "
+            f"progress: {round(progress * 100, 2)}%, "
+            f"runtime: {time_elapsed_minutes} min")

simulation-system/libs/csle-agents/src/csle_agents/constants/constants.py

+1
@@ -57,6 +57,7 @@ class COMMON:
     EVALUATE_WITH_DISCOUNT = "evaluate_with_discount"
     STATE = "s"
     OBSERVATION = "o"
+    REWARD = "r"


 class PPO:

simulation-system/libs/gym-csle-cyborg/src/gym_csle_cyborg/constants/constants.py

+1
@@ -31,6 +31,7 @@ class ENV_METRICS:
     ATTACKER_ACTION = "a2"
     OBSERVATION = "o"
     OBSERVATION_VECTOR = "obs_vec"
+    REWARD = "r"
     OBSERVATION_ID = "obs_id"
     TIME_STEP = "t"
     AVERAGE_UPPER_BOUND_RETURN = "average_upper_bound_return"
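
Both constants classes gain a REWARD = "r" key, which is what lets the POMCP script above read the reward from the step info dict instead of unpacking it from the step return. A small sketch of the intended usage follows; the info dict is hand-built for illustration.

import csle_agents.constants.constants as agents_constants
import gym_csle_cyborg.constants.constants as env_constants

# Hand-built stand-in for the info dict returned by the wrapper's step()
info = {agents_constants.COMMON.REWARD: -1.5, agents_constants.COMMON.STATE: 0}
r = info[agents_constants.COMMON.REWARD]  # -1.5, no hard-coded "r" string needed
assert agents_constants.COMMON.REWARD == env_constants.ENV_METRICS.REWARD == "r"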
