Source code for yawning_titan.envs.generic.core.reward_functions

"""
A collection of reward functions used by the generic network environment.

You can select the reward function that you wish to use in the config file under settings.
The reward functions take in a parameter called args. args is a dictionary that contains the
following information:
    -network_interface: Interface with the network
    -blue_action: The action that the blue agent has taken this turn
    -blue_node: The node that the blue agent has targeted for their action
    -start_state: The state of the nodes before the blue agent has taken their action
    -end_state: The state of the nodes after the blue agent has taken their action
    -start_vulnerabilities: The vulnerabilities before the blue agent's turn
    -end_vulnerabilities: The vulnerabilities after the blue agent's turn
    -start_isolation: The isolation status of all the nodes at the start of a turn
    -end_isolation: The isolation status of all the nodes at the end of a turn
    -start_blue: The env as the blue agent can see it before the blue agent's turn
    -end_blue: The env as the blue agent can see it after the blue agent's turn

The reward function returns a single number (integer or float) that is the blue agent's reward for that turn.
An illustrative usage sketch is included at the end of this module.
"""

# Functions:
from __future__ import annotations

import math

from yawning_titan.envs.generic.core.network_interface import NetworkInterface

REMOVE_RED_POINTS = []
for i in range(0, 101):
    REMOVE_RED_POINTS.append(round(math.exp(-0.004 * i), 4))

REDUCE_VULNERABILITY_POINTS = []
for i in range(1, 20):
    REDUCE_VULNERABILITY_POINTS.append(2 / (10 + math.exp(4 - 10 * (i / 20))) + 0.5)

SCANNING_USAGE_POINTS = []
for i in range(0, 100):
    SCANNING_USAGE_POINTS.append(-math.exp(-i) + 1)
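
# Pre-computed lookup tables used for reward shaping. REMOVE_RED_POINTS decays
# from 1.0 (index 0) to roughly 0.67 (index 100) and is indexed below by the
# rounded percentage of nodes that remain compromised, so clearing red pays
# slightly more when most of the network is already safe. The
# REDUCE_VULNERABILITY_POINTS and SCANNING_USAGE_POINTS tables are built here
# but are not referenced by the reward functions in this module.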


def standard_rewards(args: dict) -> float:
    """
    Calculate the reward for the current state of the environment.

    Actions cost a certain amount and blue gets rewarded for removing red
    nodes and reducing the vulnerability of nodes.

    Args:
        args: A dictionary containing the following items:
            network_interface: Interface with the network
            blue_action: The action that the blue agent has taken this turn
            blue_node: The node that the blue agent has targeted for their action
            start_state: The state of the nodes before the blue agent has taken their action
            end_state: The state of the nodes after the blue agent has taken their action
            start_vulnerabilities: The vulnerabilities before the blue agent's turn
            end_vulnerabilities: The vulnerabilities after the blue agent's turn
            start_isolation: The isolation status of all the nodes at the start of a turn
            end_isolation: The isolation status of all the nodes at the end of a turn
            start_blue: The env as the blue agent can see it before the blue agent's turn
            end_blue: The env as the blue agent can see it after the blue agent's turn

    Returns:
        The reward earned for this specific turn for the blue agent
    """
    # Get information about the current state of the environment
    network_interface: NetworkInterface = args["network_interface"]
    blue_action = args["blue_action"]
    start_state = args["start_state"]
    end_state = args["end_state"]
    start_vulnerabilities = args["start_vulnerabilities"]
    end_vulnerabilities = args["end_vulnerabilities"]
    start_isolation = args["start_isolation"]
    end_isolation = args["end_isolation"]
    start_blue = args["start_blue"]
    end_blue = args["end_blue"]

    # cost for actions
    action_cost = {
        "reduce_vulnerability": 0.5,
        "restore_node": 1,
        "make_node_safe": 0.5,
        "scan": 0,
        "isolate": 1,
        "connect": 0,
        "do_nothing": -0.5,
        "add_deceptive_node": 8,
    }

    # prevent the isolate cost from being counted twice (it is charged per isolated node below)
    reward = -action_cost[blue_action] if blue_action != "isolate" else 0

    # punish the agent for every node it has isolated
    reward += -action_cost["isolate"] * sum(end_isolation.values())

    # calculate the number of red nodes before and after the blue agent's turn
    initial_cumulative_states = sum(start_state.values())
    final_cumulative_states = sum(end_state.values())

    # punish the agent for doing nothing if there are large numbers of red controlled nodes in the environment
    if blue_action == "do_nothing":
        reward = reward - (0.2 * final_cumulative_states)

    if blue_action == "connect":
        if sum(end_isolation.values()) < sum(start_isolation.values()):
            reward += 5
        else:
            reward -= 5

    # rewards for removing red nodes
    if initial_cumulative_states > final_cumulative_states:
        reward += REMOVE_RED_POINTS[
            round(
                100
                * final_cumulative_states
                / network_interface.current_graph.number_of_nodes()
            )
        ]

    # punish the agent if a large proportion of the nodes are compromised and it is not fixing them
    if blue_action != "make_node_safe" and blue_action != "restore_node":
        amount = (
            final_cumulative_states
            / network_interface.current_graph.number_of_nodes()
        )
        if amount > 0.3:
            reward = reward - amount + 0.3

    # punish the blue agent for patching nodes that are already safe
    if blue_action == "make_node_safe" or blue_action == "restore_node":
        if initial_cumulative_states == final_cumulative_states:
            reward = reward - 3

    # punish the blue agent for reducing the vulnerability of a node that is already at minimum vulnerability
    if blue_action == "reduce_vulnerability" or blue_action == "restore_node":
        if sum(start_vulnerabilities.values()) == sum(end_vulnerabilities.values()):
            reward = reward - 0.5

    # reward for revealing red
    if blue_action == "scan":
        number = 0
        for node, value in end_blue.items():
            if value == 1 and start_blue[node] == 0:
                if start_state[node] == 1:
                    number += 1
        if number >= 5:
            reward += 2.5
        else:
            reward += number * 0.5

    # rewards for reducing node vulnerabilities
    if (
        network_interface.game_mode.red.agent_attack.ignores_defences.value is False
        and blue_action == "reduce_vulnerability"
    ):
        initial_cumulative_vuln = sum(start_vulnerabilities.values())
        final_cumulative_vuln = sum(end_vulnerabilities.values())
        reward = reward + (initial_cumulative_vuln - final_cumulative_vuln) * 4

    if blue_action == "add_deceptive_node":
        if network_interface.reached_max_deceptive_nodes:
            reward -= 5

    return reward
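
# Worked example (illustrative numbers, not from the original source): on a
# 10-node network with no isolated nodes, if blue plays "make_node_safe"
# (cost 0.5) and that action removes the only red-controlled node, then
# reward = -0.5 + REMOVE_RED_POINTS[round(100 * 0 / 10)]
#        = -0.5 + REMOVE_RED_POINTS[0]
#        = -0.5 + 1.0
#        = 0.5
# because none of the other penalty branches apply to that action.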


def experimental_rewards(args: dict) -> float:
    """
    Calculate the reward for the current state of the environment.

    Actions cost a certain amount and blue gets rewarded for removing red
    nodes and reducing the vulnerability of nodes.

    Args:
        args: A dictionary containing the following items:
            network_interface: Interface with the network
            blue_action: The action that the blue agent has taken this turn
            blue_node: The node that the blue agent has targeted for their action
            start_state: The state of the nodes before the blue agent has taken their action
            end_state: The state of the nodes after the blue agent has taken their action
            start_vulnerabilities: The vulnerabilities before the blue agent's turn
            end_vulnerabilities: The vulnerabilities after the blue agent's turn
            start_isolation: The isolation status of all the nodes at the start of a turn
            end_isolation: The isolation status of all the nodes at the end of a turn
            start_blue: The env as the blue agent can see it before the blue agent's turn
            end_blue: The env as the blue agent can see it after the blue agent's turn

    Returns:
        The reward earned for this specific turn for the blue agent
    """
    # Get information about the current state of the environment
    network_interface: NetworkInterface = args["network_interface"]
    blue_action = args["blue_action"]
    start_state = args["start_state"]
    end_state = args["end_state"]
    start_blue = args["start_blue"]
    end_blue = args["end_blue"]

    # cost for actions
    action_cost = {
        "reduce_vulnerability": 0,
        "restore_node": 1,
        "make_node_safe": 0,
        "scan": 0,
        "isolate": 10,
        "connect": 0,
        "do_nothing": -0.5,
        "add_deceptive_node": 2,
    }

    reward = -action_cost[blue_action]

    # calculate the number of red nodes before and after the blue agent's turn
    initial_cumulative_states = sum(start_state.values())
    final_cumulative_states = sum(end_state.values())

    # calculate the number of compromised nodes blue can see before and after its turn
    # (blue's view)
    initial_blue_states = sum(start_blue.values())
    final_blue_states = sum(end_blue.values())

    # rewards for removing red nodes
    # gives between 0.5 and 1
    if blue_action == "make_node_safe" or blue_action == "restore_node":
        if initial_blue_states > final_blue_states:
            reward += REMOVE_RED_POINTS[
                round(
                    100
                    * final_cumulative_states
                    / network_interface.current_graph.number_of_nodes()
                )
            ]
        elif initial_cumulative_states > final_cumulative_states:
            pass
        else:
            reward -= 1

    # punish the agent for doing nothing if there are large numbers of red controlled nodes in the environment
    if blue_action == "do_nothing":
        reward = reward - (0.5 * final_cumulative_states)

    vision_score = 0.01 * (final_cumulative_states - final_blue_states) ** 2

    reward -= vision_score

    return reward

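# For illustration: the vision_score term above penalises the gap between the
# true number of compromised nodes and the number blue can see. For example,
# with 4 compromised nodes of which blue has detected only 1, the penalty is
# 0.01 * (4 - 1) ** 2 = 0.09.
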
# A very simple example reward function
def one_per_timestep(args: dict) -> float:
    """
    Give a reward of 0.1 for every timestep that the blue agent is alive.

    Args:
        args: A dictionary containing the following items:
            network_interface: Interface with the network
            blue_action: The action that the blue agent has taken this turn
            blue_node: The node that the blue agent has targeted for their action
            start_state: The state of the nodes before the blue agent has taken their action
            end_state: The state of the nodes after the blue agent has taken their action
            start_vulnerabilities: The vulnerabilities before the blue agent's turn
            end_vulnerabilities: The vulnerabilities after the blue agent's turn
            start_isolation: The isolation status of all the nodes at the start of a turn
            end_isolation: The isolation status of all the nodes at the end of a turn
            start_blue: The env as the blue agent can see it before the blue agent's turn
            end_blue: The env as the blue agent can see it after the blue agent's turn

    Returns:
        0.1
    """
    return 0.1


def zero_reward(args: dict) -> float:
    """
    Return zero reward per timestep.

    Args:
        args: A dictionary containing the following items:
            network_interface: Interface with the network
            blue_action: The action that the blue agent has taken this turn
            blue_node: The node that the blue agent has targeted for their action
            start_state: The state of the nodes before the blue agent has taken their action
            end_state: The state of the nodes after the blue agent has taken their action
            start_vulnerabilities: The vulnerabilities before the blue agent's turn
            end_vulnerabilities: The vulnerabilities after the blue agent's turn
            start_isolation: The isolation status of all the nodes at the start of a turn
            end_isolation: The isolation status of all the nodes at the end of a turn
            start_blue: The env as the blue agent can see it before the blue agent's turn
            end_blue: The env as the blue agent can see it after the blue agent's turn

    Returns:
        0
    """
    return 0


def safe_nodes_give_rewards(args: dict) -> float:
    """
    Give 1 reward for every safe node at that timestep.

    Args:
        args: A dictionary containing the following items:
            network_interface: Interface with the network
            blue_action: The action that the blue agent has taken this turn
            blue_node: The node that the blue agent has targeted for their action
            start_state: The state of the nodes before the blue agent has taken their action
            end_state: The state of the nodes after the blue agent has taken their action
            start_vulnerabilities: The vulnerabilities before the blue agent's turn
            end_vulnerabilities: The vulnerabilities after the blue agent's turn
            start_isolation: The isolation status of all the nodes at the start of a turn
            end_isolation: The isolation status of all the nodes at the end of a turn
            start_blue: The env as the blue agent can see it before the blue agent's turn
            end_blue: The env as the blue agent can see it after the blue agent's turn

    Returns:
        The reward earned for this specific turn for the blue agent
    """
    # Get information about the current state of the environment
    end_state = args["end_state"]

    final_cumulative_states = sum(end_state.values())

    # reward is equal to the number of safe nodes
    reward = len(end_state) - final_cumulative_states

    return reward


def punish_bad_actions(args: dict) -> float:
    """
    Punish the blue agent for taking bad actions (bad moves).

    Args:
        args: A dictionary containing the following items:
            network_interface: Interface with the network
            blue_action: The action that the blue agent has taken this turn
            blue_node: The node that the blue agent has targeted for their action
            start_state: The state of the nodes before the blue agent has taken their action
            end_state: The state of the nodes after the blue agent has taken their action
            start_vulnerabilities: The vulnerabilities before the blue agent's turn
            end_vulnerabilities: The vulnerabilities after the blue agent's turn
            start_isolation: The isolation status of all the nodes at the start of a turn
            end_isolation: The isolation status of all the nodes at the end of a turn
            start_blue: The env as the blue agent can see it before the blue agent's turn
            end_blue: The env as the blue agent can see it after the blue agent's turn

    Returns:
        The reward earned for this specific turn for the blue agent
    """
    # Get information about the current state of the game
    network_interface: NetworkInterface = args["network_interface"]
    blue_action = args["blue_action"]
    start_state = args["start_state"]
    end_state = args["end_state"]
    start_vulnerabilities = args["start_vulnerabilities"]
    end_vulnerabilities = args["end_vulnerabilities"]

    # Get the number of compromised nodes before and after the blue agent's turn
    initial_cumulative_states = sum(start_state.values())
    final_cumulative_states = sum(end_state.values())

    reward = 0

    # punish the agent for doing nothing if there are large numbers of red controlled nodes in the environment
    if blue_action == "do_nothing":
        reward = reward - (0.5 * final_cumulative_states)

    # punish the blue agent for patching nodes that are already safe
    if blue_action == "make_node_safe" or blue_action == "restore_node":
        if initial_cumulative_states == final_cumulative_states:
            reward = reward - (0.2 * initial_cumulative_states)

    # punish the blue agent for reducing the vulnerability of a node that is already at minimum vulnerability
    if blue_action == "reduce_vulnerability" and (
        sum(start_vulnerabilities.values()) == sum(end_vulnerabilities.values())
    ):
        reward = reward - 1

    # punish for adding a deceptive node once the maximum has been reached
    # (which relocates a deceptive node that has already been placed)
    if blue_action == "add_deceptive_node":
        if network_interface.reached_max_deceptive_nodes:
            reward = reward - 5

    return reward


def num_nodes_safe(args: dict) -> float:
    """
    Provide reward based on the proportion of nodes safe within the environment.

    Args:
        args: A dictionary containing information from the environment for the given timestep

    Returns:
        The calculated reward
    """
    total_n_nodes = len(args["end_state"].values())
    n_compromised = sum(args["end_state"].values())
    n_safe = total_n_nodes - n_compromised

    return n_safe / total_n_nodes


def dcbo_cost_func(args: dict) -> float:
    """
    Calculate the cost function for DCBO using a set of fixed action cost values.

    Args:
        args: A dictionary containing the following items:
            network_interface: Interface with the network
            blue_action: The action that the blue agent has taken this turn
            blue_node: The node that the blue agent has targeted for their action
            start_state: The state of the nodes before the blue agent has taken their action
            end_state: The state of the nodes after the blue agent has taken their action
            start_vulnerabilities: The vulnerabilities before the blue agent's turn
            end_vulnerabilities: The vulnerabilities after the blue agent's turn
            start_isolation: The isolation status of all the nodes at the start of a turn
            end_isolation: The isolation status of all the nodes at the end of a turn
            start_blue: The env as the blue agent can see it before the blue agent's turn
            end_blue: The env as the blue agent can see it after the blue agent's turn

    Returns:
        The cost for DCBO
    """
    # Get information about the current state of the environment
    blue_action = args["blue_action"]
    end_state = args["end_state"]

    # cost for actions
    action_cost = {
        "reduce_vulnerability": 0,
        "restore_node": 1,
        "make_node_safe": 1,
        "scan": 0,
        "isolate": 1,
        "connect": 0,
        "do_nothing": 0,
        "add_deceptive_node": 0,
    }

    reward = action_cost[blue_action]

    # calculate the number of red nodes after the blue agent's turn
    final_cumulative_states = sum(end_state.values())

    cost = final_cumulative_states * 10 + reward

    return 0 - cost
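

# Illustrative usage sketch (not part of the upstream module): the node names
# and states below are invented for the example; in practice the generic
# network environment builds the ``args`` dictionary each step. Only the
# simpler reward functions, which read just "end_state", are exercised here.
if __name__ == "__main__":
    example_args = {
        "end_state": {"node_0": 0, "node_1": 1, "node_2": 0},  # 1 = compromised
    }
    print(safe_nodes_give_rewards(example_args))  # 2 safe nodes -> 2
    print(num_nodes_safe(example_args))  # 2 of 3 nodes safe -> ~0.667
    print(one_per_timestep(example_args))  # always 0.1
    print(zero_reward(example_args))  # always 0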