from collections import defaultdict
import random
from typing import Optional
from gymnasium.spaces.space import Space
import numpy as np
from gridmind.policies.soft.base_soft_policy import BaseSoftPolicy
class StochasticStartEpsilonGreedyPolicy(BaseSoftPolicy):
    """Tabular epsilon-greedy policy whose greedy table starts out random.

    With probability ``epsilon`` the policy picks a random action
    (exploration); otherwise it picks the action currently stored for the
    state in ``policy_dict`` (the greedy action).

    "Stochastic start": ``policy_dict`` is a ``defaultdict`` whose default
    factory draws a uniformly random action index, so the first greedy lookup
    of an unseen state assigns (and stores) a random greedy action for it.
    """

    def __init__(
        self,
        num_actions: int,
        action_space: Optional[Space] = None,
        epsilon: float = 0.1,
    ) -> None:
        """Create the policy.

        Args:
            num_actions: Number of discrete actions; actions are indexed
                ``0 .. num_actions - 1``.
            action_space: Optional action space. When given, its ``n`` must
                equal ``num_actions``; it is then used for sampling random
                actions and for validating actions.
            epsilon: Exploration probability, in the closed range [0, 1].

        Raises:
            AssertionError: If ``epsilon`` is outside [0, 1] or
                ``num_actions`` disagrees with ``action_space.n``.
        """
        super().__init__()
        self.action_space = action_space
        self.num_actions = num_actions

        assert 0 <= epsilon <= 1, "epsilon must be in rage 0 to 1."
        assert (
            num_actions == self.action_space.n
            if self.action_space is not None
            else True
        ), "Provided num_actions does not match with number of actions in the provided action_space."

        # Every probability computation and get_action() read self.epsilon,
        # so the validated constructor argument must be stored here.
        self.epsilon = epsilon

        # Unseen states lazily receive a uniformly random greedy action.
        self.policy_dict = defaultdict(
            lambda: random.randint(0, self.num_actions - 1)
        )

    def _get_random_action(self):
        """Return a random action.

        Samples from ``action_space`` when one was provided, otherwise draws
        a uniform integer in ``[0, num_actions - 1]``.
        """
        # Compare against None explicitly (as the rest of the class does):
        # a space object that defines __len__ could otherwise be falsy.
        if self.action_space is not None:
            return self.action_space.sample()
        return random.randint(0, self.num_actions - 1)

    def get_action(self, state):
        """Return an epsilon-greedy action for ``state``.

        Args:
            state: A hashable state, or a one-element 1D numpy array.

        Returns:
            A random action with probability ``epsilon``, else the greedy
            action stored for ``state``.
        """
        # random.random() lies in [0.0, 1.0): a strict '<' explores with
        # probability exactly epsilon, so epsilon == 0 never explores and
        # epsilon == 1 always does.
        if random.random() < self.epsilon:
            return self._get_random_action()
        return self._get_greedy_action(self.convert_to_scalar(state))

    def get_actions(self, states):
        """Return a list with one epsilon-greedy action per state in ``states``."""
        return [
            self.get_action(self.convert_to_scalar(state)) for state in states
        ]

    def _get_greedy_action(self, state):
        """Return the action stored in ``policy_dict`` for ``state``.

        With the default ``defaultdict`` table, looking up an unseen state
        inserts a random action for it as a side effect.

        Raises:
            AssertionError: If an action space was provided and the stored
                action is not contained in it.
        """
        key = self.convert_to_scalar(state)
        action = self.policy_dict[key]
        assert (
            action in self.action_space if self.action_space is not None else True
        ), "Action not in action space!!"
        return action

    def convert_to_scalar(self, state):
        """Unwrap a one-element 1D numpy array into a Python scalar.

        Any value that is not a ``numpy.ndarray`` is returned unchanged.

        Raises:
            AssertionError: If ``state`` is an ndarray that is not 1D with
                exactly one element.
        """
        if not isinstance(state, np.ndarray):
            return state
        assert (
            state.ndim == 1 and state.shape[0] == 1
        ), "State must be a 1D array with one element."
        return state.item()

    def get_action_prob(self, state, action):
        """Return the probability of choosing ``action`` in ``state``.

        Each action gets ``epsilon / num_actions`` from exploration; the
        greedy action additionally gets ``1 - epsilon``.
        """
        greedy_action = self._get_greedy_action(state)
        explore_prob = self.epsilon / self.num_actions
        if action == greedy_action:
            return 1.0 - self.epsilon + explore_prob
        return explore_prob

    def get_all_action_probabilities(self, states):
        """Return the action distribution for every state in ``states``.

        Returns:
            A numpy array built from one row of ``num_actions`` probabilities
            per state, with size-1 dimensions removed by ``squeeze()`` (so a
            single state yields a 1D array).
        """
        # The two probabilities do not depend on the state; compute them once.
        explore_prob = self.epsilon / self.num_actions
        greedy_prob = 1.0 - self.epsilon + explore_prob

        rows = []
        for state in states:
            greedy_action = self._get_greedy_action(state)
            rows.append(
                [
                    greedy_prob if action == greedy_action else explore_prob
                    for action in range(self.num_actions)
                ]
            )
        return np.array(rows).squeeze()

    def update(self, state, action):
        """Set ``action`` as the greedy action for ``state``.

        Raises:
            AssertionError: If an action space was provided and ``action`` is
                not contained in it.
        """
        assert (
            action in self.action_space if self.action_space is not None else True
        ), "Action not in action space!!"
        self.policy_dict[self.convert_to_scalar(state)] = action

    def get_action_deterministic(self, state):
        """Return the greedy action for ``state`` without exploration."""
        return self._get_greedy_action(state=state)

    def set_policy_dict(self, policy_dict):
        """Replace the state-to-action table with ``policy_dict``.

        The mapping is stored as given; if it is a plain ``dict``, unseen
        states no longer receive a random default action.
        """
        self.policy_dict = policy_dict

    def get_policy_dict(self):
        """Return the current state-to-action table."""
        return self.policy_dict