Source code for stochastic_start_epsilon_greedy_policy

from collections import defaultdict
import random
from typing import Optional
from gymnasium.spaces.space import Space
import numpy as np
from gridmind.policies.soft.base_soft_policy import BaseSoftPolicy


[docs]class StochasticStartEpsilonGreedyPolicy(BaseSoftPolicy): """ Epsilon-Greedy Policy is a specific implementation of an epsilon-soft policy. The epsilon-greedy policy is a specific type of action selection strategy where, with a probability ϵ, the agent selects a random action (exploration), and with a probability 1-ϵ, it selects the action with the highest estimated value (greedy action). """ def __init__( self, num_actions: int, action_space: Optional[Space] = None, epsilon: float = 0.1, ) -> None: super().__init__()
[docs] self.action_space = action_space
[docs] self.num_actions = num_actions
[docs] self.epsilon = epsilon
assert epsilon >= 0 and epsilon <= 1, "epsilon must be in rage 0 to 1." assert ( num_actions == self.action_space.n if self.action_space is not None else True ), "Provided num_actions does not match with number of actions in the provided action_space."
[docs] self.policy_dict = defaultdict(lambda: random.randint(0, self.num_actions - 1))
[docs] def _get_random_action(self): if self.action_space: random_action = self.action_space.sample() return random_action random_action = random.randint(0, self.num_actions - 1) return random_action
[docs] def get_action(self, state): if random.random() <= self.epsilon: action = self._get_random_action() else: state = self.convert_to_scalar(state) action = self._get_greedy_action(state) return action
[docs] def get_actions(self, states): actions = [] for state in states: state = self.convert_to_scalar(state) action = self.get_action(state) actions.append(action) return actions
[docs] def _get_greedy_action(self, state): state = self.convert_to_scalar(state) action = self.policy_dict[state] assert ( action in self.action_space if self.action_space is not None else True ), "Action not in action space!!" return action
[docs] def convert_to_scalar(self, state): if isinstance(state, np.ndarray): # Assert that state has only one dimension and one element assert ( state.ndim == 1 and state.shape[0] == 1 ), "State must be a 1D array with one element." # Convert numpy array to scalar state = state.item() return state
[docs] def get_action_prob(self, state, action): greedy_action = self._get_greedy_action(state) each_random_action_prob = self.epsilon / self.num_actions greedy_action_prob = 1.0 - self.epsilon + each_random_action_prob action_probs = ( greedy_action_prob if action == greedy_action else each_random_action_prob ) return action_probs
[docs] def get_all_action_probabilities(self, states): action_probs_list = [] for state in states: action_probs = [] greedy_action = self._get_greedy_action(state) each_random_action_prob = self.epsilon / self.num_actions greedy_action_prob = 1.0 - self.epsilon + each_random_action_prob for action in range(self.num_actions): prob = ( greedy_action_prob if action == greedy_action else each_random_action_prob ) action_probs.append(prob) action_probs_list.append(action_probs) action_probs_arr = np.array(action_probs_list).squeeze() return action_probs_arr
[docs] def update(self, state, action): assert ( action in self.action_space if self.action_space is not None else True ), "Action not in action space!!" state = self.convert_to_scalar(state) self.policy_dict[state] = action
[docs] def get_action_deterministic(self, state): action = self._get_greedy_action(state=state) return action
[docs] def set_policy_dict(self, policy_dict): self.policy_dict = policy_dict
[docs] def get_policy_dict(self): return self.policy_dict