import copy
import random

import gymnasium as gym
import numpy as np
from gymnasium.spaces import Box, Discrete


class SimpleContextualBandit(gym.Env):
    """Simple env with 2 states and 3 actions (arms): 0, 1, and 2.

    Episodes last for exactly one timestep. The two possible observations are
    [-1.0, 1.0] and [1.0, -1.0], where the first element is the "current
    context". The highest reward (+10.0) is received for selecting arm 0 for
    context=1.0 and arm 2 for context=-1.0. Arm 1 always yields 0.0 reward.
    """

    def __init__(self, config=None):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=-1.0, high=1.0, shape=(2,))
        self.cur_context = None

    def reset(self, *, seed=None, options=None):
        # Draw a new context uniformly from {-1.0, 1.0}.
        self.cur_context = random.choice([-1.0, 1.0])
        return np.array([self.cur_context, -self.cur_context]), {}

    def step(self, action):
        # Per-context reward table: arm rewards for contexts -1.0 and 1.0.
        rewards_for_context = {
            -1.0: [-10, 0, 10],
            1.0: [10, 0, -10],
        }
        reward = rewards_for_context[self.cur_context][action]
        return (
            np.array([-self.cur_context, self.cur_context]),
            reward,
            True,  # terminated: bandit episodes last a single step.
            False,  # truncated
            {"regret": 10 - reward},
        )
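

# A minimal usage sketch (not part of the original module; the helper name is
# hypothetical): play one episode of SimpleContextualBandit with the optimal
# arm. Arm 0 is best for context=1.0 and arm 2 for context=-1.0, so the
# reward should be +10.0 and the reported regret zero.
def _demo_simple_contextual_bandit():
    env = SimpleContextualBandit()
    obs, _ = env.reset()
    # The first observation element is the current context.
    action = 0 if obs[0] > 0 else 2
    _, reward, terminated, _, info = env.step(action)
    assert terminated and reward == 10.0 and info["regret"] == 0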


class LinearDiscreteEnv(gym.Env):
    """Samples data from linearly parameterized arms.

    The reward for context X and arm i is given by X^T * theta_i, for some
    latent set of parameters {theta_i : i = 1, ..., k}.
    The thetas are sampled uniformly at random, the contexts are Gaussian,
    and Gaussian noise is added to the rewards.
    """

    DEFAULT_CONFIG_LINEAR = {
        "feature_dim": 8,
        "num_actions": 4,
        "reward_noise_std": 0.01,
    }

    def __init__(self, config=None):
        self.config = copy.copy(self.DEFAULT_CONFIG_LINEAR)
        if isinstance(config, dict):
            self.config.update(config)

        self.feature_dim = self.config["feature_dim"]
        self.num_actions = self.config["num_actions"]
        self.sigma = self.config["reward_noise_std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-10, high=10, shape=(self.feature_dim,))

        # Latent arm parameters: one unit-norm theta vector per arm.
        self.thetas = np.random.uniform(-1, 1, (self.num_actions, self.feature_dim))
        self.thetas /= np.linalg.norm(self.thetas, axis=1, keepdims=True)

        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        return np.random.normal(scale=1 / 3, size=(self.feature_dim,))

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        assert (
            self._current_context is not None
        ), "Cannot call env.step() before calling reset()"
        assert action < self.num_actions, "Invalid action."
        action = int(action)
        self._elapsed_steps += 1

        context = self._current_context
        # Noise-free expected rewards X^T * theta_i for all arms.
        rewards = self.thetas.dot(context)
        opt_action = rewards.argmax()
        regret = rewards.max() - rewards[action]

        # Add Gaussian noise to the observed rewards.
        rewards += np.random.normal(scale=self.sigma, size=rewards.shape)
        reward = rewards[action]

        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,  # terminated: bandit episodes last a single step.
            False,  # truncated
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError
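

# A hedged usage sketch (not part of the original module; the helper name and
# config values are illustrative): estimate the mean per-step regret of a
# uniform-random policy on LinearDiscreteEnv. The "regret" info field is the
# noise-free gap between the best arm and the chosen arm for each context.
def _demo_linear_discrete_env(num_steps=1000):
    env = LinearDiscreteEnv({"feature_dim": 8, "num_actions": 4})
    regrets = []
    for _ in range(num_steps):
        env.reset()
        _, _, _, _, info = env.step(env.action_space.sample())
        regrets.append(info["regret"])
    print("mean regret of random policy:", np.mean(regrets))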


class WheelBanditEnv(gym.Env):
    """Wheel bandit environment for 2D contexts
    (see https://arxiv.org/abs/1802.09127).
    """

    DEFAULT_CONFIG_WHEEL = {
        "delta": 0.5,
        "mu_1": 1.2,
        "mu_2": 1,
        "mu_3": 50,
        "std": 0.01,
    }

    feature_dim = 2
    num_actions = 5

    def __init__(self, config=None):
        self.config = copy.copy(self.DEFAULT_CONFIG_WHEEL)
        if isinstance(config, dict):
            self.config.update(config)

        self.delta = self.config["delta"]
        self.mu_1 = self.config["mu_1"]
        self.mu_2 = self.config["mu_2"]
        self.mu_3 = self.config["mu_3"]
        self.std = self.config["std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-1, high=1, shape=(self.feature_dim,))

        # Mean rewards: arm 0 pays mu_1; arms 1-4 pay mu_2 unless boosted below.
        self.means = [self.mu_1] + 4 * [self.mu_2]
        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        # Rejection-sample a context uniformly from the unit disk.
        while True:
            state = np.random.uniform(-1, 1, self.feature_dim)
            if np.linalg.norm(state) <= 1:
                return state

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        assert (
            self._current_context is not None
        ), "Cannot call env.step() before calling reset()"
        action = int(action)
        self._elapsed_steps += 1

        rewards = [
            np.random.normal(self.means[j], self.std) for j in range(self.num_actions)
        ]
        context = self._current_context
        r_big = np.random.normal(self.mu_3, self.std)

        if np.linalg.norm(context) >= self.delta:
            # Outside the inner circle, the quadrant determines the optimal arm.
            if context[0] > 0:
                if context[1] > 0:
                    # First quadrant.
                    rewards[1] = r_big
                    opt_action = 1
                else:
                    # Fourth quadrant.
                    rewards[4] = r_big
                    opt_action = 4
            else:
                if context[1] > 0:
                    # Second quadrant.
                    rewards[2] = r_big
                    opt_action = 2
                else:
                    # Third quadrant.
                    rewards[3] = r_big
                    opt_action = 3
        else:
            # Smaller inner region (norm < delta) where action 0 is optimal.
            opt_action = 0

        reward = rewards[action]
        regret = rewards[opt_action] - reward
        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,  # terminated: bandit episodes last a single step.
            False,  # truncated
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError
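

# A hedged usage sketch (not part of the original module; the helper name is
# illustrative): check the wheel geometry. For a context outside the circle
# of radius delta, the optimal arm reported in `info` is the quadrant-specific
# one (arms 1-4); inside the circle, arm 0 is optimal.
def _demo_wheel_bandit_env():
    env = WheelBanditEnv()
    obs, _ = env.reset()
    _, _, _, _, info = env.step(env.action_space.sample())
    if np.linalg.norm(obs) >= env.delta:
        assert info["opt_action"] in (1, 2, 3, 4)
    else:
        assert info["opt_action"] == 0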