# matrix_sequential_social_dilemma.py
##########
# Contribution by the Center on Long-Term Risk:
# https://github.com/longtermrisk/marltoolbox
# Some parts are originally from:
# https://github.com/alshedivat/lola/tree/master/lola
##########
import logging
from abc import ABC
from collections.abc import Iterable
from typing import Dict, Optional

import numpy as np
from gymnasium.spaces import Discrete
from gymnasium.utils import seeding
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.examples.env.utils.interfaces import InfoAccumulationInterface
from ray.rllib.examples.env.utils.mixins import (
    TwoPlayersTwoActionsInfoMixin,
    NPlayersNDiscreteActionsInfoMixin,
)

logger = logging.getLogger(__name__)


class MatrixSequentialSocialDilemma(InfoAccumulationInterface, MultiAgentEnv, ABC):
    """
    A multi-agent abstract class for two-player matrix games.

    PAYOUT_MATRIX: Numpy array. Along dimension N, the action of the
    Nth player changes. The last dimension is used to select the player
    whose reward you want to know.

    max_steps: number of steps in one episode

    players_ids: list of the RLlib agent ids of the players

    output_additional_info: ask the environment to aggregate information
    about the last episode and output it as info at the end of the
    episode.
    """

    def __init__(self, config: Optional[Dict] = None):
        if config is None:
            config = {}

        assert "reward_randomness" not in config.keys()
        assert self.PAYOUT_MATRIX is not None
        if "players_ids" in config:
            assert (
                isinstance(config["players_ids"], Iterable)
                and len(config["players_ids"]) == self.NUM_AGENTS
            )

        self.players_ids = config.get("players_ids", ["player_row", "player_col"])
        self.player_row_id, self.player_col_id = self.players_ids
        self.max_steps = config.get("max_steps", 20)
        self.output_additional_info = config.get("output_additional_info", True)
        self.step_count_in_current_episode = None

        # To store info about the fraction of each state
        if self.output_additional_info:
            self._init_info()

    def reset(self, *, seed=None, options=None):
        self.np_random, seed = seeding.np_random(seed)
        self.step_count_in_current_episode = 0
        if self.output_additional_info:
            self._reset_info()
        # The initial observation (NUM_STATES - 1) is a dedicated "episode
        # start" state that no joint action can produce.
        return {
            self.player_row_id: self.NUM_STATES - 1,
            self.player_col_id: self.NUM_STATES - 1,
        }, {}

    def step(self, actions: dict):
        """
        :param actions: Dict containing both actions for player_1 and player_2
        :return: observations, rewards, terminated, truncated, info
        """
        self.step_count_in_current_episode += 1
        action_player_row = actions[self.player_row_id]
        action_player_col = actions[self.player_col_id]

        if self.output_additional_info:
            self._accumulate_info(action_player_row, action_player_col)

        observations = self._produce_observations_invariant_to_the_player_trained(
            action_player_row, action_player_col
        )
        rewards = self._get_players_rewards(action_player_row, action_player_col)

        epi_is_done = self.step_count_in_current_episode >= self.max_steps
        if self.step_count_in_current_episode > self.max_steps:
            logger.warning("self.step_count_in_current_episode > self.max_steps")

        info = self._get_info_for_current_epi(epi_is_done)
        return self._to_RLlib_API(observations, rewards, epi_is_done, info)

    def _produce_observations_invariant_to_the_player_trained(
        self, action_player_0: int, action_player_1: int
    ):
        """
        We want to be able to use a policy trained as player 1
        for evaluation as player 2 and vice versa.
        """
        return [
            action_player_0 * self.NUM_ACTIONS + action_player_1,
            action_player_1 * self.NUM_ACTIONS + action_player_0,
        ]
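
    # For example, with NUM_ACTIONS == 2, actions (row=0, col=1) map to the
    # observations [1, 2]: each player sees the joint action encoded with its
    # own action first, which makes the encoding symmetric across seats.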

    def _get_players_rewards(self, action_player_0: int, action_player_1: int):
        return [
            self.PAYOUT_MATRIX[action_player_0][action_player_1][0],
            self.PAYOUT_MATRIX[action_player_0][action_player_1][1],
        ]

    def _to_RLlib_API(
        self, observations: list, rewards: list, epi_is_done: bool, info: dict
    ):
        observations = {
            self.player_row_id: observations[0],
            self.player_col_id: observations[1],
        }
        rewards = {self.player_row_id: rewards[0], self.player_col_id: rewards[1]}

        if info is None:
            info = {}
        else:
            info = {self.player_row_id: info, self.player_col_id: info}

        done = {
            self.player_row_id: epi_is_done,
            self.player_col_id: epi_is_done,
            "__all__": epi_is_done,
        }
        # Gymnasium-style 5-tuple: (obs, rewards, terminateds, truncateds, infos).
        return observations, rewards, done, done, info

    def _get_info_for_current_epi(self, epi_is_done):
        if epi_is_done and self.output_additional_info:
            info_for_current_epi = self._get_episode_info()
        else:
            info_for_current_epi = None
        return info_for_current_epi

    def __str__(self):
        return self.NAME


class IteratedMatchingPennies(
    TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma
):
    """
    A two-agent environment for the Matching Pennies game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+1, -1], [-1, +1]], [[-1, +1], [+1, -1]]])
    NAME = "IMP"


class IteratedPrisonersDilemma(
    TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma
):
    """
    A two-agent environment for the Prisoner's Dilemma game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[-1, -1], [-3, +0]], [[+0, -3], [-2, -2]]])
    NAME = "IPD"


class IteratedAsymPrisonersDilemma(
    TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma
):
    """
    A two-agent environment for the Asymmetric Prisoner's Dilemma game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+0, -1], [-3, +0]], [[+0, -3], [-2, -2]]])
    NAME = "AsymmetricIPD"


class IteratedStagHunt(TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Stag Hunt game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[3, 3], [0, 2]], [[2, 0], [1, 1]]])
    NAME = "IteratedStagHunt"


class IteratedChicken(TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Chicken game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+0, +0], [-1.0, +1.0]], [[+1, -1], [-10, -10]]])
    NAME = "IteratedChicken"


class IteratedAsymChicken(TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Asymmetric Chicken game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+2.0, +0], [-1.0, +1.0]], [[+2.5, -1], [-10, -10]]])
    NAME = "AsymmetricIteratedChicken"


class IteratedBoS(TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the BoS game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array(
        [[[+3.0, +2.0], [+0.0, +0.0]], [[+0.0, +0.0], [+2.0, +3.0]]]
    )
    NAME = "IteratedBoS"


class IteratedAsymBoS(TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Asymmetric BoS game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array(
        [[[+4.0, +1.0], [+0.0, +0.0]], [[+0.0, +0.0], [+2.0, +2.0]]]
    )
    NAME = "AsymmetricIteratedBoS"


def define_greed_fear_matrix_game(greed, fear):
    class GreedFearGame(TwoPlayersTwoActionsInfoMixin, MatrixSequentialSocialDilemma):
        NUM_AGENTS = 2
        NUM_ACTIONS = 2
        NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
        ACTION_SPACE = Discrete(NUM_ACTIONS)
        OBSERVATION_SPACE = Discrete(NUM_STATES)
        R = 3
        P = 1
        T = R + greed
        S = P - fear
        PAYOUT_MATRIX = np.array([[[R, R], [S, T]], [[T, S], [P, P]]])
        NAME = "IteratedGreedFear"

        def __str__(self):
            return f"{self.NAME} with greed={greed} and fear={fear}"

    return GreedFearGame
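

# Usage sketch for the factory above (the greed/fear values are illustrative,
# not from the original module):
#   IteratedGreedFear = define_greed_fear_matrix_game(greed=1.0, fear=1.0)
#   env = IteratedGreedFear({"max_steps": 10})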


class IteratedBoSAndPD(
    NPlayersNDiscreteActionsInfoMixin, MatrixSequentialSocialDilemma
):
    """
    A two-agent environment for the BoS + PD game.
    """

    NUM_AGENTS = 2
    NUM_ACTIONS = 3
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array(
        [
            [[3.5, +1], [+0, +0], [-3, +2]],
            [[+0.0, +0], [+1, +3], [-3, +2]],
            [[+2.0, -3], [+2, -3], [-1, -1]],
        ]
    )
    NAME = "IteratedBoSAndPD"
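

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): roll out one
    # episode of the Iterated Prisoner's Dilemma with uniformly random
    # actions. Assumes the RLlib example mixins imported above are available.
    env = IteratedPrisonersDilemma({"max_steps": 5})
    obs, info = env.reset(seed=0)
    done = {"__all__": False}
    while not done["__all__"]:
        actions = {
            agent_id: env.ACTION_SPACE.sample() for agent_id in env.players_ids
        }
        obs, rewards, done, truncated, info = env.step(actions)
        print(obs, rewards, done["__all__"], info)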