matrix_sequential_social_dilemma.py

##########
# Contribution by the Center on Long-Term Risk:
# https://github.com/longtermrisk/marltoolbox
# Some parts are originally from:
# https://github.com/alshedivat/lola/tree/master/lola
##########
import logging
from abc import ABC
# Iterable lives in collections.abc (it was removed from collections in
# Python 3.10).
from collections.abc import Iterable
from typing import Dict, Optional

import numpy as np
from gym.spaces import Discrete
from gym.utils import seeding
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.examples.env.utils.interfaces import InfoAccumulationInterface
from ray.rllib.examples.env.utils.mixins import \
    TwoPlayersTwoActionsInfoMixin, NPlayersNDiscreteActionsInfoMixin

logger = logging.getLogger(__name__)

class MatrixSequentialSocialDilemma(InfoAccumulationInterface, MultiAgentEnv,
                                    ABC):
    """
    A multi-agent abstract class for two-player matrix games.

    PAYOUT_MATRIX: Numpy array. Along dimension N, the action of the
    Nth player changes. The last dimension is used to select the player
    whose reward you want to know.

    max_steps: number of steps in one episode.

    players_ids: list of the RLlib agent ids of the players.

    output_additional_info: ask the environment to aggregate information
    about the last episode and output it as info at the end of the
    episode.
    """

    def __init__(self, config: Optional[Dict] = None):
        if config is None:
            config = {}

        assert "reward_randomness" not in config.keys()
        assert self.PAYOUT_MATRIX is not None
        if "players_ids" in config:
            assert isinstance(config["players_ids"], Iterable) and len(
                config["players_ids"]) == self.NUM_AGENTS

        self.players_ids = config.get("players_ids",
                                      ["player_row", "player_col"])
        self.player_row_id, self.player_col_id = self.players_ids
        self.max_steps = config.get("max_steps", 20)
        self.output_additional_info = config.get("output_additional_info",
                                                 True)
        self.step_count_in_current_episode = None

        # To store info about the fraction of each state visited
        if self.output_additional_info:
            self._init_info()

    def seed(self, seed=None):
        """Seed the PRNG of this space."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.step_count_in_current_episode = 0
        if self.output_additional_info:
            self._reset_info()
        return {
            self.player_row_id: self.NUM_STATES - 1,
            self.player_col_id: self.NUM_STATES - 1
        }
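
    # Both players start in the extra state NUM_STATES - 1, which encodes
    # "no joint action played yet"; every subsequent observation encodes the
    # previous joint action (see
    # _produce_observations_invariant_to_the_player_trained below).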
    def step(self, actions: dict):
        """
        :param actions: Dict mapping each player's agent id to its action.
        :return: observations, rewards, done, info
        """
        self.step_count_in_current_episode += 1
        action_player_row = actions[self.player_row_id]
        action_player_col = actions[self.player_col_id]

        if self.output_additional_info:
            self._accumulate_info(action_player_row, action_player_col)

        observations = \
            self._produce_observations_invariant_to_the_player_trained(
                action_player_row, action_player_col)
        rewards = self._get_players_rewards(action_player_row,
                                            action_player_col)
        epi_is_done = self.step_count_in_current_episode >= self.max_steps
        if self.step_count_in_current_episode > self.max_steps:
            logger.warning(
                "self.step_count_in_current_episode > self.max_steps: "
                "step() was called after the episode was already done")
        info = self._get_info_for_current_epi(epi_is_done)
        return self._to_RLLib_API(observations, rewards, epi_is_done, info)
    def _produce_observations_invariant_to_the_player_trained(
            self, action_player_0: int, action_player_1: int):
        """
        We want to be able to use a policy trained as player 1
        for evaluation as player 2, and vice versa.
        """
        return [
            action_player_0 * self.NUM_ACTIONS + action_player_1,
            action_player_1 * self.NUM_ACTIONS + action_player_0
        ]
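
    # The previous joint action is encoded as a single discrete state,
    # own_action * NUM_ACTIONS + opponent_action, so each player sees the
    # game from its own "row player" perspective. This is what makes the
    # observations invariant to which player a policy was trained as.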
    def _get_players_rewards(self, action_player_0: int,
                             action_player_1: int):
        return [
            self.PAYOUT_MATRIX[action_player_0][action_player_1][0],
            self.PAYOUT_MATRIX[action_player_0][action_player_1][1]
        ]

    def _to_RLLib_API(self, observations: list, rewards: list,
                      epi_is_done: bool, info: dict):

        observations = {
            self.player_row_id: observations[0],
            self.player_col_id: observations[1]
        }

        rewards = {
            self.player_row_id: rewards[0],
            self.player_col_id: rewards[1]
        }

        if info is None:
            info = {}
        else:
            info = {self.player_row_id: info, self.player_col_id: info}

        done = {
            self.player_row_id: epi_is_done,
            self.player_col_id: epi_is_done,
            "__all__": epi_is_done,
        }

        return observations, rewards, done, info

    def _get_info_for_current_epi(self, epi_is_done):
        if epi_is_done and self.output_additional_info:
            info_for_current_epi = self._get_episode_info()
        else:
            info_for_current_epi = None
        return info_for_current_epi

    def __str__(self):
        return self.NAME
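
# The concrete environments below supply the class attributes the abstract
# class relies on: NUM_AGENTS, NUM_ACTIONS, NUM_STATES, ACTION_SPACE,
# OBSERVATION_SPACE, PAYOUT_MATRIX and NAME.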

class IteratedMatchingPennies(TwoPlayersTwoActionsInfoMixin,
                              MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Matching Pennies game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+1, -1], [-1, +1]], [[-1, +1], [+1, -1]]])
    NAME = "IMP"


class IteratedPrisonersDilemma(TwoPlayersTwoActionsInfoMixin,
                               MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Prisoner's Dilemma game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[-1, -1], [-3, +0]], [[+0, -3], [-2, -2]]])
    NAME = "IPD"
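
# In IteratedPrisonersDilemma, action 0 is "cooperate" and action 1 is
# "defect": mutual cooperation yields (-1, -1), mutual defection (-2, -2),
# and defecting against a cooperator yields 0 versus -3. The asymmetric
# variant below differs only in the row player's payoff for mutual
# cooperation (0 instead of -1).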

class IteratedAsymPrisonersDilemma(TwoPlayersTwoActionsInfoMixin,
                                   MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Asymmetric Prisoner's Dilemma game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+0, -1], [-3, +0]], [[+0, -3], [-2, -2]]])
    NAME = "IPD"

class IteratedStagHunt(TwoPlayersTwoActionsInfoMixin,
                       MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Stag Hunt game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[3, 3], [0, 2]], [[2, 0], [1, 1]]])
    NAME = "IteratedStagHunt"


class IteratedChicken(TwoPlayersTwoActionsInfoMixin,
                      MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Chicken game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+0, +0], [-1., +1.]], [[+1, -1], [-10, -10]]])
    NAME = "IteratedChicken"


class IteratedAsymChicken(TwoPlayersTwoActionsInfoMixin,
                          MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Asymmetric Chicken game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+2.0, +0], [-1., +1.]],
                              [[+2.5, -1], [-10, -10]]])
    NAME = "AsymmetricIteratedChicken"


class IteratedBoS(TwoPlayersTwoActionsInfoMixin,
                  MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the BoS game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+3.0, +2.0], [+0.0, +0.0]],
                              [[+0.0, +0.0], [+2.0, +3.0]]])
    NAME = "IteratedBoS"

class IteratedAsymBoS(TwoPlayersTwoActionsInfoMixin,
                      MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the Asymmetric BoS game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[+4.0, +1.0], [+0.0, +0.0]],
                              [[+0.0, +0.0], [+2.0, +2.0]]])
    NAME = "AsymmetricIteratedBoS"

def define_greed_fear_matrix_game(greed, fear):
    class GreedFearGame(TwoPlayersTwoActionsInfoMixin,
                        MatrixSequentialSocialDilemma):
        NUM_AGENTS = 2
        NUM_ACTIONS = 2
        NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
        ACTION_SPACE = Discrete(NUM_ACTIONS)
        OBSERVATION_SPACE = Discrete(NUM_STATES)
        R = 3
        P = 1
        T = R + greed
        S = P - fear
        PAYOUT_MATRIX = np.array([[[R, R], [S, T]], [[T, S], [P, P]]])
        NAME = "IteratedGreedFear"

        def __str__(self):
            return f"{self.NAME} with greed={greed} and fear={fear}"

    return GreedFearGame
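
# Usage sketch for the factory above (illustrative only, nothing is executed
# at import time):
#     GreedFearEnvCls = define_greed_fear_matrix_game(greed=1.0, fear=1.0)
#     env = GreedFearEnvCls({"max_steps": 20})
# With greed = fear = 0 the temptation and sucker payoffs collapse onto the
# base values (T = R and S = P).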

class IteratedBoSAndPD(NPlayersNDiscreteActionsInfoMixin,
                       MatrixSequentialSocialDilemma):
    """
    A two-agent environment for the BoS + PD game.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 3
    NUM_STATES = NUM_ACTIONS**NUM_AGENTS + 1
    ACTION_SPACE = Discrete(NUM_ACTIONS)
    OBSERVATION_SPACE = Discrete(NUM_STATES)
    PAYOUT_MATRIX = np.array([[[3.5, +1], [+0, +0], [-3, +2]],
                              [[+0., +0], [+1, +3], [-3, +2]],
                              [[+2., -3], [+2, -3], [-1, -1]]])
    NAME = "IteratedBoSAndPD"
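

# Minimal usage sketch: roll out one short episode of the iterated Prisoner's
# Dilemma with uniformly random actions (assumes the mixin/interface imports
# at the top of this file resolve in your Ray installation).
if __name__ == "__main__":
    env = IteratedPrisonersDilemma({"max_steps": 5})
    obs = env.reset()
    done = {"__all__": False}
    while not done["__all__"]:
        # Sample a random action for each agent id ("player_row" and
        # "player_col" by default).
        actions = {
            player_id: env.ACTION_SPACE.sample()
            for player_id in env.players_ids
        }
        obs, rewards, done, info = env.step(actions)
        print(obs, rewards, done, info)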