test_parameter_noise.py

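"""Tests ParameterNoise exploration for DDPG and DQN.

Verifies that weight noise is sampled at episode start, applied to the
model weights only while exploring, and removed again at episode end.
"""
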
import numpy as np
import unittest

import ray
import ray.rllib.algorithms.ddpg as ddpg
import ray.rllib.algorithms.dqn as dqn
from ray.rllib.utils.test_utils import check, framework_iterator


class TestParameterNoise(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        ray.init()

    @classmethod
    def tearDownClass(cls) -> None:
        ray.shutdown()

    def test_ddpg_parameter_noise(self):
        self.do_test_parameter_noise_exploration(
            ddpg.DDPG,
            # Switch on complete_episodes mode b/c we are using ParameterNoise.
            core_config=ddpg.DDPGConfig().rollouts(batch_mode="complete_episodes"),
            env="Pendulum-v1",
            env_config={},
            obs=np.array([1.0, 0.0, -1.0]),
        )

    def test_dqn_parameter_noise(self):
        self.do_test_parameter_noise_exploration(
            dqn.DQN,
            # Switch on complete_episodes mode b/c we are using ParameterNoise.
            core_config=dqn.DQNConfig().rollouts(batch_mode="complete_episodes"),
            env="FrozenLake-v1",
            env_config={"is_slippery": False, "map_name": "4x4"},
            obs=np.array(0),
        )

    def do_test_parameter_noise_exploration(
        self, algo_cls, *, core_config, env, env_config, obs
    ):
        """Tests whether an algorithm works with ParameterNoise exploration."""
        core_config.rollouts(num_rollout_workers=0)  # Run locally.
        core_config.environment(env, env_config=env_config)

        for fw in framework_iterator(core_config):
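            # framework_iterator runs the loop body once per supported
            # framework (e.g. tf, tf2, torch), setting the config's
            # framework each time.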
            config = core_config.copy()

            # Algo with ParameterNoise exploration (config["explore"]=True).
            # ----
            config.exploration(
                exploration_config={"type": "ParameterNoise"}, explore=True
            )

            algo = config.build()
            policy = algo.get_policy()
            pol_sess = policy.get_session()
            # Remove noise that has been added during policy initialization
            # (exploration.postprocess_trajectory does add noise to measure
            # the delta).
            policy.exploration._remove_noise(tf_sess=pol_sess)

            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_before = self._get_current_noise(policy, fw)
            check(noise_before, 0.0)
            initial_weights = self._get_current_weight(policy, fw)

            # Pseudo-start an episode and compare the weights before and after.
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_after_ep_start = self._get_current_noise(policy, fw)
            weights_after_ep_start = self._get_current_weight(policy, fw)
            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            check(noise_after_ep_start, noise_before)
            check(initial_weights, weights_after_ep_start)

            # Setting explore=False should always return the same action.
            a_ = algo.compute_single_action(obs, explore=False)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise = self._get_current_noise(policy, fw)
            # We sampled the first noise (not zero anymore).
            check(noise, 0.0, false=True)
            # But still not applied b/c explore=False.
            check(self._get_current_weight(policy, fw), initial_weights)
            for _ in range(10):
                a = algo.compute_single_action(obs, explore=False)
                check(a, a_)
                # Noise never gets applied.
                check(self._get_current_weight(policy, fw), initial_weights)
                self.assertFalse(policy.exploration.weights_are_currently_noisy)

            # Explore=None (default: True) should return different actions.
            # However, the variety among samples comes from the underlying
            # sub-exploration (e.g. epsilon-greedy for DQN), since the weight
            # noise stays fixed within an episode.
            actions = []
            current_weight = None
            for _ in range(10):
                actions.append(algo.compute_single_action(obs))
                self.assertTrue(policy.exploration.weights_are_currently_noisy)
                # Now, noise actually got applied (explore=True).
                current_weight = self._get_current_weight(policy, fw)
                check(current_weight, initial_weights, false=True)
                check(current_weight, initial_weights + noise)
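            # With noisy weights and the sub-exploration active, the ten
            # sampled actions should not all be identical.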
            check(np.std(actions), 0.0, false=True)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones.
            policy.exploration.on_episode_end(policy, tf_sess=pol_sess)
            weights_after_ep_end = self._get_current_weight(policy, fw)
            check(current_weight - noise, weights_after_ep_end, decimals=5)
            algo.stop()

            # Algo with ParameterNoise exploration (config["explore"]=False).
            # ----
            config = core_config.copy()
            config.exploration(
                exploration_config={"type": "ParameterNoise"}, explore=False
            )

            algo = config.build()
            policy = algo.get_policy()
            pol_sess = policy.get_session()
            # Remove noise that has been added during policy initialization
            # (exploration.postprocess_trajectory does add noise to measure
            # the delta).
            policy.exploration._remove_noise(tf_sess=pol_sess)

            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            initial_weights = self._get_current_weight(policy, fw)

            # Noise before anything (should be 0.0, no episode started yet).
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)

            # Pseudo-start an episode and compare the weights before and after
            # (they should be the same).
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)
            noisy_weights = self._get_current_weight(policy, fw)
            check(initial_weights, noisy_weights)

            # Setting explore=False or None should always return the same
            # action.
            a_ = algo.compute_single_action(obs, explore=False)
            # Now we have re-sampled.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0, false=True)
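            # The re-sampled noise is never applied to the weights, though
            # (explore defaults to False here), so repeated actions must stay
            # identical.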
            for _ in range(5):
                a = algo.compute_single_action(obs, explore=None)
                check(a, a_)
                a = algo.compute_single_action(obs, explore=False)
                check(a, a_)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones (no noise permanently
            # applied throughout the episode).
            policy.exploration.on_episode_end(policy, tf_sess=pol_sess)
            weights_after_episode_end = self._get_current_weight(policy, fw)
            check(initial_weights, weights_after_episode_end)
            # Noise should still be the same (re-sampling only happens at
            # beginning of episode).
            noise_after = self._get_current_noise(policy, fw)
            check(noise, noise_after)
            algo.stop()

            # Switch off underlying exploration entirely.
            # ----
            config = core_config.copy()
            if algo_cls is dqn.DQN:
                sub_config = {
                    "type": "EpsilonGreedy",
                    "initial_epsilon": 0.0,  # <- no randomness whatsoever
                    "final_epsilon": 0.0,
                }
            else:
                sub_config = {
                    "type": "OrnsteinUhlenbeckNoise",
                    "initial_scale": 0.0,  # <- no randomness whatsoever
                    "final_scale": 0.0,
                    "random_timesteps": 0,
                }
            config.exploration(
                exploration_config={
                    "type": "ParameterNoise",
                    "sub_exploration": sub_config,
                },
                explore=True,
            )

            algo = config.build()
            # Now, when we act - even with explore=True - we would expect
            # the same action for the same input (parameter noise is
            # deterministic).
            policy = algo.get_policy()
            # Use this (new) algo's session; the previous one was stopped.
            policy.exploration.on_episode_start(policy, tf_sess=policy.get_session())
            a_ = algo.compute_single_action(obs)
            for _ in range(10):
                a = algo.compute_single_action(obs, explore=True)
                check(a, a_)
            algo.stop()

    def _get_current_noise(self, policy, fw):
        # If noise not even created yet, return 0.0.
        if policy.exploration.noise is None:
            return 0.0

        noise = policy.exploration.noise[0][0][0]
        if fw == "tf":
            noise = policy.get_session().run(noise)
        elif fw == "torch":
            noise = noise.detach().cpu().numpy()
        else:
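            # tf2 (eager): the noise tensor can be converted directly.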
            noise = noise.numpy()
        return noise

    def _get_current_weight(self, policy, fw):
        weights = policy.get_weights()
        if fw == "torch":
            # DQN model.
            if "_hidden_layers.0._model.0.weight" in weights:
                return weights["_hidden_layers.0._model.0.weight"][0][0]
            # DDPG model.
            else:
                return weights["policy_model.action_0._model.0.weight"][0][0]
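        # tf2 returns the weights as a list, tf as a dict keyed by variable
        # name; either way, take the first entry's top-left weight.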
        key = 0 if fw == "tf2" else list(weights.keys())[0]
        return weights[key][0][0]


if __name__ == "__main__":
    import pytest
    import sys

    sys.exit(pytest.main(["-v", __file__]))