123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- import gym
- import numpy as np
- import unittest
- import ray
- from ray.rllib.agents.dqn import DQNTrainer
- from ray.rllib.utils.test_utils import framework_iterator
- from ray.tune.registry import register_env
- class TestReproducibility(unittest.TestCase):
- def test_reproducing_trajectory(self):
- class PickLargest(gym.Env):
- def __init__(self):
- self.observation_space = gym.spaces.Box(
- low=float("-inf"), high=float("inf"), shape=(4, ))
- self.action_space = gym.spaces.Discrete(4)
- def reset(self, **kwargs):
- self.obs = np.random.randn(4)
- return self.obs
- def step(self, action):
- reward = self.obs[action]
- return self.obs, reward, True, {}
- def env_creator(env_config):
- return PickLargest()
- for fw in framework_iterator(frameworks=("tf", "torch")):
- trajs = list()
- for trial in range(3):
- ray.init()
- register_env("PickLargest", env_creator)
- config = {
- "seed": 666 if trial in [0, 1] else 999,
- "min_time_s_per_reporting": 0,
- "timesteps_per_iteration": 100,
- "framework": fw,
- }
- agent = DQNTrainer(config=config, env="PickLargest")
- trajectory = list()
- for _ in range(8):
- r = agent.train()
- trajectory.append(r["episode_reward_max"])
- trajectory.append(r["episode_reward_min"])
- trajs.append(trajectory)
- ray.shutdown()
- # trial0 and trial1 use same seed and thus
- # expect identical trajectories.
- all_same = True
- for v0, v1 in zip(trajs[0], trajs[1]):
- if v0 != v1:
- all_same = False
- self.assertTrue(all_same)
- # trial1 and trial2 use different seeds and thus
- # most rewards tend to be different.
- diff_cnt = 0
- for v1, v2 in zip(trajs[1], trajs[2]):
- if v1 != v2:
- diff_cnt += 1
- self.assertTrue(diff_cnt > 8)
if __name__ == "__main__":
    # Run this file's tests verbosely under pytest and propagate its
    # exit status to the calling process.
    import pytest
    import sys

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
|