import gymnasium as gym
import numpy as np
import unittest

import ray
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.utils.test_utils import framework_iterator
from ray.tune.registry import register_env


class TestReproducibility(unittest.TestCase):
    def test_reproducing_trajectory(self):
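        # A trivial one-step env: reset() samples a random 4-vector
        # observation, and the reward for an action is the value at that
        # index, so rewards are fully determined by the sampled observations.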
        class PickLargest(gym.Env):
            def __init__(self):
                self.observation_space = gym.spaces.Box(
                    low=float("-inf"), high=float("inf"), shape=(4,)
                )
                self.action_space = gym.spaces.Discrete(4)

            def reset(self, *, seed=None, options=None):
                self.obs = np.random.randn(4)
                return self.obs, {}

            def step(self, action):
                reward = self.obs[action]
                return self.obs, reward, True, False, {}

        def env_creator(env_config):
            return PickLargest()
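
        # Repeat the whole experiment once per framework backend.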
        for fw in framework_iterator(frameworks=("tf", "torch")):
            trajs = list()
            for trial in range(3):
                ray.init()
                register_env("PickLargest", env_creator)
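                # Trials 0 and 1 share a seed; trial 2 gets a different one.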
                config = (
                    DQNConfig()
                    .environment("PickLargest")
                    .debugging(seed=666 if trial in [0, 1] else 999)
                    .reporting(
                        min_time_s_per_iteration=0,
                        min_sample_timesteps_per_iteration=100,
                    )
                    .framework(fw)
                )
                algo = config.build()
                trajectory = list()
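                # Fingerprint each trial by the max/min episode rewards
                # over eight training iterations.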
                for _ in range(8):
                    r = algo.train()
                    trajectory.append(r["episode_reward_max"])
                    trajectory.append(r["episode_reward_min"])
                trajs.append(trajectory)
                algo.stop()
                ray.shutdown()

            # Trials 0 and 1 use the same seed and should thus
            # produce identical trajectories.
            all_same = True
            for v0, v1 in zip(trajs[0], trajs[1]):
                if v0 != v1:
                    all_same = False
            self.assertTrue(all_same)

            # Trials 1 and 2 use different seeds, so most of their
            # rewards should differ.
            diff_cnt = 0
            for v1, v2 in zip(trajs[1], trajs[2]):
                if v1 != v2:
                    diff_cnt += 1
            self.assertTrue(diff_cnt > 8)


if __name__ == "__main__":
    import pytest
    import sys

    sys.exit(pytest.main(["-v", __file__]))