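"""Tests for RLlib's GPU-related config settings (`num_gpus`,
`num_gpus_per_worker`, `_fake_gpus`) in Ray's non-local and local modes."""
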
import unittest

import ray
from ray import tune
from ray.rllib.agents.pg import PGTrainer, DEFAULT_CONFIG
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.test_utils import framework_iterator

torch, _ = try_import_torch()
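# NOTE: `try_import_torch()` does not raise if torch is missing; it returns
# None placeholders instead. These tests assume torch is installed, since
# they query `torch.cuda.device_count()` below.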


class TestGPUs(unittest.TestCase):
    def test_gpus_in_non_local_mode(self):
        # Non-local mode.
        ray.init(num_cpus=8)

        actual_gpus = torch.cuda.device_count()
        print(f"Actual GPUs found (by torch): {actual_gpus}")

        config = DEFAULT_CONFIG.copy()
        config["num_workers"] = 2
        config["env"] = "CartPole-v0"
        # Expect errors when running a config with num_gpus > 0 on a machine
        # without enough GPUs and with _fake_gpus=False.
        for num_gpus in [0, 0.1, 1, actual_gpus + 4]:
            # Only allow feasible num_gpus_per_worker values (so the test
            # does not block infinitely waiting for an unschedulable worker).
            per_worker = [0] if actual_gpus == 0 or actual_gpus < num_gpus \
                else [0, 0.5, 1]
            for num_gpus_per_worker in per_worker:
                for fake_gpus in [False] + ([] if num_gpus == 0 else [True]):
                    config["num_gpus"] = num_gpus
                    config["num_gpus_per_worker"] = num_gpus_per_worker
                    config["_fake_gpus"] = fake_gpus
                    print(f"\n------------\nnum_gpus={num_gpus} "
                          f"num_gpus_per_worker={num_gpus_per_worker} "
                          f"_fake_gpus={fake_gpus}")

                    # Skip tf2 (eager) for the multi-GPU cases.
                    frameworks = ("tf", "torch") if num_gpus > 1 else \
                        ("tf2", "tf", "torch")
                    for _ in framework_iterator(config, frameworks=frameworks):
                        # Expect trainer creation to raise a num_gpus-related
                        # error if there are not enough actual GPUs (driver
                        # plus 2 workers) and we are not faking them.
                        if actual_gpus < num_gpus + 2 * num_gpus_per_worker \
                                and not fake_gpus:
                            # "Direct" RLlib (create the Trainer on the
                            # driver). Cannot run through ray.tune.run() here
                            # as it would simply wait infinitely for the
                            # resources to become available.
                            print("direct RLlib")
                            self.assertRaisesRegex(
                                RuntimeError,
                                "Found 0 GPUs on your machine",
                                lambda: PGTrainer(config, env="CartPole-v0"),
                            )
                        # If actual_gpus >= num_gpus (or GPUs are faked),
                        # expect no error.
                        else:
                            print("direct RLlib")
                            trainer = PGTrainer(config, env="CartPole-v0")
                            trainer.stop()
                            # Cannot run through ray.tune.run() with fake GPUs
                            # as it would simply wait infinitely for the
                            # resources to become available (even though we
                            # wouldn't really need them).
                            if num_gpus == 0:
                                print("via ray.tune.run()")
                                tune.run(
                                    "PG",
                                    config=config,
                                    stop={"training_iteration": 0})

        ray.shutdown()

    def test_gpus_in_local_mode(self):
        # Local mode.
        ray.init(num_gpus=8, local_mode=True)

        actual_gpus_available = torch.cuda.device_count()

        config = DEFAULT_CONFIG.copy()
        config["num_workers"] = 2
        config["env"] = "CartPole-v0"

        # Expect no errors in local mode: everything runs in the driver
        # process, so resource requests are not actually enforced.
        for num_gpus in [0, 0.1, 1, actual_gpus_available + 4]:
            print(f"num_gpus={num_gpus}")
            for fake_gpus in [False, True]:
                print(f"_fake_gpus={fake_gpus}")
                config["num_gpus"] = num_gpus
                config["_fake_gpus"] = fake_gpus
                # Skip tf2 (eager) for the multi-GPU cases.
                frameworks = ("tf", "torch") if num_gpus > 1 else \
                    ("tf2", "tf", "torch")
                for _ in framework_iterator(config, frameworks=frameworks):
                    print("direct RLlib")
                    trainer = PGTrainer(config, env="CartPole-v0")
                    trainer.stop()
                    print("via ray.tune.run()")
                    tune.run(
                        "PG", config=config, stop={"training_iteration": 0})
        ray.shutdown()


if __name__ == "__main__":
    import pytest
    import sys

    sys.exit(pytest.main(["-v", __file__]))
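
# To run a single test case (assuming this file is saved as `test_gpus.py`;
# the actual file name is not given here):
#   pytest -v test_gpus.py::TestGPUs::test_gpus_in_non_local_mode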