jobs_check_cuda_available.py

  1. """Job Submission CUDA available test
  2. Checks that GPU resources are available in the job submission
  3. driver script.
  4. This file is a driver script to be submitted to a Ray cluster via
  5. the Ray Jobs API. This is done by specifying `type: job` in
  6. `release_tests.yaml` (as opposed to, say, `type: sdk_command`).
  7. Release test for https://github.com/ray-project/ray/issues/24455
  8. Test owner: architkulkarni
  9. """
import ray
import torch

from ray._private.test_utils import wait_for_condition

ray.init()

# Assert that GPU resources are available in the driver script.
assert torch.cuda.is_available(), "CUDA is not available in the driver script"


# For good measure, let's also check that we can use the GPU
# in a remote function.
@ray.remote(num_gpus=0.1)
def f():
    return ray.get_gpu_ids()


assert ray.get(f.remote()) == [0]

# Also check that non-GPU tasks can be scheduled across all nodes.
NUM_NODES = 2


@ray.remote(num_cpus=1, scheduling_strategy="SPREAD")
def get_node_id():
    return ray.get_runtime_context().get_node_id()


node_ids = set(ray.get([get_node_id.remote() for _ in range(100)]))


def check_num_nodes_and_spawn_tasks():
    node_ids.update(ray.get([get_node_id.remote() for _ in range(10)]))
    return len(node_ids) >= NUM_NODES


wait_for_condition(check_num_nodes_and_spawn_tasks)
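
# Note: in the release test pipeline this script is submitted automatically,
# but a minimal manual submission via the Ray Jobs API would look roughly
# like the sketch below (the dashboard address and runtime_env are
# placeholders, not taken from the test configuration):
#
#   from ray.job_submission import JobSubmissionClient
#
#   client = JobSubmissionClient("http://127.0.0.1:8265")
#   client.submit_job(
#       entrypoint="python jobs_check_cuda_available.py",
#       runtime_env={"pip": ["torch"]},
#   )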