# jobs_remote_multi_node.py

  1. """Job submission remote multinode test
  2. This tests that Ray Job submission works when submitting jobs
  3. to a remote cluster with multiple nodes.
  4. This file is a driver script to be submitted to a Ray cluster via
  5. the Ray Jobs API. This is done by specifying `type: job` in
  6. `release_tests.yaml` (as opposed to, say, `type: sdk_command`).
  7. Test owner: architkulkarni
  8. """
  9. import ray
  10. from ray._private.test_utils import wait_for_condition
  11. ray.init()
  12. NUM_NODES = 5
  13. # Spawn tasks that use get_runtime_context() to get their node_id
  14. # and wait until all of the nodes are seen.
  15. @ray.remote(num_cpus=1)
  16. def get_node_id():
  17. return ray.get_runtime_context().get_node_id()
  18. # Allow one fewer node in case a node fails to come up.
  19. num_expected_nodes = NUM_NODES - 1
  20. node_ids = set(ray.get([get_node_id.remote() for _ in range(100)]))
  21. def check_num_nodes_and_spawn_tasks():
  22. node_ids.update(ray.get([get_node_id.remote() for _ in range(10)]))
  23. return len(node_ids) >= num_expected_nodes
  24. wait_for_condition(check_num_nodes_and_spawn_tasks)