train_gpu_connect.py

  1. """Small cluster training
  2. This training run will start 4 workers on 4 nodes (including head node).
  3. Test owner: krfricke
  4. Acceptance criteria: Should run through and report final results.
  5. """
import json
import os
import time

import ray

if __name__ == "__main__":
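    # xgboost_ray reads RXGB_PLACEMENT_GROUP_TIMEOUT_S as the number of
    # seconds to wait for its placement group to be scheduled; 1200 s
    # leaves room for slow GPU node startup.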
    os.environ["RXGB_PLACEMENT_GROUP_TIMEOUT_S"] = "1200"

    # Passing in runtime_env to ray.init() will also set it for all the
    # workers.
    runtime_env = {
        "env_vars": {
            "RXGB_PLACEMENT_GROUP_TIMEOUT_S": "1200",
        },
        "working_dir": os.path.dirname(__file__),
    }

    ray.init(address="auto", runtime_env=runtime_env)

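    # These imports are deferred until after ray.init() so that the remote
    # task and training actors defined below can resolve release_test_util
    # from the working_dir shipped in the runtime environment.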
    from xgboost_ray import RayParams
    from release_test_util import train_ray, get_parquet_files

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=1,
    )

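    # The data set lives on the cluster's storage, so the file listing has
    # to run as a Ray task rather than on the machine launching the script.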
    @ray.remote
    def ray_get_parquet_files():
        return get_parquet_files(
            path="/data/classification.parquet",
            num_files=25,
        )

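    # Train a binary classifier (regression=False) for 100 boosting rounds
    # across 4 GPU workers and record the wall-clock time.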
    start = time.time()
    train_ray(
        path=ray.get(ray_get_parquet_files.remote()),
        num_workers=4,
        num_boost_rounds=100,
        regression=False,
        use_gpu=True,
        ray_params=ray_params,
        xgboost_params=None,
    )
    taken = time.time() - start

    result = {
        "time_taken": taken,
    }

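    # The release test harness picks the result up from the file named in
    # TEST_OUTPUT_JSON, falling back to a path under /tmp.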
    test_output_json = os.environ.get(
        "TEST_OUTPUT_JSON", "/tmp/train_gpu_connect.json"
    )
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("PASSED.")