# test_xgboost_sweep.py
  1. """Large-scale XGBoost parameter sweep
  2. In this run, we will start 32 trials of 32 actors each running distributed
  3. XGBoost training. This test is more about making sure that the run succeeds
  4. than about total runtime. However, it is expected that this is faster than
  5. 1 hour.
  6. We fix the max_depth to 4 and the number of boosting rounds to 100. The
  7. fastest observed training time for 32 actors (1 CPU each) was about 2000
  8. seconds. We allow up to 10 minutes of slack, so aim for 2600 seconds total
  9. tuning time.
  10. Cluster: cluster_16x64_data.yaml
  11. Test owner: krfricke
  12. Acceptance criteria: Should run faster than 2600 seconds. Should run without
  13. errors.
  14. """
  15. from collections import Counter
  16. import json
  17. import os
  18. import time
  19. import ray
  20. from ray import tune
  21. from xgboost_ray import train, RayParams, RayDMatrix
  22. def xgboost_train(config, ray_params, num_boost_round=200):
  23. train_set = RayDMatrix(os.path.expanduser("/data/train.parquet"), "labels")
  24. test_set = RayDMatrix(os.path.expanduser("/data/test.parquet"), "labels")
  25. evals_result = {}
  26. bst = train(
  27. params=config,
  28. dtrain=train_set,
  29. evals=[(test_set, "eval")],
  30. evals_result=evals_result,
  31. ray_params=ray_params,
  32. verbose_eval=False,
  33. num_boost_round=num_boost_round,
  34. )
  35. model_path = "tuned.xgb"
  36. bst.save_model(model_path)
  37. print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1]))
  38. def main():
  39. name = "large xgboost sweep"
  40. ray.init(address="auto")
  41. num_samples = 31 # So that we fit on 1024 CPUs with 1 head bundle
  42. num_actors_per_sample = 32
  43. max_runtime = 3500
  44. config = {
  45. "tree_method": "approx",
  46. "objective": "binary:logistic",
  47. "eval_metric": ["logloss", "error"],
  48. "eta": tune.loguniform(1e-4, 1e-1),
  49. "subsample": tune.uniform(0.5, 1.0),
  50. "max_depth": 4,
  51. }
  52. ray_params = RayParams(
  53. max_actor_restarts=1,
  54. gpus_per_actor=0,
  55. cpus_per_actor=1,
  56. num_actors=num_actors_per_sample,
  57. )
  58. start_time = time.monotonic()
  59. analysis = tune.run(
  60. tune.with_parameters(xgboost_train, ray_params=ray_params, num_boost_round=100),
  61. config=config,
  62. num_samples=num_samples,
  63. resources_per_trial=ray_params.get_tune_resources(),
  64. )
  65. time_taken = time.monotonic() - start_time
  66. result = {
  67. "time_taken": time_taken,
  68. "trial_states": dict(Counter([trial.status for trial in analysis.trials])),
  69. "last_update": time.time(),
  70. }
  71. test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/tune_test.json")
  72. with open(test_output_json, "wt") as f:
  73. json.dump(result, f)
  74. if time_taken > max_runtime:
  75. print(
  76. f"The {name} test took {time_taken:.2f} seconds, but should not "
  77. f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n"
  78. f"--- FAILED: {name.upper()} ::: "
  79. f"{time_taken:.2f} > {max_runtime:.2f} ---"
  80. )
  81. else:
  82. print(
  83. f"The {name} test took {time_taken:.2f} seconds, which "
  84. f"is below the budget of {max_runtime:.2f} seconds. "
  85. f"Test successful. \n\n"
  86. f"--- PASSED: {name.upper()} ::: "
  87. f"{time_taken:.2f} <= {max_runtime:.2f} ---"
  88. )
  89. if __name__ == "__main__":
  90. main()