tune_tests.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. from typing import Optional
  2. from ray_release.config import Test
  3. from ray_release.result import Result
  4. def handle_result(
  5. test: Test,
  6. result: Result,
  7. ) -> Optional[str]:
  8. test_name = test["name"]
  9. msg = ""
  10. success = result.status == "finished"
  11. time_taken = result.results.get("time_taken", float("inf"))
  12. num_terminated = result.results.get("trial_states", {}).get("TERMINATED", 0)
  13. was_smoke_test = result.results.get("smoke_test", False)
  14. if not success:
  15. if result.status == "timeout":
  16. msg += "Test timed out."
  17. else:
  18. msg += "Test script failed. "
  19. if test_name == "tune_scalability_long_running_large_checkpoints":
  20. last_update_diff = result.results.get("last_update_diff", float("inf"))
  21. target_update_diff = 360
  22. if last_update_diff > target_update_diff:
  23. return (
  24. f"Last update to results json was too long ago "
  25. f"({last_update_diff:.2f} > {target_update_diff})"
  26. )
  27. return None
  28. elif test_name == "tune_scalability_bookkeeping_overhead":
  29. target_terminated = 10000
  30. target_time = 800
  31. elif test_name == "tune_scalability_durable_trainable":
  32. target_terminated = 16
  33. target_time = 650
  34. elif test_name == "tune_scalability_network_overhead":
  35. target_terminated = 100 if not was_smoke_test else 20
  36. target_time = 900 if not was_smoke_test else 400
  37. elif test_name == "tune_scalability_result_throughput_cluster":
  38. target_terminated = 1000
  39. target_time = 130
  40. elif test_name == "tune_scalability_result_throughput_single_node":
  41. target_terminated = 96
  42. target_time = 120
  43. elif test_name == "tune_scalability_xgboost_sweep":
  44. target_terminated = 31
  45. target_time = 3600
  46. else:
  47. return None
  48. if num_terminated < target_terminated:
  49. msg += (
  50. f"Some trials failed "
  51. f"(num_terminated={num_terminated} < {target_terminated}). "
  52. )
  53. if time_taken > target_time:
  54. msg += (
  55. f"Took too long to complete "
  56. f"(time_taken={time_taken:.2f} > {target_time}). "
  57. )
  58. return msg or None