tune_tests.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. from typing import Optional
  2. from ray_release.test import Test
  3. from ray_release.result import (
  4. Result,
  5. ResultStatus,
  6. )
  7. def handle_result(
  8. test: Test,
  9. result: Result,
  10. ) -> Optional[str]:
  11. test_name = test["name"]
  12. msg = ""
  13. success = result.status == ResultStatus.SUCCESS.value
  14. time_taken = result.results.get("time_taken", float("inf"))
  15. num_terminated = result.results.get("trial_states", {}).get("TERMINATED", 0)
  16. was_smoke_test = result.results.get("smoke_test", False)
  17. if not success:
  18. if result.status == "timeout":
  19. msg += "Test timed out."
  20. else:
  21. msg += "Test script failed. "
  22. if test_name == "tune_scalability_long_running_large_checkpoints":
  23. last_update_diff = result.results.get("last_update_diff", float("inf"))
  24. target_update_diff = 360
  25. if last_update_diff > target_update_diff:
  26. return (
  27. f"Last update to results json was too long ago "
  28. f"({last_update_diff:.2f} > {target_update_diff})"
  29. )
  30. return None
  31. elif test_name == "tune_scalability_bookkeeping_overhead":
  32. target_terminated = 10000
  33. target_time = 800
  34. elif test_name == "tune_scalability_durable_trainable":
  35. target_terminated = 16
  36. target_time = 650
  37. elif test_name == "tune_scalability_network_overhead":
  38. target_terminated = 100 if not was_smoke_test else 20
  39. target_time = 900 if not was_smoke_test else 400
  40. elif test_name == "tune_scalability_result_throughput_cluster":
  41. target_terminated = 1000
  42. target_time = 130
  43. elif test_name == "tune_scalability_result_throughput_single_node":
  44. target_terminated = 96
  45. target_time = 120
  46. elif test_name == "tune_scalability_xgboost_sweep":
  47. target_terminated = 31
  48. target_time = 3600
  49. else:
  50. return None
  51. if num_terminated < target_terminated:
  52. msg += (
  53. f"Some trials failed "
  54. f"(num_terminated={num_terminated} < {target_terminated}). "
  55. )
  56. if time_taken > target_time:
  57. msg += (
  58. f"Took too long to complete "
  59. f"(time_taken={time_taken:.2f} > {target_time}). "
  60. )
  61. return msg or None