ft_small_non_elastic.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. """Fault tolerance test (small cluster, non-elastic training)
  2. In this run, two training actors will die after some time. It is expected that
  3. in both cases lightgbm_ray stops training, restarts the dead actors, and
  4. continues training with all four actors.
  5. Test owner: Yard1 (primary), krfricke
  6. Acceptance criteria: Should run through and report final results. Intermediate
  7. output should show that training halts when an actor dies and continues only
  8. when all four actors are available again. The test will fail if fault
  9. tolerance did not work correctly.
  10. Notes: This test seems to be somewhat flaky. This might be due to
  11. race conditions in handling dead actors. This is likely a problem of
  12. the lightgbm_ray implementation and not of this test.
  13. """
  14. import os
  15. import ray
  16. from lightgbm_ray import RayParams
  17. from release_test_util import (
  18. train_ray,
  19. FailureState,
  20. FailureInjection,
  21. TrackingCallback,
  22. )
  23. if __name__ == "__main__":
  24. ray.init(address="auto", runtime_env={"working_dir": os.path.dirname(__file__)})
  25. failure_state = FailureState.remote()
  26. ray_params = RayParams(
  27. max_actor_restarts=2, num_actors=4, cpus_per_actor=4, gpus_per_actor=0
  28. )
  29. _, additional_results, _ = train_ray(
  30. path="/data/classification.parquet",
  31. num_workers=None,
  32. num_boost_rounds=100,
  33. num_files=200,
  34. regression=False,
  35. use_gpu=False,
  36. ray_params=ray_params,
  37. lightgbm_params=None,
  38. callbacks=[
  39. TrackingCallback(),
  40. FailureInjection(
  41. id="first_fail", state=failure_state, ranks=[1], iteration=14
  42. ),
  43. FailureInjection(
  44. id="second_fail", state=failure_state, ranks=[0], iteration=34
  45. ),
  46. ],
  47. )
  48. print("PASSED.")