test_cluster_manager.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824
  1. import os
  2. import sys
  3. import time
  4. import unittest
  5. from typing import Callable
  6. from unittest.mock import patch
  7. from freezegun import freeze_time
  8. from ray_release.exception import (
  9. ClusterCreationError,
  10. ClusterStartupError,
  11. ClusterStartupTimeout,
  12. ClusterStartupFailed,
  13. ClusterEnvBuildError,
  14. ClusterEnvBuildTimeout,
  15. ClusterComputeCreateError,
  16. ClusterEnvCreateError,
  17. )
  18. from ray_release.cluster_manager.full import FullClusterManager
  19. from ray_release.cluster_manager.minimal import MinimalClusterManager
  20. from ray_release.tests.utils import (
  21. UNIT_TEST_PROJECT_ID,
  22. UNIT_TEST_CLOUD_ID,
  23. APIDict,
  24. fail_always,
  25. fail_once,
  26. MockSDK,
  27. )
  28. from ray_release.util import get_anyscale_sdk
  29. from ray_release.test import Test
  30. TEST_CLUSTER_COMPUTE = {
  31. "cloud_id": UNIT_TEST_CLOUD_ID,
  32. "region": "us-west-2",
  33. "max_workers": 0,
  34. "head_node_type": {"name": "head_node", "instance_type": "m5.4xlarge"},
  35. "worker_node_types": [
  36. {
  37. "name": "worker_node",
  38. "instance_type": "m5.xlarge",
  39. "min_workers": 0,
  40. "max_workers": 0,
  41. "use_spot": False,
  42. }
  43. ],
  44. }
  45. def _fail(*args, **kwargs):
  46. raise RuntimeError()
  47. class MockTest(Test):
  48. def get_anyscale_byod_image(self) -> str:
  49. return "anyscale"
  50. class _DelayedResponse:
  51. def __init__(
  52. self,
  53. callback: Callable[[], None],
  54. finish_after: float,
  55. before: APIDict,
  56. after: APIDict,
  57. ):
  58. self.callback = callback
  59. self.finish_after = time.monotonic() + finish_after
  60. self.before = before
  61. self.after = after
  62. def __call__(self, *args, **kwargs):
  63. self.callback()
  64. if time.monotonic() > self.finish_after:
  65. return self.after
  66. else:
  67. return self.before
  68. class MinimalSessionManagerTest(unittest.TestCase):
  69. cls = MinimalClusterManager
  70. def setUp(self) -> None:
  71. self.sdk = MockSDK()
  72. self.sdk.returns["get_project"] = APIDict(
  73. result=APIDict(name="release_unit_tests")
  74. )
  75. self.cluster_compute = TEST_CLUSTER_COMPUTE
  76. self.cluster_manager = self.cls(
  77. project_id=UNIT_TEST_PROJECT_ID,
  78. sdk=self.sdk,
  79. test=MockTest(
  80. {
  81. "name": f"unit_test__{self.__class__.__name__}",
  82. "cluster": {"byod": {}},
  83. }
  84. ),
  85. )
  86. self.sdk.reset()
  87. self.sdk.returns["get_cloud"] = APIDict(result=APIDict(provider="AWS"))
  88. def testClusterName(self):
  89. sdk = MockSDK()
  90. sdk.returns["get_project"] = APIDict(result=APIDict(name="release_unit_tests"))
  91. sdk.returns["get_cloud"] = APIDict(result=APIDict(provider="AWS"))
  92. cluster_manager = self.cls(
  93. test=MockTest({"name": "test"}),
  94. project_id=UNIT_TEST_PROJECT_ID,
  95. smoke_test=False,
  96. sdk=sdk,
  97. )
  98. self.assertRegex(cluster_manager.cluster_name, r"^test_\d+$")
  99. cluster_manager = self.cls(
  100. test=MockTest({"name": "test"}),
  101. project_id=UNIT_TEST_PROJECT_ID,
  102. smoke_test=True,
  103. sdk=sdk,
  104. )
  105. self.assertRegex(cluster_manager.cluster_name, r"^test-smoke-test_\d+$")
  106. def testSetClusterEnv(self):
  107. sdk = MockSDK()
  108. sdk.returns["get_project"] = APIDict(result=APIDict(name="release_unit_tests"))
  109. sdk.returns["get_cloud"] = APIDict(result=APIDict(provider="AWS"))
  110. cluster_manager = self.cls(
  111. test=MockTest({"name": "test", "cluster": {"byod": {}}}),
  112. project_id=UNIT_TEST_PROJECT_ID,
  113. smoke_test=False,
  114. sdk=sdk,
  115. )
  116. cluster_manager.set_cluster_env()
  117. self.assertEqual(
  118. cluster_manager.cluster_env_name,
  119. "anyscale__env__"
  120. "a93b7dec6c1b606a9814ceb96ace13e116d04cc8ce3a2bdea1b0f279c34ff692",
  121. )
  122. @patch("time.sleep", lambda *a, **kw: None)
  123. def testFindCreateClusterComputeExisting(self):
  124. # Find existing compute and succeed
  125. self.cluster_manager.set_cluster_compute(self.cluster_compute)
  126. self.assertTrue(self.cluster_manager.cluster_compute_name)
  127. self.assertFalse(self.cluster_manager.cluster_compute_id)
  128. self.sdk.returns["search_cluster_computes"] = APIDict(
  129. metadata=APIDict(
  130. next_paging_token=None,
  131. ),
  132. results=[
  133. APIDict(
  134. name="no_match",
  135. id="wrong",
  136. ),
  137. APIDict(name=self.cluster_manager.cluster_compute_name, id="correct"),
  138. ],
  139. )
  140. self.cluster_manager.create_cluster_compute()
  141. self.assertEqual(self.cluster_manager.cluster_compute_id, "correct")
  142. self.assertEqual(self.sdk.call_counter["search_cluster_computes"], 1)
  143. self.assertEqual(len(self.sdk.call_counter), 2) # 1 extra for cloud provider
  144. @patch("time.sleep", lambda *a, **kw: None)
  145. def testFindCreateClusterComputeCreateFailFail(self):
  146. # No existing compute, create new, but fail both times
  147. self.cluster_manager.set_cluster_compute(self.cluster_compute)
  148. self.assertTrue(self.cluster_manager.cluster_compute_name)
  149. self.assertFalse(self.cluster_manager.cluster_compute_id)
  150. self.sdk.returns["search_cluster_computes"] = APIDict(
  151. metadata=APIDict(
  152. next_paging_token=None,
  153. ),
  154. results=[
  155. APIDict(
  156. name="no_match",
  157. id="wrong",
  158. ),
  159. ],
  160. )
  161. self.sdk.returns["create_cluster_compute"] = fail_always
  162. with self.assertRaises(ClusterComputeCreateError):
  163. self.cluster_manager.create_cluster_compute()
  164. # No cluster ID found or created
  165. self.assertFalse(self.cluster_manager.cluster_compute_id)
  166. # Both APIs were called twice (retry after fail)
  167. self.assertEqual(self.sdk.call_counter["search_cluster_computes"], 2)
  168. self.assertEqual(self.sdk.call_counter["create_cluster_compute"], 2)
  169. self.assertEqual(len(self.sdk.call_counter), 3) # 1 extra for cloud provider
  170. @patch("time.sleep", lambda *a, **kw: None)
  171. def testFindCreateClusterComputeCreateFailSucceed(self):
  172. # No existing compute, create new, fail once, succeed afterwards
  173. self.cluster_manager.set_cluster_compute(self.cluster_compute)
  174. self.assertTrue(self.cluster_manager.cluster_compute_name)
  175. self.assertFalse(self.cluster_manager.cluster_compute_id)
  176. self.sdk.returns["search_cluster_computes"] = APIDict(
  177. metadata=APIDict(
  178. next_paging_token=None,
  179. ),
  180. results=[
  181. APIDict(
  182. name="no_match",
  183. id="wrong",
  184. ),
  185. ],
  186. )
  187. self.sdk.returns["create_cluster_compute"] = fail_once(
  188. result=APIDict(
  189. result=APIDict(
  190. id="correct",
  191. )
  192. )
  193. )
  194. self.cluster_manager.create_cluster_compute()
  195. # Both APIs were called twice (retry after fail)
  196. self.assertEqual(self.cluster_manager.cluster_compute_id, "correct")
  197. self.assertEqual(self.sdk.call_counter["search_cluster_computes"], 2)
  198. self.assertEqual(self.sdk.call_counter["create_cluster_compute"], 2)
  199. self.assertEqual(len(self.sdk.call_counter), 3) # 1 extra for cloud provider
  200. @patch("time.sleep", lambda *a, **kw: None)
  201. def testFindCreateClusterComputeCreateSucceed(self):
  202. # No existing compute, create new, and succeed
  203. self.cluster_manager.set_cluster_compute(self.cluster_compute)
  204. self.assertTrue(self.cluster_manager.cluster_compute_name)
  205. self.assertFalse(self.cluster_manager.cluster_compute_id)
  206. self.sdk.returns["search_cluster_computes"] = APIDict(
  207. metadata=APIDict(
  208. next_paging_token=None,
  209. ),
  210. results=[
  211. APIDict(
  212. name="no_match",
  213. id="wrong",
  214. ),
  215. ],
  216. )
  217. self.sdk.returns["create_cluster_compute"] = APIDict(
  218. result=APIDict(
  219. id="correct",
  220. )
  221. )
  222. self.cluster_manager.create_cluster_compute()
  223. # Both APIs were called twice (retry after fail)
  224. self.assertEqual(self.cluster_manager.cluster_compute_id, "correct")
  225. self.assertEqual(self.sdk.call_counter["search_cluster_computes"], 1)
  226. self.assertEqual(self.sdk.call_counter["create_cluster_compute"], 1)
  227. self.assertEqual(len(self.sdk.call_counter), 3) # 1 extra for cloud provider
  228. # Test automatic fields
  229. self.assertEqual(
  230. self.cluster_manager.cluster_compute["idle_termination_minutes"],
  231. self.cluster_manager.autosuspend_minutes,
  232. )
  233. self.assertEqual(
  234. self.cluster_manager.cluster_compute["maximum_uptime_minutes"],
  235. self.cluster_manager.maximum_uptime_minutes,
  236. )
  237. def testClusterComputeExtraTags(self):
  238. self.cluster_manager.set_cluster_compute(self.cluster_compute)
  239. # No extra tags specified
  240. self.assertEqual(self.cluster_manager.cluster_compute, self.cluster_compute)
  241. # Extra tags specified
  242. self.cluster_manager.set_cluster_compute(
  243. self.cluster_compute, extra_tags={"foo": "bar"}
  244. )
  245. # All ResourceTypes as in
  246. # ray_release.aws.RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING
  247. target_cluster_compute = TEST_CLUSTER_COMPUTE.copy()
  248. target_cluster_compute["aws"] = {
  249. "TagSpecifications": [
  250. {"ResourceType": "instance", "Tags": [{"Key": "foo", "Value": "bar"}]},
  251. {"ResourceType": "volume", "Tags": [{"Key": "foo", "Value": "bar"}]},
  252. ]
  253. }
  254. self.assertEqual(
  255. self.cluster_manager.cluster_compute["aws"], target_cluster_compute["aws"]
  256. )
  257. # Test merging with already existing tags
  258. cluster_compute_with_tags = TEST_CLUSTER_COMPUTE.copy()
  259. cluster_compute_with_tags["aws"] = {
  260. "TagSpecifications": [
  261. {"ResourceType": "fake", "Tags": []},
  262. {"ResourceType": "instance", "Tags": [{"Key": "key", "Value": "val"}]},
  263. ]
  264. }
  265. self.cluster_manager.set_cluster_compute(
  266. cluster_compute_with_tags, extra_tags={"foo": "bar"}
  267. )
  268. # All ResourceTypes as in RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING
  269. target_cluster_compute = TEST_CLUSTER_COMPUTE.copy()
  270. target_cluster_compute["aws"] = {
  271. "TagSpecifications": [
  272. {"ResourceType": "fake", "Tags": []},
  273. {
  274. "ResourceType": "instance",
  275. "Tags": [
  276. {"Key": "key", "Value": "val"},
  277. {"Key": "foo", "Value": "bar"},
  278. ],
  279. },
  280. {"ResourceType": "volume", "Tags": [{"Key": "foo", "Value": "bar"}]},
  281. ]
  282. }
  283. self.assertEqual(
  284. self.cluster_manager.cluster_compute["aws"], target_cluster_compute["aws"]
  285. )
  286. @patch("time.sleep", lambda *a, **kw: None)
  287. def testFindCreateClusterEnvExisting(self):
  288. # Find existing env and succeed
  289. self.cluster_manager.set_cluster_env()
  290. self.assertTrue(self.cluster_manager.cluster_env_name)
  291. self.assertFalse(self.cluster_manager.cluster_env_id)
  292. self.sdk.returns["search_cluster_environments"] = APIDict(
  293. metadata=APIDict(
  294. next_paging_token=None,
  295. ),
  296. results=[
  297. APIDict(
  298. name="no_match",
  299. id="wrong",
  300. ),
  301. APIDict(name=self.cluster_manager.cluster_env_name, id="correct"),
  302. ],
  303. )
  304. self.cluster_manager.create_cluster_env()
  305. self.assertEqual(self.cluster_manager.cluster_env_id, "correct")
  306. self.assertEqual(self.sdk.call_counter["search_cluster_environments"], 1)
  307. self.assertEqual(len(self.sdk.call_counter), 1)
  308. @patch("time.sleep", lambda *a, **kw: None)
  309. def testFindCreateClusterEnvFailFail(self):
  310. # No existing compute, create new, but fail both times
  311. self.cluster_manager.set_cluster_env()
  312. self.assertTrue(self.cluster_manager.cluster_env_name)
  313. self.assertFalse(self.cluster_manager.cluster_env_id)
  314. self.sdk.returns["search_cluster_environments"] = APIDict(
  315. metadata=APIDict(
  316. next_paging_token=None,
  317. ),
  318. results=[
  319. APIDict(
  320. name="no_match",
  321. id="wrong",
  322. ),
  323. ],
  324. )
  325. self.sdk.returns["create_byod_cluster_environment"] = fail_always
  326. with self.assertRaises(ClusterEnvCreateError):
  327. self.cluster_manager.create_cluster_env()
  328. # No cluster ID found or created
  329. self.assertFalse(self.cluster_manager.cluster_env_id)
  330. # Both APIs were called twice (retry after fail)
  331. self.assertEqual(self.sdk.call_counter["search_cluster_environments"], 2)
  332. self.assertEqual(self.sdk.call_counter["create_byod_cluster_environment"], 2)
  333. self.assertEqual(len(self.sdk.call_counter), 2)
  334. @patch("time.sleep", lambda *a, **kw: None)
  335. def testFindCreateClusterEnvFailSucceed(self):
  336. # No existing compute, create new, fail once, succeed afterwards
  337. self.cluster_manager.set_cluster_env()
  338. self.assertTrue(self.cluster_manager.cluster_env_name)
  339. self.assertFalse(self.cluster_manager.cluster_env_id)
  340. self.cluster_manager.cluster_env_id = None
  341. self.sdk.reset()
  342. self.sdk.returns["search_cluster_environments"] = APIDict(
  343. metadata=APIDict(
  344. next_paging_token=None,
  345. ),
  346. results=[
  347. APIDict(
  348. name="no_match",
  349. id="wrong",
  350. ),
  351. ],
  352. )
  353. self.sdk.returns["create_byod_cluster_environment"] = fail_once(
  354. result=APIDict(
  355. result=APIDict(
  356. id="correct",
  357. )
  358. )
  359. )
  360. self.cluster_manager.create_cluster_env()
  361. # Both APIs were called twice (retry after fail)
  362. self.assertEqual(self.cluster_manager.cluster_env_id, "correct")
  363. self.assertEqual(self.sdk.call_counter["search_cluster_environments"], 2)
  364. self.assertEqual(self.sdk.call_counter["create_byod_cluster_environment"], 2)
  365. self.assertEqual(len(self.sdk.call_counter), 2)
  366. @patch("time.sleep", lambda *a, **kw: None)
  367. def testFindCreateClusterEnvSucceed(self):
  368. # No existing compute, create new, and succeed
  369. self.cluster_manager.set_cluster_env()
  370. self.assertTrue(self.cluster_manager.cluster_env_name)
  371. self.assertFalse(self.cluster_manager.cluster_env_id)
  372. self.sdk.returns["search_cluster_environments"] = APIDict(
  373. metadata=APIDict(
  374. next_paging_token=None,
  375. ),
  376. results=[
  377. APIDict(
  378. name="no_match",
  379. id="wrong",
  380. ),
  381. ],
  382. )
  383. self.sdk.returns["create_byod_cluster_environment"] = APIDict(
  384. result=APIDict(
  385. id="correct",
  386. )
  387. )
  388. self.cluster_manager.create_cluster_env()
  389. # Both APIs were called twice (retry after fail)
  390. self.assertEqual(self.cluster_manager.cluster_env_id, "correct")
  391. self.assertEqual(self.sdk.call_counter["search_cluster_environments"], 1)
  392. self.assertEqual(self.sdk.call_counter["create_byod_cluster_environment"], 1)
  393. self.assertEqual(len(self.sdk.call_counter), 2)
  394. @patch("time.sleep", lambda *a, **kw: None)
  395. def testBuildClusterEnvNotFound(self):
  396. self.cluster_manager.set_cluster_env()
  397. self.cluster_manager.cluster_env_id = "correct"
  398. # Environment build not found
  399. self.sdk.returns["list_cluster_environment_builds"] = APIDict(results=[])
  400. with self.assertRaisesRegex(ClusterEnvBuildError, "No build found"):
  401. self.cluster_manager.build_cluster_env(timeout=600)
  402. @patch("time.sleep", lambda *a, **kw: None)
  403. def testBuildClusterEnvPreBuildFailed(self):
  404. """Pre-build fails, but is kicked off again."""
  405. self.cluster_manager.set_cluster_env()
  406. self.cluster_manager.cluster_env_id = "correct"
  407. # Build failed on first lookup
  408. self.cluster_manager.cluster_env_build_id = None
  409. self.sdk.reset()
  410. self.sdk.returns["list_cluster_environment_builds"] = APIDict(
  411. results=[
  412. APIDict(
  413. id="build_failed",
  414. status="failed",
  415. created_at=0,
  416. error_message=None,
  417. config_json={},
  418. )
  419. ]
  420. )
  421. self.sdk.returns["create_cluster_environment_build"] = APIDict(
  422. result=APIDict(id="new_build_id")
  423. )
  424. self.sdk.returns["get_build"] = APIDict(
  425. result=APIDict(
  426. id="build_now_succeeded",
  427. status="failed",
  428. created_at=0,
  429. error_message=None,
  430. config_json={},
  431. )
  432. )
  433. with self.assertRaisesRegex(ClusterEnvBuildError, "Cluster env build failed"):
  434. self.cluster_manager.build_cluster_env(timeout=600)
  435. self.assertFalse(self.cluster_manager.cluster_env_build_id)
  436. self.assertEqual(self.sdk.call_counter["list_cluster_environment_builds"], 1)
  437. self.assertEqual(self.sdk.call_counter["create_cluster_environment_build"], 1)
  438. self.assertEqual(len(self.sdk.call_counter), 3)
  439. @patch("time.sleep", lambda *a, **kw: None)
  440. def testBuildClusterEnvPreBuildSucceeded(self):
  441. self.cluster_manager.set_cluster_env()
  442. self.cluster_manager.cluster_env_id = "correct"
  443. # (Second) build succeeded
  444. self.cluster_manager.cluster_env_build_id = None
  445. self.sdk.reset()
  446. self.sdk.returns["list_cluster_environment_builds"] = APIDict(
  447. results=[
  448. APIDict(
  449. id="build_failed",
  450. status="failed",
  451. created_at=0,
  452. error_message=None,
  453. config_json={},
  454. ),
  455. APIDict(
  456. id="build_succeeded",
  457. status="succeeded",
  458. created_at=1,
  459. error_message=None,
  460. config_json={},
  461. ),
  462. ]
  463. )
  464. self.cluster_manager.build_cluster_env(timeout=600)
  465. self.assertTrue(self.cluster_manager.cluster_env_build_id)
  466. self.assertEqual(self.cluster_manager.cluster_env_build_id, "build_succeeded")
  467. self.assertEqual(self.sdk.call_counter["list_cluster_environment_builds"], 1)
  468. self.assertEqual(len(self.sdk.call_counter), 1)
  469. @patch("time.sleep", lambda *a, **kw: None)
  470. def testBuildClusterEnvSelectLastBuild(self):
  471. self.cluster_manager.set_cluster_env()
  472. self.cluster_manager.cluster_env_id = "correct"
  473. # (Second) build succeeded
  474. self.cluster_manager.cluster_env_build_id = None
  475. self.sdk.reset()
  476. self.sdk.returns["list_cluster_environment_builds"] = APIDict(
  477. results=[
  478. APIDict(
  479. id="build_succeeded",
  480. status="succeeded",
  481. created_at=0,
  482. error_message=None,
  483. config_json={},
  484. ),
  485. APIDict(
  486. id="build_succeeded_2",
  487. status="succeeded",
  488. created_at=1,
  489. error_message=None,
  490. config_json={},
  491. ),
  492. ]
  493. )
  494. self.cluster_manager.build_cluster_env(timeout=600)
  495. self.assertTrue(self.cluster_manager.cluster_env_build_id)
  496. self.assertEqual(self.cluster_manager.cluster_env_build_id, "build_succeeded_2")
  497. self.assertEqual(self.sdk.call_counter["list_cluster_environment_builds"], 1)
  498. self.assertEqual(len(self.sdk.call_counter), 1)
  499. @patch("time.sleep", lambda *a, **kw: None)
  500. def testBuildClusterBuildFails(self):
  501. self.cluster_manager.set_cluster_env()
  502. self.cluster_manager.cluster_env_id = "correct"
  503. # Build, but fails after 300 seconds
  504. self.cluster_manager.cluster_env_build_id = None
  505. self.sdk.reset()
  506. self.sdk.returns["list_cluster_environment_builds"] = APIDict(
  507. results=[
  508. APIDict(
  509. id="build_failed",
  510. status="failed",
  511. created_at=0,
  512. error_message=None,
  513. config_json={},
  514. ),
  515. APIDict(
  516. id="build_succeeded",
  517. status="pending",
  518. created_at=1,
  519. error_message=None,
  520. config_json={},
  521. ),
  522. ]
  523. )
  524. with freeze_time() as frozen_time, self.assertRaisesRegex(
  525. ClusterEnvBuildError, "Cluster env build failed"
  526. ):
  527. self.sdk.returns["get_build"] = _DelayedResponse(
  528. lambda: frozen_time.tick(delta=10),
  529. finish_after=300,
  530. before=APIDict(
  531. result=APIDict(
  532. status="in_progress", error_message=None, config_json={}
  533. )
  534. ),
  535. after=APIDict(
  536. result=APIDict(status="failed", error_message=None, config_json={})
  537. ),
  538. )
  539. self.cluster_manager.build_cluster_env(timeout=600)
  540. self.assertFalse(self.cluster_manager.cluster_env_build_id)
  541. self.assertEqual(self.sdk.call_counter["list_cluster_environment_builds"], 1)
  542. self.assertGreaterEqual(self.sdk.call_counter["get_build"], 9)
  543. self.assertEqual(len(self.sdk.call_counter), 2)
  544. @patch("time.sleep", lambda *a, **kw: None)
  545. def testBuildClusterEnvBuildTimeout(self):
  546. self.cluster_manager.set_cluster_env()
  547. self.cluster_manager.cluster_env_id = "correct"
  548. # Build, but timeout after 100 seconds
  549. self.cluster_manager.cluster_env_build_id = None
  550. self.sdk.reset()
  551. self.sdk.returns["list_cluster_environment_builds"] = APIDict(
  552. results=[
  553. APIDict(
  554. id="build_failed",
  555. status="failed",
  556. created_at=0,
  557. error_message=None,
  558. config_json={},
  559. ),
  560. APIDict(
  561. id="build_succeeded",
  562. status="pending",
  563. created_at=1,
  564. error_message=None,
  565. config_json={},
  566. ),
  567. ]
  568. )
  569. with freeze_time() as frozen_time, self.assertRaisesRegex(
  570. ClusterEnvBuildTimeout, "Time out when building cluster env"
  571. ):
  572. self.sdk.returns["get_build"] = _DelayedResponse(
  573. lambda: frozen_time.tick(delta=10),
  574. finish_after=300,
  575. before=APIDict(
  576. result=APIDict(
  577. status="in_progress", error_message=None, config_json={}
  578. )
  579. ),
  580. after=APIDict(
  581. result=APIDict(
  582. status="succeeded", error_message=None, config_json={}
  583. )
  584. ),
  585. )
  586. self.cluster_manager.build_cluster_env(timeout=100)
  587. self.assertFalse(self.cluster_manager.cluster_env_build_id)
  588. self.assertEqual(self.sdk.call_counter["list_cluster_environment_builds"], 1)
  589. self.assertGreaterEqual(self.sdk.call_counter["get_build"], 9)
  590. self.assertEqual(len(self.sdk.call_counter), 2)
  591. @patch("time.sleep", lambda *a, **kw: None)
  592. def testBuildClusterBuildSucceed(self):
  593. self.cluster_manager.set_cluster_env()
  594. self.cluster_manager.cluster_env_id = "correct"
  595. # Build, succeed after 300 seconds
  596. self.cluster_manager.cluster_env_build_id = None
  597. self.sdk.reset()
  598. self.sdk.returns["list_cluster_environment_builds"] = APIDict(
  599. results=[
  600. APIDict(
  601. id="build_failed",
  602. status="failed",
  603. created_at=0,
  604. error_message=None,
  605. config_json={},
  606. ),
  607. APIDict(
  608. id="build_succeeded",
  609. status="pending",
  610. created_at=1,
  611. error_message=None,
  612. config_json={},
  613. ),
  614. ]
  615. )
  616. with freeze_time() as frozen_time:
  617. self.sdk.returns["get_build"] = _DelayedResponse(
  618. lambda: frozen_time.tick(delta=10),
  619. finish_after=300,
  620. before=APIDict(
  621. result=APIDict(
  622. status="in_progress", error_message=None, config_json={}
  623. )
  624. ),
  625. after=APIDict(
  626. result=APIDict(
  627. status="succeeded", error_message=None, config_json={}
  628. )
  629. ),
  630. )
  631. self.cluster_manager.build_cluster_env(timeout=600)
  632. self.assertTrue(self.cluster_manager.cluster_env_build_id)
  633. self.assertEqual(self.sdk.call_counter["list_cluster_environment_builds"], 1)
  634. self.assertGreaterEqual(self.sdk.call_counter["get_build"], 9)
  635. self.assertEqual(len(self.sdk.call_counter), 2)
  636. class FullSessionManagerTest(MinimalSessionManagerTest):
  637. cls = FullClusterManager
  638. def testSessionStartCreationError(self):
  639. self.cluster_manager.cluster_env_id = "correct"
  640. self.cluster_manager.cluster_compute_id = "correct"
  641. self.sdk.returns["create_cluster"] = _fail
  642. with self.assertRaises(ClusterCreationError):
  643. self.cluster_manager.start_cluster()
  644. def testSessionStartStartupError(self):
  645. self.cluster_manager.cluster_env_id = "correct"
  646. self.cluster_manager.cluster_compute_id = "correct"
  647. self.sdk.returns["create_cluster"] = APIDict(result=APIDict(id="success"))
  648. self.sdk.returns["start_cluster"] = _fail
  649. with self.assertRaises(ClusterStartupError):
  650. self.cluster_manager.start_cluster()
  651. @patch("time.sleep", lambda *a, **kw: None)
  652. def testSessionStartStartupTimeout(self):
  653. self.cluster_manager.cluster_env_id = "correct"
  654. self.cluster_manager.cluster_compute_id = "correct"
  655. self.sdk.returns["create_cluster"] = APIDict(result=APIDict(id="success"))
  656. self.sdk.returns["start_cluster"] = APIDict(
  657. result=APIDict(id="cop_id", completed=False)
  658. )
  659. with freeze_time() as frozen_time, self.assertRaises(ClusterStartupTimeout):
  660. self.sdk.returns["get_cluster_operation"] = _DelayedResponse(
  661. lambda: frozen_time.tick(delta=10),
  662. finish_after=300,
  663. before=APIDict(result=APIDict(completed=False)),
  664. after=APIDict(result=APIDict(completed=True)),
  665. )
  666. # Timeout before startup finishes
  667. self.cluster_manager.start_cluster(timeout=200)
  668. @patch("time.sleep", lambda *a, **kw: None)
  669. def testSessionStartStartupFailed(self):
  670. self.cluster_manager.cluster_env_id = "correct"
  671. self.cluster_manager.cluster_compute_id = "correct"
  672. self.sdk.returns["create_cluster"] = APIDict(result=APIDict(id="success"))
  673. self.sdk.returns["start_cluster"] = APIDict(
  674. result=APIDict(id="cop_id", completed=False)
  675. )
  676. with freeze_time() as frozen_time, self.assertRaises(ClusterStartupFailed):
  677. frozen_time.tick(delta=0.1)
  678. self.sdk.returns["get_cluster_operation"] = _DelayedResponse(
  679. lambda: frozen_time.tick(delta=10),
  680. finish_after=300,
  681. before=APIDict(result=APIDict(completed=False)),
  682. after=APIDict(result=APIDict(completed=True)),
  683. )
  684. self.sdk.returns["get_cluster"] = APIDict(
  685. result=APIDict(state="Terminated")
  686. )
  687. # Timeout is long enough
  688. self.cluster_manager.start_cluster(timeout=400)
  689. @patch("time.sleep", lambda *a, **kw: None)
  690. def testSessionStartStartupSuccess(self):
  691. self.cluster_manager.cluster_env_id = "correct"
  692. self.cluster_manager.cluster_compute_id = "correct"
  693. self.sdk.returns["create_cluster"] = APIDict(result=APIDict(id="success"))
  694. self.sdk.returns["start_cluster"] = APIDict(
  695. result=APIDict(id="cop_id", completed=False)
  696. )
  697. with freeze_time() as frozen_time:
  698. frozen_time.tick(delta=0.1)
  699. self.sdk.returns["get_cluster_operation"] = _DelayedResponse(
  700. lambda: frozen_time.tick(delta=10),
  701. finish_after=300,
  702. before=APIDict(result=APIDict(completed=False)),
  703. after=APIDict(result=APIDict(completed=True)),
  704. )
  705. self.sdk.returns["get_cluster"] = APIDict(result=APIDict(state="Running"))
  706. # Timeout is long enough
  707. self.cluster_manager.start_cluster(timeout=400)
  708. @unittest.skipUnless(
  709. os.environ.get("RELEASE_UNIT_TEST_NO_ANYSCALE", "0") == "1",
  710. reason="RELEASE_UNIT_TEST_NO_ANYSCALE is set to 1",
  711. )
  712. class LiveSessionManagerTest(unittest.TestCase):
  713. def setUp(self) -> None:
  714. self.sdk = get_anyscale_sdk()
  715. self.cluster_compute = TEST_CLUSTER_COMPUTE
  716. self.cluster_manager = FullClusterManager(
  717. project_id=UNIT_TEST_PROJECT_ID,
  718. sdk=self.sdk,
  719. test_name=f"unit_test__{self.__class__.__name__}__endToEnd",
  720. )
  721. def tearDown(self) -> None:
  722. self.cluster_manager.terminate_cluster()
  723. self.cluster_manager.delete_configs()
  724. def testSessionEndToEnd(self):
  725. self.cluster_manager.set_cluster_env(self.cluster_env)
  726. self.cluster_manager.set_cluster_compute(self.cluster_compute)
  727. self.cluster_manager.build_configs(timeout=1200)
  728. # Reset, so that we fetch them again and test that code path
  729. self.cluster_manager.cluster_compute_id = None
  730. self.cluster_manager.cluster_env_id = None
  731. self.cluster_manager.cluster_env_build_id = None
  732. self.cluster_manager.build_configs(timeout=1200)
  733. # Start cluster
  734. self.cluster_manager.start_cluster(timeout=1200)
  735. if __name__ == "__main__":
  736. import pytest
  737. sys.exit(pytest.main(["-v", __file__]))