# test_zero_optimizer.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import deepspeed
from types import SimpleNamespace
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version

from unit.common import DistributedTest, DistributedFixture
from unit.simple_model import *
from unit.checkpoint.common import *

import pytest

class TestZeROCheckpoint(DistributedTest):
    world_size = 2

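    # ZeRO-3 module-only checkpoint round-trip with "pipeline_loading_checkpoint" enabled.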
    @pytest.mark.parametrize('zero_stage', [3])
    def test_pipeline_checkpoint_loading(self, tmpdir, zero_stage):
        config_dict = {
            "train_batch_size": 2,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": zero_stage,
                "pipeline_loading_checkpoint": True,
            }
        }
        hidden_dim = 10

        with deepspeed.zero.Init():
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True)

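    # Checkpoint round-trip that restores optimizer state, across ZeRO stages 1-3 with and
    # without CPU offload of the optimizer.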
    @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, 'Adam'), (2, False, 'Adam'),
                                                                             (2, True, 'deepspeed_adam'),
                                                                             (3, False, 'Adam'),
                                                                             (3, True, 'deepspeed_adam')])
    def test_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
        if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")

        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.00015,
                    "betas": [0.8, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7
                }
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "wall_clock_breakdown": True,
            "zero_optimization": {
                "stage": zero_stage,
                "cpu_offload": use_cpu_offload
            }
        }
        hidden_dim = 10

        if zero_stage == 3:
            with deepspeed.zero.Init():
                models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        else:
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True)

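    # Same matrix of stages/offload as above, but the checkpoint is loaded without
    # restoring optimizer state.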
    @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, "Adam"), (2, False, "Adam"),
                                                                             (2, True, 'deepspeed_adam'),
                                                                             (3, False, 'Adam'),
                                                                             (3, True, 'deepspeed_adam')])
    def test_not_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
        if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")

        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.00015,
                    "betas": [0.8, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7
                }
            },
            "fp16": {
                "enabled": True
            },
            "zero_optimization": {
                "stage": zero_stage,
                "cpu_offload": use_cpu_offload
            }
        }
        hidden_dim = 10

        if zero_stage == 3:
            global DeepSpeedZeroOptimizer_Stage3
            from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
            with deepspeed.zero.Init():
                models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        else:
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False)

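    # Round-trip with a client optimizer (HybridStateOptimizer), permitted via
    # zero_allow_untested_optimizer.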
    @pytest.mark.parametrize('zero_stage', [1, 2])
    def test_hybrid_optimizer_state(self, tmpdir, zero_stage):
        config_dict = {
            "train_micro_batch_size_per_gpu": 2,
            "gradient_accumulation_steps": 2,
            "steps_per_print": 1,
            "zero_optimization": {
                "stage": zero_stage
            },
            "zero_allow_untested_optimizer": True,
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            }
        }
        hidden_dim = 10
        models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)]
        optimizers = [HybridStateOptimizer(model.parameters()) for model in models]

        checkpoint_correctness_verification(config_dict,
                                            models=models,
                                            base_optimizers=optimizers,
                                            hidden_dim=hidden_dim,
                                            tmpdir=tmpdir,
                                            load_optimizer_states=True)

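    # Load only the module weights (no optimizer or LR scheduler state) for ZeRO stages 0-3.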
    @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3])
    def test_load_module_only(self, tmpdir, zero_stage):
        config_dict = {
            "train_batch_size": 2,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": zero_stage,
            }
        }
        hidden_dim = 10

        if zero_stage == 3:
            with deepspeed.zero.Init():
                models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
        else:
            models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True)


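# Fixture that trains briefly and saves a ZeRO-2 checkpoint with a data-parallel world size
# of 4, so the elastic checkpoint tests below can load it under a different world size.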
class ws4_model_checkpoint(DistributedFixture):
    world_size = 4

    def run(self, class_tmpdir, elastic_save, load_optim):
        ds_config = {
            "train_batch_size": 4,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": 2,
                "elastic_checkpoint": elastic_save
            }
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim)

        model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

        if load_optim:
            torch.save(model.optimizer.optimizer.state_dict(), os.path.join(class_tmpdir, 'opt-state-dict'))
        model.save_checkpoint(class_tmpdir)


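# Elastic checkpointing tests: save/load round-trips with a matching data-parallel size, and
# the expected failure when optimizer state is loaded under a different data-parallel size.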
@pytest.mark.parametrize("elastic_save", [True, False])
@pytest.mark.parametrize("elastic_load", [True, False])
@pytest.mark.parametrize("load_optim", [True, False])
class TestZeROElasticCheckpoint(DistributedTest):
    world_size = 2

    def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, load_optim):
        ds_config = {
            "train_batch_size": 2,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": 2,
                "elastic_checkpoint": elastic_save
            }
        }
        hidden_dim = 10

        # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to
        # false positive mismatches in checkpoint state comparisons.
        # Newer torch versions store tensor ids as 0, 1, 2, ...
        expected_mismatch_keys = [] if required_torch_version(min_version=1.4) else ['params']
        models = [SimpleModel(hidden_dim) for _ in range(2)]
        model, _, _, _ = deepspeed.initialize(config=ds_config,
                                              model=models[0],
                                              model_parameters=models[0].parameters())
        data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
        if load_optim:
            torch.save(model.optimizer.optimizer.state_dict(), os.path.join(tmpdir, 'opt-state-dict'))
        model.save_checkpoint(tmpdir)

        ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load
        model, _, _, _ = deepspeed.initialize(config=ds_config,
                                              model=models[1],
                                              model_parameters=models[1].parameters())
        model.load_checkpoint(tmpdir, load_optimizer_states=load_optim)

        if load_optim:
            saved_sd = torch.load(os.path.join(tmpdir, 'opt-state-dict'))
            curr_sd = model.optimizer.optimizer.state_dict()
            for curr_param_group, saved_param_group in zip(curr_sd['param_groups'], saved_sd['param_groups']):
                compare_state_dicts(curr_param_group, saved_param_group, expected_mismatch_keys)

        data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    def test_elastic_checkpoint_change_dp(self, ws4_model_checkpoint, class_tmpdir, elastic_save, elastic_load,
                                          load_optim):
        ds_config = {
            "train_batch_size": 4,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": 2,
                "elastic_checkpoint": elastic_load
            }
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim)

        # Load checkpoint with dp world size = 2
        model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters())
        if load_optim:
            with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException):
                model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim)
        else:
            model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim)


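# Edge cases around checkpoint timing: saving immediately after initialization, loading and
# immediately re-saving, and saving before a gradient accumulation cycle has completed.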
class TestZeROSaveLoadEdgeCase(DistributedTest):
    world_size = 2

    @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3])
    def test_immediate_save_load(self, tmpdir, zero_stage):
        config_dict = {
            "train_batch_size": 4,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": zero_stage,
            }
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim)

        ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)
        ds_model.save_checkpoint(tmpdir)
        ds_model.load_checkpoint(tmpdir,
                                 load_optimizer_states=False,
                                 load_lr_scheduler_states=False,
                                 load_module_only=False)

    @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3])
    def test_load_immediate_save(self, tmpdir, zero_stage):
        config_dict = {
            "train_batch_size": 4,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": zero_stage,
            }
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim)

        # 1. pretrain a model and save it
        dtype = torch.half
        ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)
        data_loader = random_dataloader(model=ds_model,
                                        total_samples=1,
                                        hidden_dim=hidden_dim,
                                        device=ds_model.device,
                                        dtype=dtype)
        for _, batch in enumerate(data_loader):
            loss = ds_model(batch[0], batch[1])
            ds_model.backward(loss)
            ds_model.step()

        ds_model.empty_partition_cache()
        ds_model.save_checkpoint(tmpdir)

        # 2. load and immediately save a model with a fresh ds engine
        ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)
        ds_model.load_checkpoint(tmpdir,
                                 load_optimizer_states=False,
                                 load_lr_scheduler_states=False,
                                 load_module_only=False)
        ds_model.save_checkpoint(tmpdir)

    @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3])
    def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage):
        config_dict = {
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": zero_stage,
                "stage3_gather_fp16_weights_on_model_save": True,
            },
            "gradient_accumulation_steps": 2,
            "train_micro_batch_size_per_gpu": 1,
            "train_batch_size": 4,
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim)

        # This test reproduces a bug triggered by retrieving a 16-bit model before the gradient
        # accumulation cycle has completed: we configure gradient_accumulation_steps=2, step only
        # once, and then call save_16bit_model.
        ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)

        data_loader = random_dataloader(model=ds_model,
                                        total_samples=2,
                                        hidden_dim=hidden_dim,
                                        device=ds_model.device,
                                        dtype=torch.half)

        batch = next(iter(data_loader))
        loss = ds_model(batch[0], batch[1])
        ds_model.backward(loss)
        ds_model.step()

        ds_model.empty_partition_cache()

        # we stepped only once, and now save the 16-bit model before gradient_accumulation_steps=2 is complete
        ds_model.save_16bit_model(tmpdir, "model.pt")

        # verify that saving a regular checkpoint works at this point as well
        ds_model.save_checkpoint(tmpdir)


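# Checkpoint tests for models containing frozen (requires_grad=False) parameters.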
class TestZeROCheckpointFrozenWeights(DistributedTest):
    world_size = 2

    @pytest.mark.parametrize('zero_stage', [1, 2, 3])
    def test_load_optimizer_state(self, tmpdir, zero_stage):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.00015,
                    "betas": [0.8, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7
                }
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "wall_clock_breakdown": True,
            "zero_optimization": {
                "stage": zero_stage
            }
        }
        hidden_dim = 10

        with deepspeed.zero.Init(enabled=zero_stage == 3):
            models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True)

    @pytest.mark.parametrize('zero_stage', [1, 2, 3])
    def test_not_load_optimizer_state(self, tmpdir, zero_stage):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "optimizer": {
                "type": 'Adam',
                "params": {
                    "lr": 0.00015,
                    "betas": [0.8, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7
                }
            },
            "fp16": {
                "enabled": True
            },
            "zero_optimization": {
                "stage": zero_stage
            }
        }
        hidden_dim = 10

        with deepspeed.zero.Init(enabled=zero_stage == 3):
            models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False)

    @pytest.mark.parametrize('zero_stage', [1, 2, 3])
    def test_load_module_only(self, tmpdir, zero_stage):
        config_dict = {
            "train_batch_size": 2,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": zero_stage,
            }
        }
        hidden_dim = 10

        with deepspeed.zero.Init(enabled=zero_stage == 3):
            models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)]

        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True)

    @pytest.mark.parametrize('zero_stage', [1, 2])
    def test_save_exclude_frozen_weights(self, tmpdir, zero_stage):
        world_size = 1
        config_dict = {
            "train_micro_batch_size_per_gpu": 1,
            "optimizer": {
                "type": 'Adam'
            },
            "fp16": {
                "enabled": True,
                "initial_scale_power": 8
            },
            "zero_optimization": {
                "stage": zero_stage,
            }
        }
        hidden_dim = 10

        model = SimpleFrozenModel(hidden_dim, empty_grad=False)

        ds_engine, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)

        # Validate backwards-compatibility of including frozen parameters in checkpoint
        all_ckpt_folder = os.path.join(tmpdir, 'all_params')
        ds_engine.save_checkpoint(all_ckpt_folder)
        all_params_ckpt_file = get_model_ckpt_name_for_rank(os.path.join(all_ckpt_folder, 'global_step0'), '00')
        loaded_all_param_model = torch.load(all_params_ckpt_file)['module']
        all_param_names = set([n for n, p in model.named_parameters()])
        assert set(loaded_all_param_model.keys()) == all_param_names

        # Validate exclusion of frozen parameters
        trainable_ckpt_folder = os.path.join(tmpdir, 'no_frozen_params')
        ds_engine.save_checkpoint(trainable_ckpt_folder, exclude_frozen_parameters=True)

        trainable_ckpt_file = get_model_ckpt_name_for_rank(os.path.join(trainable_ckpt_folder, 'global_step0'), '00')

        # Excluding frozen parameters should reduce checkpoint size
        assert os.path.getsize(all_params_ckpt_file) > os.path.getsize(trainable_ckpt_file)

        loaded_trainable_param_model = torch.load(trainable_ckpt_file)['module']
        frozen_param_names = set([n for n, p in model.named_parameters() if not p.requires_grad])
        loaded_trainable_param_names = set(loaded_trainable_param_model.keys())
        overlap_names = set.intersection(loaded_trainable_param_names, frozen_param_names)
        assert len(overlap_names) == 0

        trainable_param_names = set([n for n, p in model.named_parameters() if p.requires_grad])
        assert loaded_trainable_param_names == trainable_param_names


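# Verify that clone_tensors_for_torch_save produces a state dict equivalent to the original,
# both in memory and after a torch.save/torch.load round-trip.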
class TestSaveTensorClone(DistributedTest):
    world_size = 1

    @pytest.mark.parametrize('zero_stage', [1, 2])
    @pytest.mark.parametrize('use_cpu_device', [True, False])
    def test_save_tensor_clone(self, tmpdir, zero_stage, use_cpu_device):
        ds_config = {
            "optimizer": {
                "type": "AdamW",
            },
            "zero_optimization": {
                "stage": zero_stage
            },
            "train_batch_size": 1,
            "train_micro_batch_size_per_gpu": 1
        }
        hidden_dim = 1024
        model = SimpleModel(hidden_dim, nlayers=4).half()
        ref_model_state_dict = model.state_dict()

        ds_engine, _, _, _ = deepspeed.initialize(model=model, config_params=ds_config)
        clone_device = torch.device('cpu') if use_cpu_device else get_accelerator().current_device()
        clone_state_dict = clone_tensors_for_torch_save(ds_engine.module.state_dict())
        compare_state_dicts(ref_model_state_dict, clone_state_dict)

        ref_ckpt_file = os.path.join(tmpdir, 'ref_ckpt.pt')
        torch.save(ref_model_state_dict, ref_ckpt_file)
        clone_ckpt_file = os.path.join(tmpdir, 'clone_ckpt.pt')
        torch.save(clone_state_dict, clone_ckpt_file)

        compare_state_dicts(torch.load(ref_ckpt_file), torch.load(clone_ckpt_file))


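# Runs without initializing torch.distributed and checks that a failed chmod of the
# zero_to_fp32.py recovery script is logged rather than raised.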
class TestZeRONonDistributed(DistributedTest):
    world_size = 1
    init_distributed = False

    @pytest.mark.parametrize('zero_stage', [1, 2, 3])
    def test_chmod_exception_handling(self, monkeypatch, zero_stage):
        config_dict = {
            "optimizer": {
                "type": "AdamW"
            },
            "train_batch_size": 1,
            "zero_optimization": {
                "stage": zero_stage
            }
        }
        args = SimpleNamespace(local_rank=0)
        net = SimpleModel(hidden_dim=4)
        engine, _, _, _ = deepspeed.initialize(args=args,
                                               config=config_dict,
                                               model=net,
                                               model_parameters=net.parameters())

        log_called = False

        def mock_logger_info(message, *args, **kwargs):
            nonlocal log_called
            log_called = True

        monkeypatch.setattr("deepspeed.utils.logger.info", mock_logger_info)

        # This covers use cases such as Azure Storage File Share, where changing file permissions
        # is not allowed. We use a fake path for this test; a file that does not exist raises a
        # similar error to one that cannot be chmod'ed.
        fake_recovery_script_dst = os.path.join("tmp", "zero_to_fp32.py")
        engine._change_recovery_script_permissions(fake_recovery_script_dst)

        assert log_called, "Expected deepspeed.utils.logger.info to be called."