'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch
from cpuinfo import get_cpu_info
from deepspeed.utils import logger
from deepspeed.utils.logging import should_log_le
from deepspeed.ops.op_builder import CPUAdamBuilder


class DeepSpeedCPUAdam(torch.optim.Optimizer):
    optimizer_id = 0

    def __init__(self,
                 model_params,
                 lr=1e-3,
                 bias_correction=True,
                 betas=(0.9, 0.999),
                 eps=1e-8,
                 weight_decay=0,
                 amsgrad=False,
                 adamw_mode=True,
                 fp32_optimizer_states=True):
        """Fast vectorized implementation of two variations of Adam optimizer on CPU:

        * Adam: A Method for Stochastic Optimization (https://arxiv.org/abs/1412.6980);
        * AdamW: Fixing Weight Decay Regularization in Adam (https://arxiv.org/abs/1711.05101)

        DeepSpeed CPU Adam(W) provides a 5x to 7x speedup over torch.optim.Adam(W).
        To use this optimizer, the model's master parameters (in FP32) must reside
        in CPU memory.

        To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
        the ZeRO-Offload technology, which efficiently offloads the optimizer states into CPU memory
        with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role in
        minimizing the latency overhead of the optimizer on CPU. Please refer to the ZeRO-Offload
        tutorial (https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to
        enable this technology.

        When calling the step function, two options are available: (1) update the optimizer's
        states, or (2) update the optimizer's states and copy the parameters back to the GPU at
        the same time. We have seen that the second option can bring 30% higher throughput than
        doing the copy separately using option one.

        .. note::
                We recommend using our `config
                <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
                to allow :meth:`deepspeed.initialize` to build this optimizer
                for you.

        Arguments:
            model_params (iterable): iterable of parameters to optimize or dicts defining
                parameter groups.
            lr (float, optional): learning rate. (default: 1e-3)
            bias_correction (bool, optional): apply bias correction to the first and second
                moment estimates. (default: True)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square. (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability. (default: 1e-8)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
            amsgrad (boolean, optional): whether to use the AMSGrad variant of this
                algorithm from the paper `On the Convergence of Adam and Beyond`_
                (default: False) NOT SUPPORTED in DeepSpeed CPUAdam!
            adamw_mode: select between Adam and AdamW implementations (default: AdamW)
            fp32_optimizer_states: creates momentum and variance in full precision regardless of
                the precision of the parameters (default: True)
        """
        default_args = dict(lr=lr,
                            betas=betas,
                            eps=eps,
                            weight_decay=weight_decay,
                            bias_correction=bias_correction,
                            amsgrad=amsgrad)
        super(DeepSpeedCPUAdam, self).__init__(model_params, default_args)

        cpu_info = get_cpu_info()
        self.cpu_vendor = (cpu_info["vendor_id_raw"].lower()
                           if "vendor_id_raw" in cpu_info else "unknown")
        if "amd" in self.cpu_vendor:
            # warn once if any FP16 parameter is found on an AMD CPU
            for group_id, group in enumerate(self.param_groups):
                for param_id, p in enumerate(group['params']):
                    if p.dtype == torch.half:
                        logger.warning(
                            "FP16 params for CPUAdam may not work on AMD CPUs")
                        break
                else:
                    continue
                break

        self.opt_id = DeepSpeedCPUAdam.optimizer_id
        DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
        self.adam_w_mode = adamw_mode
        self.fp32_optimizer_states = fp32_optimizer_states
        self.ds_opt_adam = CPUAdamBuilder().load()

        self.ds_opt_adam.create_adam(self.opt_id,
                                     lr,
                                     betas[0],
                                     betas[1],
                                     eps,
                                     weight_decay,
                                     adamw_mode,
                                     should_log_le("info"))

    def __del__(self):
        # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize
        # is used multiple times in the same process (notebook or pytest worker)
        self.ds_opt_adam.destroy_adam(self.opt_id)

    def __setstate__(self, state):
        super(DeepSpeedCPUAdam, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None, fp16_param_groups=None):
        """Update the model parameters.

        .. note::
            This method will be called internally by ZeRO-Offload. DeepSpeed
            users should still use ``engine.step()`` as shown in the
            `Getting Started
            <https://www.deepspeed.ai/getting-started/#training>`_ guide.

        Args:
            closure (callable, optional): closure to compute the loss.
                Defaults to ``None``.
            fp16_param_groups: FP16 GPU parameters to update. Performing the
                copy here reduces communication time. Defaults to ``None``.

        Returns:
            loss: if ``closure`` is provided. Otherwise ``None``.
        """

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # intended device for step
        device = torch.device('cpu')

        # normalize fp16_param_groups into a list of parameter groups
        if type(fp16_param_groups) is list:
            if type(fp16_param_groups[0]) is not list:
                fp16_param_groups = [fp16_param_groups]
        elif fp16_param_groups is not None:
            fp16_param_groups = [[fp16_param_groups]]

        for group_id, group in enumerate(self.param_groups):
            for param_id, p in enumerate(group['params']):

                if p.grad is None:
                    continue

                assert p.device == device, f"CPUAdam param is on {p.device} and must be 'cpu', make " \
                    "sure you enabled 'offload_optimizer': 'cpu' in your ZeRO config."

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    #print(f'group {group_id} param {param_id} = {p.numel()}')
                    state['step'] = 0

                    # use full precision by default unless self.fp32_optimizer_states is off
                    state_dtype = torch.float if self.fp32_optimizer_states else p.dtype

                    # gradient momentums
                    state['exp_avg'] = torch.zeros_like(p.data,
                                                        dtype=state_dtype,
                                                        device=device)
                    #memory_format=torch.preserve_format)
                    # gradient variances
                    state['exp_avg_sq'] = torch.zeros_like(p.data,
                                                           dtype=state_dtype,
                                                           device=device)
                    #memory_format=torch.preserve_format)

                state['step'] += 1
                beta1, beta2 = group['betas']

                if fp16_param_groups is not None:
                    # update states and copy the updated values into the FP16 GPU params
                    self.ds_opt_adam.adam_update_copy(
                        self.opt_id,
                        state['step'],
                        group['lr'],
                        beta1,
                        beta2,
                        group['eps'],
                        group['weight_decay'],
                        group['bias_correction'],
                        p.data,
                        p.grad.data,
                        state['exp_avg'],
                        state['exp_avg_sq'],
                        fp16_param_groups[group_id][param_id].data)
                else:
                    # update states and the CPU master params only
                    self.ds_opt_adam.adam_update(self.opt_id,
                                                 state['step'],
                                                 group['lr'],
                                                 beta1,
                                                 beta2,
                                                 group['eps'],
                                                 group['weight_decay'],
                                                 group['bias_correction'],
                                                 p.data,
                                                 p.grad.data,
                                                 state['exp_avg'],
                                                 state['exp_avg_sq'])
        return loss
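

# Minimal usage sketch, assuming the CPUAdam extension builds and loads on this
# machine. The toy model, tensor shapes, and hyperparameters below are illustrative
# only; in real DeepSpeed runs the optimizer is normally built by deepspeed.initialize()
# from the JSON config, as the class docstring recommends. The two branches mirror the
# two step() options described above: (1) update CPU states only, and (2) update states
# and copy the result into FP16 GPU parameters in the same call.
if __name__ == "__main__":
    # toy model kept on CPU so the optimizer's device assertion passes
    model = torch.nn.Linear(16, 4)
    optimizer = DeepSpeedCPUAdam(model.parameters(),
                                 lr=1e-3,
                                 betas=(0.9, 0.999),
                                 weight_decay=0.01,
                                 adamw_mode=True)

    inputs = torch.randn(8, 16)
    targets = torch.randn(8, 4)

    # option (1): update the optimizer states and CPU master params only
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # option (2): also keep an FP16 copy of the params on GPU and let step()
    # write the updated values back in the same call (requires a CUDA device)
    if torch.cuda.is_available():
        fp16_params = [p.detach().clone().half().cuda() for p in model.parameters()]
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        loss.backward()
        optimizer.step(fp16_param_groups=fp16_params)
        optimizer.zero_grad()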