progressive_layer_drop.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import numpy as np
from deepspeed.utils import log_dist


class ProgressiveLayerDrop(object):
    r"""Progressive Layer Dropping (PLD) for model training.

    This implements the PLD technique for compressed model training
    from this paper: https://arxiv.org/pdf/2010.13369.pdf

    Args:
        theta (float): a hyper-parameter that controls the trade-off between training time and robustness.
            The lower the theta value, the faster the training speed. Default value: 0.5.
        gamma (float): a hyper-parameter that controls how fast the drop ratio increases. Default value: 0.001.
    """

    def __init__(self, theta=0.5, gamma=0.001):
        super().__init__()

        self.theta = theta
        self.gamma = gamma
        # The keep probability starts at 1.0 (no layers dropped) and decays
        # toward theta as training progresses.
        self.current_theta = 1.0
        log_dist(f'Enabled progressive layer dropping (theta = {self.theta})', ranks=[0])

    def get_state(self):
        # Keyword arguments consumed by a PLD-aware model forward pass.
        kwargs = {'progressive_layer_drop': True, 'pld_theta': self.get_theta()}
        return kwargs

    def get_theta(self):
        return self.current_theta

    def update_state(self, global_step):
        # Exponential schedule: theta_t = (1 - theta) * exp(-gamma * step) + theta,
        # so the keep probability decays from 1.0 toward theta at a rate set by gamma.
        def _prob(x, gamma, p):
            return (1. - p) * np.exp(-gamma * x) + p

        self.current_theta = _prob(global_step, self.gamma, self.theta)
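
A minimal sketch of how this class can be driven, shown standalone for illustration. The import path matches where this file lives in the DeepSpeed repo, but the loop below is an assumption: in normal use the DeepSpeed engine calls update_state each training step when progressive layer drop is enabled in the config, and the model forward pass consumes get_state().

from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop

# Keep probability starts at 1.0 and decays toward theta=0.5 at a rate set by gamma.
pld = ProgressiveLayerDrop(theta=0.5, gamma=0.001)

for step in (0, 100, 1000, 10000):
    pld.update_state(step)
    # get_theta() returns (1 - theta) * exp(-gamma * step) + theta,
    # i.e. 1.0 at step 0, approaching theta for large step counts.
    print(step, pld.get_theta())

# get_state() packages the current schedule as kwargs
# (progressive_layer_drop flag plus pld_theta) for a PLD-aware model.
state = pld.get_state()

With the default gamma of 0.001, the keep probability stays near 1.0 for the first few hundred steps and settles close to theta after roughly 5,000-10,000 steps, which is what makes early training behave like a full-depth model.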