weight_quantizer.py

import torch

from ..module_inject.replace_policy import HFBertLayerPolicy, replace_policies


class WeightQuantization(object):

    def __init__(self, mlp_extra_grouping=True, mp_size=1):
        # Per-weight-type buckets that collect the quantization scales of
        # every transformer layer as it is quantized.
        self.dense_scales = []
        self.qkv_scales = []
        self.mlp4hh_scales = []
        self.mlph4h_scales = []
        self.mlp_extra_grouping = mlp_extra_grouping
        self.mp_size = mp_size

    def quantize_data(self, data, quantize_bits, groups, key=None):
        # Symmetric, group-wise quantization: flatten the tensor, split it
        # into `groups` equal chunks, and scale each chunk by its own
        # maximum absolute value before rounding into the signed
        # `quantize_bits` integer range.
        data_groups = torch.split(data.float().view(-1), data.numel() // groups)
        max_d = [max(g.max(), g.min().abs()) for g in data_groups]
        data_scale = [float(1 << quantize_bits) / (2 * mx + 1e-5) for mx in max_d]
        data_int = [(g * s) for g, s in zip(data_groups, data_scale)]
        data_int = [
            di.round().clamp(-(1 << (quantize_bits - 1)),
                             ((1 << (quantize_bits - 1)) - 1)) for di in data_int
        ]
        data_int = torch.cat(data_int).reshape(data.shape)
        data_int = data_int.to(torch.int8)
        data_scale = torch.cat([s.unsqueeze(0).unsqueeze(0) for s in data_scale])
        return data_int, data_scale
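
    # A minimal round-trip sketch (hypothetical usage; the 16x16 shape and
    # groups=4 are arbitrary, and CUDA is not needed for this method):
    #
    #   q = WeightQuantization()
    #   w = torch.randn(16, 16)
    #   w_int8, scales = q.quantize_data(w, quantize_bits=8, groups=4)
    #   # `scales` has shape (groups, 1); dequantize group-wise:
    #   w_hat = (w_int8.float().view(4, -1) / scales).view(w.shape)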

    def is_mlp(self, data, merge_count=1):
        # MLP weights have a 4:1 ratio between their two dimensions once
        # model parallelism (mp_size) and checkpoint merging are undone.
        return ((self.mp_size * data.shape[0] * merge_count) / data.shape[1] == 4 or \
                (self.mp_size * data.shape[1] * merge_count) / data.shape[0] == 4)

    def is_qkv(self, data):
        # Fused query-key-value weights have a 3:1 ratio between dimensions.
        return ((self.mp_size * data.shape[0]) / data.shape[1] == 3 or \
                (self.mp_size * data.shape[1]) / data.shape[0] == 3)
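
    # For example (hypothetical shapes, mp_size=1): a fused QKV weight of
    # shape (3072, 1024) gives 3072 / 1024 == 3, so is_qkv() returns True,
    # while an h-to-4h MLP weight of shape (4096, 1024) gives
    # 4096 / 1024 == 4, so is_mlp() returns True.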

    def Quantize(self, value_list, quantize_bits, groups, key, merge_dim=0):
        # Use twice as many groups for MLP weights when extra grouping is on.
        if self.mlp_extra_grouping and self.is_mlp(value_list[0],
                                                   merge_count=len(value_list)):
            groups *= 2
        q_scale = []
        index = 0
        for data in value_list:
            data_int, data_scale = self.quantize_data(data, quantize_bits, groups, key)
            q_scale.append(data_scale)
            value_list[index] = data_int
            index += 1
        # Store the inverse scales so that dequantization is a multiply.
        q_scale = (1 /
                   torch.cat(q_scale, dim=merge_dim).to(
                       torch.cuda.current_device()).view(-1).unsqueeze(0))
        if "mlp.dense_4h_to_h.weight" in key:
            self.mlp4hh_scales.append(q_scale)
        elif "mlp.dense_h_to_4h.weight" in key:
            self.mlph4h_scales.append(q_scale)
        elif "attention.query_key_value.weight" in key:
            self.qkv_scales.append(q_scale)
        else:
            self.dense_scales.append(q_scale)
        return value_list

    def merge_layer_scales(self, layer_scales):
        # Zero-pad every scale tensor up to the widest one so the four
        # per-layer scales can be stacked into a single tensor.
        max_dim = max([s.shape[-1] for s in layer_scales])
        layer_scales = [
            torch.cat((s,
                       torch.zeros((1, max_dim - s.shape[-1]),
                                   device=torch.cuda.current_device())),
                      dim=-1) if s.shape[-1] < max_dim else s for s in layer_scales
        ]
        return torch.cat(layer_scales).unsqueeze(0)
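
    # E.g. (hypothetical sizes): with extra MLP grouping, scale rows of
    # widths 64, 64, 128, and 128 are all padded to width 128 and stacked
    # into a single (1, 4, 128) tensor for the layer.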

    def merge_scales(self):
        all_scales = []
        for dense_scale, qkv_scale, m4hh_scale, mh4h_scale in \
            zip(self.dense_scales, self.qkv_scales, self.mlp4hh_scales, self.mlph4h_scales):
            all_scales.append(
                self.merge_layer_scales([qkv_scale,
                                         dense_scale,
                                         mh4h_scale,
                                         m4hh_scale]))
        return torch.cat(all_scales)

    def merge_scales_split(self, split_count):
        all_scales = [[] for _ in range(split_count)]
        for dense_scale, qkv_scale, m4hh_scale, mh4h_scale in \
            zip(self.dense_scales, self.qkv_scales, self.mlp4hh_scales, self.mlph4h_scales):
            dense_scale = torch.split(dense_scale, dense_scale.numel() // split_count)
            qkv_scale = torch.split(qkv_scale, qkv_scale.numel() // split_count)
            m4hh_scale = torch.split(m4hh_scale, m4hh_scale.numel() // split_count)
            mh4h_scale = torch.split(mh4h_scale, mh4h_scale.numel() // split_count)
            for s in range(split_count):
                all_scales[s].append(
                    torch.cat([
                        torch.cat((qkv_scale[s],
                                   torch.zeros_like(qkv_scale[s])),
                                  dim=1),
                        torch.cat((dense_scale[s],
                                   torch.zeros_like(dense_scale[s])),
                                  dim=1),
                        mh4h_scale[s],
                        m4hh_scale[s]
                    ]).unsqueeze(0))
        # Concatenate each split's collected per-layer scales into one tensor.
        all_scales = [torch.cat(scales_a) for scales_a in all_scales]
        return all_scales

    def sd_quantize_megatron(self, sd, quantize_bits, groups):
        keys = sd.keys()
        for key in keys:
            value_list = [sd[key]]
            if "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key or \
                "mlp.dense_h_to_4h.weight" in key or "attention.query_key_value.weight" in key:
                value_list = self.Quantize(value_list, quantize_bits, groups, key=key)
            sd[key] = value_list[0]

        all_scales = self.merge_scales()
        return sd, all_scales
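
    # Hypothetical usage on a Megatron-style checkpoint (the file name and
    # the 'model' key are assumptions about the checkpoint layout):
    #
    #   quantizer = WeightQuantization(mlp_extra_grouping=True, mp_size=1)
    #   sd = torch.load("checkpoint.pt")["model"]
    #   sd, scales = quantizer.sd_quantize_megatron(sd, quantize_bits=8, groups=64)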

    def model_quantize(self, model, quantize_policy, quantize_bits, groups):
        all_scales = []

        def quantize_fn(layer, policy_cls):
            policy = policy_cls(layer)

            _, qkvw, _, dense_w, _, _ = policy.attention()
            _, _h4h_w, _, _4hh_w, _ = policy.mlp()
            keys = [qkvw, dense_w, _h4h_w, _4hh_w]
            layer_scales = []

            for key in range(len(keys)):
                if self.mlp_extra_grouping and self.is_mlp(keys[key]):
                    data_quantized, data_scale = self.quantize_data(keys[key], quantize_bits, groups * 2)
                elif policy_cls is HFBertLayerPolicy and self.is_qkv(keys[key]):
                    data_quantized, data_scale = self.quantize_data(keys[key], quantize_bits, groups * 3)
                else:
                    data_quantized, data_scale = self.quantize_data(keys[key], quantize_bits, groups)
                # Overwrite the floating-point weight in place with int8 values.
                keys[key].copy_(data_quantized)
                layer_scales.append(
                    (1 /
                     data_scale.to(torch.cuda.current_device()).view(-1).unsqueeze(0)))
            all_scales.append(self.merge_layer_scales(layer_scales))

            return layer

        def _quantize_module(model, policies):
            # Recursively walk the module tree and quantize every child whose
            # class has a registered policy.
            for name, child in model.named_children():
                if child.__class__ in policies:
                    quantize_fn, replace_policy = policies[child.__class__]
                    setattr(model, name, quantize_fn(child, replace_policy))
                else:
                    _quantize_module(child, policies)
            return model

        policy = {}
        if quantize_policy is not None:
            for layer_name, replace_policy in quantize_policy.items():
                policy.update({layer_name: (quantize_fn, replace_policy)})
        else:
            for plcy in replace_policies:
                policy.update({plcy._orig_layer_class: (quantize_fn, plcy)})

        quantized_module = _quantize_module(model, policy)
        return quantized_module, torch.cat(all_scales)
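

# Hypothetical end-to-end usage with a HuggingFace BERT model (the model name
# and group count are assumptions; passing quantize_policy=None falls back to
# the built-in replace_policies registry):
#
#   from transformers import BertModel
#   model = BertModel.from_pretrained("bert-base-uncased")
#   quantizer = WeightQuantization(mlp_extra_grouping=False)
#   model, scales = quantizer.model_quantize(model, quantize_policy=None,
#                                            quantize_bits=8, groups=32)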