basic_layer.py

'''Copyright The Microsoft DeepSpeed Team'''

import torch
import math
from torch import nn
from torch.nn import init
import deepspeed.comm as dist
from .utils import TopKBinarizer, SymQuantizer, AsymQuantizer, TernaryQuantizer, BinaryQuantizer
from deepspeed.utils import logger

g_mpu = None

class QuantAct(nn.Module):
    """
    Class to quantize given activations. Note that when using this module, the input activation
    quantization range is fixed for all tokens/images at inference time. This generally costs some
    accuracy but achieves better latency.

    Parameters:
    ----------
    act_range_momentum : float, default 0.95
        Momentum for updating the activation quantization range.
    quant_mode : str, default 'symmetric'
    """
    def __init__(self, act_range_momentum=0.95, quant_mode='symmetric'):
        super(QuantAct, self).__init__()

        self.act_range_momentum = act_range_momentum
        self.quant_mode = quant_mode
        if quant_mode == 'symmetric':
            self.act_function = SymQuantizer.apply
        else:
            self.act_function = AsymQuantizer.apply

        self.register_buffer('x_min_max', torch.zeros(2))

    def forward(self, x, num_bits, *args):
        """
        x: the activation that we need to quantize
        num_bits: the number of bits we need to quantize the activation to
        *args: extra arguments that are unused but needed to align with the interface of the other quantization functions
        """
        if self.training:
            x_min = x.data.min()
            x_max = x.data.max()

            # Initialization
            if self.x_min_max[0] == self.x_min_max[1]:
                self.x_min_max[0] = x_min
                self.x_min_max[1] = x_max

            # if momentum is not needed, set self.act_range_momentum = 0
            self.x_min_max[0] = self.x_min_max[0] * self.act_range_momentum + x_min * (1 - self.act_range_momentum)
            self.x_min_max[1] = self.x_min_max[1] * self.act_range_momentum + x_max * (1 - self.act_range_momentum)

        x_q = self.act_function(x, num_bits, self.x_min_max[0], self.x_min_max[1])

        return x_q

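# Illustrative usage sketch (not part of the original file): how QuantAct's momentum-based
# range tracking is typically exercised. The bit width and tensor shape below are hypothetical.
#
#   act_quant = QuantAct(act_range_momentum=0.95, quant_mode='symmetric')
#   act_quant.train()
#   x_q = act_quant(torch.randn(8, 16), 8)   # first call initializes x_min_max, then applies the EMA update
#   act_quant.eval()
#   x_q = act_quant(torch.randn(8, 16), 8)   # at inference the stored [min, max] range is reused unchanged
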
class Embedding_Compress(nn.Embedding):
    def __init__(self, *kargs):
        super(Embedding_Compress, self).__init__(*kargs)
        self.weight.start_bits = None
        self.weight.target_bits = None
        self.weight.q_period = None
        self.weight_quantization_enabled_in_forward = False
        self.weight_quantization_enabled = False

    def extra_repr(self):
        return 'num_embeddings={}, embedding_dim={}, weight_quantization={}'.format(
            self.num_embeddings, self.embedding_dim, self.weight.target_bits)

    def enable_weight_quantization(self, start_bits, target_bits, quantization_period,
                                   weight_quantization_enabled_in_forward, quantization_type, num_groups):
        self.weight.start_bits = start_bits
        self.weight.target_bits = target_bits
        self.weight.q_period = quantization_period
        self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward
        if self.weight_quantization_enabled_in_forward:
            logger.warning(
                "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider using the DS-FP16 optimizer ************"
            )
            if self.weight.target_bits >= 3:
                if quantization_type == 'symmetric':
                    self.weight_quantizer = SymQuantizer.apply
                else:
                    self.weight_quantizer = AsymQuantizer.apply
            elif self.weight.target_bits == 2:
                assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for ternary weight quantization'
                self.weight_quantizer = TernaryQuantizer.apply
            elif self.weight.target_bits == 1:
                assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for binary weight quantization'
                self.weight_quantizer = BinaryQuantizer.apply
            # for embedding, we always use token-wise quantization
            self.weight_quantize_num_groups = self.weight.size(0)

    def fix_weight_quantization(self):
        self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None,
                                                 self.weight_quantize_num_groups).data
        self.weight_quantization_enabled_in_forward = False
        return None

    def forward(self, input):
        if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled:
            weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None,
                                           self.weight_quantize_num_groups)
        else:
            weight = self.weight

        out = nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type,
                                      self.scale_grad_by_freq, self.sparse)
        return out

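# Illustrative usage sketch (not part of the original file): enabling token-wise weight
# quantization on Embedding_Compress. All argument values are hypothetical; note that the
# num_groups argument is effectively ignored here because the layer always quantizes per token row.
#
#   emb = Embedding_Compress(30522, 768)
#   emb.enable_weight_quantization(start_bits=12, target_bits=8, quantization_period=100,
#                                  weight_quantization_enabled_in_forward=True,
#                                  quantization_type='symmetric', num_groups=1)
#   emb.weight_quantization_enabled = True          # normally toggled by the compression scheduler
#   hidden = emb(torch.tensor([[1, 5, 20]]))        # forward() quantizes the weight before the lookup
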
class LinearLayer_Compress(nn.Linear):
    """
    Linear layer with compression.
    """
    def __init__(self, *kargs, bias=True):
        super(LinearLayer_Compress, self).__init__(*kargs, bias=bias)
        self.sparse_pruning_method = None
        self.row_pruning_method = None
        self.head_pruning_method = None
        self.activation_quantization_method = None
        self.weight.start_bits = None
        self.weight.target_bits = None
        self.weight.q_period = None
        self.weight_quantization_enabled_in_forward = False
        self.weight_quantization_enabled = False
        self.sparse_pruning_enabled = False
        self.row_pruning_enabled = False
        self.head_pruning_enabled = False
        self.activation_quantization_enabled = False

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}, sparse pruning={}, row pruning={}, head pruning={}, activation quantization={}, weight_quantization={}'.format(
            self.in_features, self.out_features, self.bias is not None, self.sparse_pruning_method is not None,
            self.row_pruning_method is not None, self.head_pruning_method is not None,
            self.activation_quantization_method is not None, self.weight.target_bits)

    def enable_sparse_pruning(self, ratio, method):
        # Here, we support two cases: L1-norm-based pruning and topk-based pruning
        self.sparse_pruning_ratio = ratio
        self.sparse_pruning_method = method
        if method == 'l1':
            weight_norm = torch.abs(self.weight.data)
            mask = TopKBinarizer.apply(weight_norm, self.sparse_pruning_ratio, False)
            mask = mask.view(self.weight.size())
            mask = mask.to(self.weight.device)
        elif method == 'topk':
            self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size()))
            self.sparse_mask_scores.data = self.sparse_mask_scores.data.to(self.weight.device)
            init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5))
            mask = None
        else:
            raise NotImplementedError

        self.register_buffer('sparse_pruning_mask', mask)

    def enable_row_pruning(self, ratio, method):
        # Here, we support two cases: L1-norm-based pruning and topk-based pruning
        self.row_pruning_ratio = ratio
        self.row_pruning_method = method

        if method == 'l1':
            # compute the l1 norm of each row (output neuron)
            weight_norm = torch.norm(self.weight.data, p=1, dim=1)
            mask = TopKBinarizer.apply(weight_norm, self.row_pruning_ratio, False)
            mask = mask.view(-1, 1)
            mask = mask.to(self.weight.device)
        elif method == 'topk':
            self.row_mask_scores = nn.Parameter(torch.Tensor(self.weight.size(0), 1))
            self.row_mask_scores.data = self.row_mask_scores.data.to(self.weight.device)
            init.kaiming_uniform_(self.row_mask_scores, a=math.sqrt(5))
            mask = None
        else:
            raise NotImplementedError

        self.register_buffer('row_pruning_mask', mask)

    def enable_head_pruning(self, ratio, method, num_heads):
        # Here, we support only topk-based pruning
        self.num_heads = num_heads
        self.head_pruning_ratio = ratio
        self.head_pruning_method = method

        if method not in ['topk']:
            raise NotImplementedError
        else:
            self.head_pruning_ratio = ratio
            self.head_pruning_scores = nn.Parameter(torch.Tensor(1, self.num_heads))  # we apply the pruning to the O matrix
            self.head_pruning_scores.data = self.head_pruning_scores.data.to(self.weight.device)
            init.kaiming_uniform_(self.head_pruning_scores, a=math.sqrt(5))

    def fix_sparse_pruning_helper(self):
        mask = self.get_mask(pruning_type='sparse')
        self.weight.data = self.weight.data * mask
        del self.sparse_pruning_mask
        if self.sparse_pruning_method == 'topk':
            del self.sparse_mask_scores
        self.sparse_pruning_method = None
        self.sparse_pruning_enabled = False
        return None

    def fix_row_col_pruning_helper(self, mask=None, dim_reduction=False):
        # This function is used for row/col pruning.
        # In particular, if we have two back-to-back layers, F1 and F2, then when we
        # remove rows from F1 we also need to remove the corresponding columns from F2.
        # However, if we only have one layer, F1, we only need to mask the pruned rows
        # as 0 in F1.
        if mask is None:
            mask = self.get_mask(pruning_type='row').bool()
            if dim_reduction:
                start_bits = self.weight.start_bits
                target_bits = self.weight.target_bits
                q_period = self.weight.q_period
                self.weight = nn.Parameter(self.weight.data[mask.view(-1), :])
                self.weight.start_bits = start_bits
                self.weight.target_bits = target_bits
                self.weight.q_period = q_period
                if self.bias is not None:
                    self.bias = nn.Parameter(self.bias.data[mask.view(-1)])
                self.out_features = self.weight.size(0)
            else:
                self.weight.data = self.weight.data * mask.view(-1, 1)
                if self.bias is not None:
                    self.bias.data = self.bias.data * mask.view(-1)

            del self.row_pruning_mask
            if self.row_pruning_method == 'topk':
                del self.row_mask_scores
            self.row_pruning_method = None
        else:
            # this is generally for column pruning
            start_bits = self.weight.start_bits
            target_bits = self.weight.target_bits
            q_period = self.weight.q_period
            self.weight = nn.Parameter(self.weight.data[:, mask.view(-1)])
            self.weight.start_bits = start_bits
            self.weight.target_bits = target_bits
            self.weight.q_period = q_period
            self.in_features = self.weight.size(1)
            mask = None

        self.row_pruning_enabled = False
        return mask

    def fix_head_pruning_helper(self, mask=None, num_heads=None, dim_reduction=False):
        # Similar to row/col pruning, head pruning also needs to prune the QKV matrices
        # associated with the O matrix.
        num_heads = num_heads if num_heads else self.num_heads
        if mask is None:
            if self.head_pruning_method == 'topk':
                mask = self.get_mask(pruning_type='head').bool()
                if dim_reduction:
                    shape = self.weight.size(0)
                    start_bits = self.weight.start_bits
                    target_bits = self.weight.target_bits
                    q_period = self.weight.q_period
                    self.weight = nn.Parameter(
                        self.weight.data.t().reshape(num_heads, -1)[mask.view(-1), :].reshape(-1, shape).t())
                    self.weight.start_bits = start_bits
                    self.weight.target_bits = target_bits
                    self.weight.q_period = q_period
                else:
                    shape = self.weight.size()
                    self.weight.data = (self.weight.data.t().reshape(self.num_heads, -1) *
                                        mask.view(-1, 1)).reshape(shape[1], shape[0]).t()

                if self.head_pruning_method == 'topk':
                    del self.head_pruning_scores
                self.head_pruning_method = None
            else:
                raise NotImplementedError
        else:
            start_bits = self.weight.start_bits
            target_bits = self.weight.target_bits
            q_period = self.weight.q_period
            shape = self.weight.size(1)
            self.weight = nn.Parameter(self.weight.data.reshape(num_heads, -1)[mask.view(-1), :].reshape(-1, shape))
            self.weight.start_bits = start_bits
            self.weight.target_bits = target_bits
            self.weight.q_period = q_period
            if self.bias is not None:
                self.bias = nn.Parameter(self.bias.data.reshape(num_heads, -1)[mask.view(-1), :].reshape(-1))

        self.head_pruning_enabled = False
        return mask

    def get_mask(self, pruning_type='row'):
        if pruning_type == 'sparse':
            if self.sparse_pruning_method == 'l1':
                return self.sparse_pruning_mask.to(self.weight.device)
            elif self.sparse_pruning_method == 'topk':
                return TopKBinarizer.apply(self.sparse_mask_scores, self.sparse_pruning_ratio, False)
            else:
                raise NotImplementedError
        if pruning_type == 'row':
            if self.row_pruning_method == 'l1':
                return self.row_pruning_mask.to(self.weight.device)
            elif self.row_pruning_method == 'topk':
                return TopKBinarizer.apply(self.row_mask_scores, self.row_pruning_ratio, False)
            else:
                raise NotImplementedError
        elif pruning_type == 'head':
            if self.head_pruning_method == 'topk':
                return TopKBinarizer.apply(self.head_pruning_scores, self.head_pruning_ratio, False)
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError

    def enable_weight_quantization(self, start_bits, target_bits, quantization_period,
                                   weight_quantization_enabled_in_forward, quantization_type, num_groups):
        self.weight.start_bits = start_bits
        self.weight.target_bits = target_bits
        self.weight.q_period = quantization_period
        self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward
        if self.weight_quantization_enabled_in_forward:
            logger.warning(
                "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider using the DS-FP16 optimizer ************"
            )
            if self.weight.target_bits >= 3:
                if quantization_type == 'symmetric':
                    self.weight_quantizer = SymQuantizer.apply
                else:
                    self.weight_quantizer = AsymQuantizer.apply
            elif self.weight.target_bits == 2:
                assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for ternary weight quantization'
                self.weight_quantizer = TernaryQuantizer.apply
            elif self.weight.target_bits == 1:
                assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for binary weight quantization'
                self.weight_quantizer = BinaryQuantizer.apply
            self.weight_quantize_num_groups = num_groups

    def fix_weight_quantization(self):
        self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None,
                                                 self.weight_quantize_num_groups).data
        self.weight_quantization_enabled_in_forward = False
        return None

    def enable_activation_quantization(self, bits, quantization_type, range_calibration):
        assert bits in [4, 8], 'Only 4/8-bit activation quantization is supported for now'
        self.activation_quantization_bits = bits
        self.activation_quantization_method = f"{quantization_type}_{range_calibration}"
        if range_calibration == 'static':
            self.activation_quantizer = QuantAct(quant_mode=quantization_type)
        else:
            if quantization_type == 'symmetric':
                self.activation_quantizer = SymQuantizer.apply
            else:
                self.activation_quantizer = AsymQuantizer.apply

    def head_pruning_reshape(self, w, mask):
        shape = w.shape
        return (w.t().reshape(self.num_heads, -1) * mask.view(-1, 1)).reshape(shape[1], shape[0]).t()

    def forward(self, input, skip_bias_add=False):
        if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled:
            weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None,
                                           self.weight_quantize_num_groups)
            bias = self.bias
        else:
            weight = self.weight
            bias = self.bias

        if self.sparse_pruning_enabled and self.sparse_pruning_method:
            mask = self.get_mask(pruning_type='sparse')
            weight = weight * mask.view(self.weight.size())

        if self.row_pruning_enabled and self.row_pruning_method:
            mask = self.get_mask(pruning_type='row')
            weight = weight * mask.view(-1, 1)
            if bias is not None:
                bias = bias * mask.view(-1)

        if self.head_pruning_enabled and self.head_pruning_method:
            mask = self.get_mask(pruning_type='head')
            weight = self.head_pruning_reshape(weight, mask)

        if self.activation_quantization_enabled:
            if 'dynamic' in self.activation_quantization_method:
                num_groups = input.numel() // input.size(-1)
            else:
                num_groups = 1
            input = self.activation_quantizer(input, self.activation_quantization_bits, None, None, num_groups)

        if skip_bias_add:
            # used for mpu linear layers
            output = nn.functional.linear(input, weight, None)
            return output, bias
        else:
            output = nn.functional.linear(input, weight, bias)
            return output

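# Illustrative usage sketch (not part of the original file): combining sparse pruning and
# dynamic activation quantization on LinearLayer_Compress. Ratios and bit widths are hypothetical.
#
#   fc = LinearLayer_Compress(1024, 1024, bias=True)
#   fc.enable_sparse_pruning(ratio=0.5, method='l1')          # builds the 'sparse_pruning_mask' buffer
#   fc.sparse_pruning_enabled = True                          # normally toggled by the compression scheduler
#   fc.enable_activation_quantization(bits=8, quantization_type='symmetric', range_calibration='dynamic')
#   fc.activation_quantization_enabled = True
#   y = fc(torch.randn(4, 1024))                              # masked weight, token-wise quantized input
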
class Conv2dLayer_Compress(nn.Conv2d):
    """
    Conv2D layer with compression.
    """
    def __init__(self, *kargs):
        super(Conv2dLayer_Compress, self).__init__(*kargs)
        self.sparse_pruning_method = None
        self.channel_pruning_method = None
        self.activation_quantization_method = None
        self.weight.start_bits = None
        self.weight.target_bits = None
        self.weight.q_period = None
        self.weight_quantization_enabled_in_forward = False
        self.weight_quantization_enabled = False  # default added for parity with LinearLayer_Compress; forward() reads this flag
        self.sparse_pruning_enabled = False
        self.channel_pruning_enabled = False
        self.activation_quantization_enabled = False

    def __repr__(self):
        s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
        if self.padding != (0, ) * len(self.padding):
            s += ', padding={padding}'
        if self.dilation != (1, ) * len(self.dilation):
            s += ', dilation={dilation}'
        if self.output_padding != (0, ) * len(self.output_padding):
            s += ', output_padding={output_padding}'
        if self.groups != 1:
            s += ', groups={groups}'
        if self.bias is None:
            s += ', bias=False'
        if self.padding_mode != 'zeros':
            s += ', padding_mode={padding_mode}'
        output = s.format(**self.__dict__)
        return output + ' sparse pruning={}, channel pruning={}, activation quantization={}, weight_quantization={}'.format(
            self.sparse_pruning_method is not None, self.channel_pruning_method is not None,
            self.activation_quantization_method is not None, self.weight.target_bits)

    def enable_sparse_pruning(self, ratio, method):
        self.sparse_pruning_ratio = ratio
        self.sparse_pruning_method = method
        if method == 'l1':
            weight_norm = torch.abs(self.weight.data)
            mask = TopKBinarizer.apply(weight_norm, self.sparse_pruning_ratio, False)
            mask = mask.view(self.weight.size())
            mask = mask.to(self.weight.device)
        elif method == 'topk':
            self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size()))
            self.sparse_mask_scores.data = self.sparse_mask_scores.data.to(self.weight.device)
            init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5))
            mask = None
        else:
            raise NotImplementedError

        self.register_buffer('sparse_pruning_mask', mask)

    def enable_channel_pruning(self, ratio, method):
        # Here, we support two cases: L1-norm-based pruning and topk-based pruning
        self.channel_pruning_ratio = ratio
        self.channel_pruning_method = method

        if method == 'l1':
            # compute the l1 norm of each conv2d kernel (the last three dimensions)
            weight_norm = torch.norm(self.weight.data, p=1, dim=[1, 2, 3])
            mask = TopKBinarizer.apply(weight_norm, self.channel_pruning_ratio, False)
            mask = mask.view(-1, 1, 1, 1)
            mask = mask.to(self.weight.device)
        elif method == 'topk':
            self.channel_mask_scores = nn.Parameter(torch.Tensor(self.weight.size(0), 1, 1, 1))
            self.channel_mask_scores.data = self.channel_mask_scores.data.to(self.weight.device)
            init.kaiming_uniform_(self.channel_mask_scores, a=math.sqrt(5))
            mask = None
        else:
            raise NotImplementedError

        self.register_buffer('channel_pruning_mask', mask)

    def fix_sparse_pruning_helper(self):
        mask = self.get_mask(pruning_type='sparse')
        self.weight.data = self.weight.data * mask
        del self.sparse_pruning_mask
        if self.sparse_pruning_method == 'topk':
            del self.sparse_mask_scores
        self.sparse_pruning_method = None
        self.sparse_pruning_enabled = False
        return None

    def fix_channel_pruning_helper(self, mask=None, dim_reduction=False):
        if mask is None:
            if self.channel_pruning_method in ['l1', 'topk']:
                mask = self.get_mask(pruning_type='channel').bool()
                if dim_reduction:
                    start_bits = self.weight.start_bits
                    target_bits = self.weight.target_bits
                    q_period = self.weight.q_period
                    self.weight = nn.Parameter(self.weight.data[mask.view(-1), ...])
                    self.weight.start_bits = start_bits
                    self.weight.target_bits = target_bits
                    self.weight.q_period = q_period
                    if self.bias is not None:
                        self.bias = nn.Parameter(self.bias.data[mask.view(-1)])
                else:
                    self.weight.data = self.weight.data * mask.view(-1, 1, 1, 1)
                    if self.bias is not None:
                        self.bias.data = self.bias.data * mask.view(-1)

                del self.channel_pruning_mask
                if self.channel_pruning_method == 'topk':
                    del self.channel_mask_scores
                self.channel_pruning_method = None
            else:
                raise NotImplementedError
        else:
            start_bits = self.weight.start_bits
            target_bits = self.weight.target_bits
            q_period = self.weight.q_period
            self.weight = nn.Parameter(self.weight.data[:, mask.view(-1), ...])
            self.weight.start_bits = start_bits
            self.weight.target_bits = target_bits
            self.weight.q_period = q_period
            mask = None

        self.channel_pruning_enabled = False
        return mask

    def get_mask(self, pruning_type='sparse'):
        if pruning_type == 'sparse':
            if self.sparse_pruning_method == 'l1':
                return self.sparse_pruning_mask.to(self.weight.device)
            elif self.sparse_pruning_method == 'topk':
                return TopKBinarizer.apply(self.sparse_mask_scores, self.sparse_pruning_ratio, False)
            else:
                raise NotImplementedError
        elif pruning_type == 'channel':
            if self.channel_pruning_method == 'l1':
                return self.channel_pruning_mask.to(self.weight.device)
            elif self.channel_pruning_method == 'topk':
                return TopKBinarizer.apply(self.channel_mask_scores, self.channel_pruning_ratio, False)
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError

    def fix_weight_quantization(self):
        self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None,
                                                 self.weight_quantize_num_groups).data
        self.weight_quantization_enabled_in_forward = False
        return None

    def enable_weight_quantization(self, start_bits, target_bits, quantization_period,
                                   weight_quantization_enabled_in_forward, quantization_type, num_groups):
        self.weight.start_bits = start_bits
        self.weight.target_bits = target_bits
        self.weight.q_period = quantization_period
        self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward
        if self.weight_quantization_enabled_in_forward:
            assert self.weight.target_bits >= 4, 'Only >=4-bit weight quantization is supported during the forward pass for now'
            logger.warning(
                "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider using the DS-FP16 optimizer ************"
            )
            if quantization_type == 'symmetric':
                self.weight_quantizer = SymQuantizer.apply
            else:
                self.weight_quantizer = AsymQuantizer.apply
            self.weight_quantize_num_groups = num_groups

    def enable_activation_quantization(self, bits, quantization_type, range_calibration):
        assert bits in [4, 8], 'Only 4/8-bit activation quantization is supported for now'
        self.activation_quantization_bits = bits
        self.activation_quantization_method = f"{quantization_type}_{range_calibration}"
        if range_calibration == 'static':
            self.activation_quantizer = QuantAct(quant_mode=quantization_type)
        else:
            if quantization_type == 'symmetric':
                self.activation_quantizer = SymQuantizer.apply
            else:
                self.activation_quantizer = AsymQuantizer.apply

    def forward(self, input):
        if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled:
            weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None,
                                           self.weight_quantize_num_groups)
            bias = self.bias
        else:
            weight = self.weight
            bias = self.bias

        if self.sparse_pruning_enabled and self.sparse_pruning_method:
            mask = self.get_mask(pruning_type='sparse')
            weight = weight * mask.view(self.weight.size())

        if self.channel_pruning_enabled:
            mask = self.get_mask(pruning_type='channel')
            weight = weight * mask.view(-1, 1, 1, 1)
            if bias is not None:
                bias = bias * mask.view(-1)

        if self.activation_quantization_enabled:
            if 'dynamic' in self.activation_quantization_method:
                num_groups = input.numel() // input[0].numel()
            else:
                num_groups = 1
            input = self.activation_quantizer(input, self.activation_quantization_bits, None, None, num_groups)

        return nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)

class BNLayer_Compress(nn.BatchNorm2d):
    def fix_channel_pruning_helper(self, mask, dim_reduction=True):
        self.weight = nn.Parameter(self.weight.data[mask.view(-1)])
        self.bias = nn.Parameter(self.bias.data[mask.view(-1)])
        self.running_mean = self.running_mean[mask.view(-1)]
        self.running_var = self.running_var[mask.view(-1)]

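# Illustrative sketch (not part of the original file): when a Conv2dLayer_Compress has output
# channels removed with dim_reduction=True, the mask it returns can be reused to shrink the
# following BatchNorm and the next convolution's input channels. Shapes below are hypothetical.
#
#   conv1 = Conv2dLayer_Compress(64, 128, 3)
#   bn1 = BNLayer_Compress(128)
#   conv2 = Conv2dLayer_Compress(128, 256, 3)
#   conv1.enable_channel_pruning(ratio=0.25, method='l1')
#   mask = conv1.fix_channel_pruning_helper(dim_reduction=True)   # returns the kept-channel mask
#   bn1.fix_channel_pruning_helper(mask)                          # shrink BN affine params and running stats
#   conv2.fix_channel_pruning_helper(mask, dim_reduction=True)    # drop the matching input channels
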
def _reduce(input_):
    """All-reduce the input tensor across the model parallel group."""
    group = g_mpu.get_model_parallel_group()

    # Bypass the function if we are using only 1 GPU.
    if dist.get_world_size(group=group) == 1:
        return input_

    # All-reduce.
    dist.all_reduce(input_, group=group)

    return input_

def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
        contiguous_split_chunks: If True, make each chunk contiguous in memory.
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    assert tensor.size()[last_dim] % num_partitions == 0
    last_dim_size = tensor.size()[last_dim] // num_partitions
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list

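# Illustrative example (not part of the original file): splitting the hidden dimension into
# equal chunks, which is what _split() below does with one chunk per model-parallel rank.
#
#   t = torch.arange(12).reshape(2, 6)
#   chunks = split_tensor_along_last_dim(t, 3)                                # three views of shape (2, 2)
#   chunks = split_tensor_along_last_dim(t, 3, contiguous_split_chunks=True)  # contiguous copies instead
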
def _split(input_):
    """Split the tensor along its last dimension and keep the corresponding slice."""
    group = g_mpu.get_model_parallel_group()

    # Bypass the function if we are using only 1 GPU.
    if dist.get_world_size(group=group) == 1:
        return input_

    # Split along last dimension.
    world_size = dist.get_world_size(group=group)
    input_list = split_tensor_along_last_dim(input_, world_size)

    # Note: torch.split does not create contiguous tensors by default.
    rank = dist.get_rank(group=group)
    output = input_list[rank].contiguous()

    return output

def _gather(input_):
    """Gather tensors and concatenate along the last dimension."""
    group = g_mpu.get_model_parallel_group()

    # Bypass the function if we are using only 1 GPU.
    if dist.get_world_size(group=group) == 1:
        return input_

    # Size and dimension.
    last_dim = input_.dim() - 1
    rank = dist.get_rank(group=group)
    world_size = dist.get_world_size(group=group)

    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    tensor_list[rank] = input_
    dist.all_gather(tensor_list, input_, group=group)

    # Note: torch.cat already creates a contiguous tensor.
    output = torch.cat(tensor_list, dim=last_dim).contiguous()

    return output

class _CopyToModelParallelRegion(torch.autograd.Function):
    """Pass the input to the model parallel region."""
    @staticmethod
    def forward(ctx, input_):
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        return _reduce(grad_output)


class _ReduceFromModelParallelRegion(torch.autograd.Function):
    """All-reduce the input from the model parallel region."""
    @staticmethod
    def forward(ctx, input_):
        return _reduce(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output


class _ScatterToModelParallelRegion(torch.autograd.Function):
    """Split the input and keep only the chunk corresponding to the rank."""
    @staticmethod
    def forward(ctx, input_):
        return _split(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return _gather(grad_output)


class _GatherFromModelParallelRegion(torch.autograd.Function):
    """Gather the input from the model parallel region and concatenate."""
    @staticmethod
    def forward(ctx, input_):
        return _gather(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return _split(grad_output)

# -----------------
# Helper functions.
# -----------------


def copy_to_model_parallel_region(input_):
    return _CopyToModelParallelRegion.apply(input_)


def reduce_from_model_parallel_region(input_):
    return _ReduceFromModelParallelRegion.apply(input_)


def scatter_to_model_parallel_region(input_):
    return _ScatterToModelParallelRegion.apply(input_)


def gather_from_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)

class ColumnParallelLinear_Compress(LinearLayer_Compress):
    def __init__(self, mpu, input_size, output_size, bias=True, gather_output=True, skip_bias_add=False):
        # Keep input parameters
        global g_mpu
        g_mpu = mpu
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        self.skip_bias_add = skip_bias_add

        # Divide the weight matrix along the last dimension.
        world_size = mpu.get_model_parallel_world_size()
        assert output_size % world_size == 0
        self.output_size_per_partition = output_size // world_size

        super(ColumnParallelLinear_Compress, self).__init__(self.input_size,
                                                            self.output_size_per_partition,
                                                            bias=bias)

    def forward(self, input_):
        # Set up backprop all-reduce.
        input_parallel = copy_to_model_parallel_region(input_)
        # Matrix multiply.
        if self.skip_bias_add:
            output_parallel, bias = super().forward(input_parallel, True)
        else:
            output_parallel = super().forward(input_parallel)
            bias = None
        if self.gather_output:
            # All-gather across the partitions.
            output = gather_from_model_parallel_region(output_parallel)
        else:
            output = output_parallel

        return output, bias

class RowParallelLinear_Compress(LinearLayer_Compress):
    def __init__(self, mpu, input_size, output_size, bias=True, input_is_parallel=False, skip_bias_add=False):
        # Keep input parameters
        global g_mpu
        g_mpu = mpu
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        self.skip_bias_add = skip_bias_add

        # Divide the weight matrix along the last dimension.
        world_size = mpu.get_model_parallel_world_size()
        assert input_size % world_size == 0
        self.input_size_per_partition = input_size // world_size

        super(RowParallelLinear_Compress, self).__init__(self.input_size_per_partition,
                                                         self.output_size,
                                                         bias=bias)

    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
            input_parallel = scatter_to_model_parallel_region(input_)
        # Matrix multiply.
        output_parallel, bias = super().forward(input_parallel, True)
        # All-reduce across all the partitions.
        output_ = reduce_from_model_parallel_region(output_parallel)
        if not self.skip_bias_add:
            if bias is not None:
                output = output_ + bias
            else:
                output = output_
            output_bias = None
        else:
            output = output_
            output_bias = bias

        return output, output_bias

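
# Illustrative sketch (not part of the original file): how the two parallel linear layers are
# typically paired in a model-parallel MLP block. `mpu`, `hidden`, and `hidden_states` are
# placeholders; `mpu` stands for a model-parallel unit object exposing get_model_parallel_group(),
# get_model_parallel_world_size(), and get_model_parallel_rank(), as assumed by g_mpu above.
#
#   dense_h_to_4h = ColumnParallelLinear_Compress(mpu, hidden, 4 * hidden, gather_output=False)
#   dense_4h_to_h = RowParallelLinear_Compress(mpu, 4 * hidden, hidden, input_is_parallel=True)
#   intermediate, _ = dense_h_to_4h(hidden_states)   # each rank holds a slice of the 4h dimension
#   output, bias = dense_4h_to_h(intermediate)       # partial results are all-reduced across ranks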