# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import json
import torch
from deepspeed.utils.types import ActivationFuncType, NormType


class TransformerConfig():

    def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers):
        self.layer_id = -1
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.heads = heads
        self.num_hidden_layers = num_hidden_layers

class DeepSpeedInferenceConfig(TransformerConfig):
    """Initialize the DeepSpeed Transformer Config.

        Arguments:
            hidden_size: The hidden size of the transformer layer
            intermediate_size: The intermediate size of the feed-forward part of the transformer layer
            heads: The number of heads in the self-attention of the transformer layer
            num_hidden_layers: The number of transformer layers
            layer_norm_eps: The epsilon value for the layer norm
            local_rank: Optional: The rank of the GPU running the transformer kernel. It is not required
                if the model has already set the current device; otherwise, set it so that the transformer
                kernel runs on the right device.
            mp_size (optional): This argument is mainly used to create the parameters on the kernel side
                using the model-parallel architecture. If the client model already takes care of this,
                there is no need to pass this argument.
            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture.
            stochastic_mode: Enable for higher performance. Note that this flag introduces some
                non-determinism and can produce different results on different runs. However, we have seen
                that pretraining tasks such as BERT are not affected by it and still reach a high accuracy
                level. For downstream tasks, such as fine-tuning, we recommend turning it off so that the
                same result can be reproduced through the regular kernel execution.
            scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation.
            return_tuple: If True, returns the transformer output as a tuple; otherwise returns it as a tensor.
            bigscience_bloom: This flag is added temporarily to support the BLOOM-176B model architecture.
            use_triton: Whether to enable Triton kernels for inference.
    """
    def __init__(self,
                 hidden_size=-1,
                 intermediate_size=-1,
                 heads=-1,
                 num_hidden_layers=-1,
                 layer_norm_eps=1e-12,
                 local_rank=-1,
                 mp_size=1,
                 dtype=torch.float16,
                 pre_layer_norm=True,
                 norm_type=NormType.LayerNorm,
                 stochastic_mode=False,
                 scale_attention=True,
                 triangular_masking=True,
                 local_attention=False,
                 window_size=256,
                 rotary_dim=-1,
                 rotate_half=False,
                 rotate_every_two=True,
                 return_tuple=True,
                 mlp_after_attn=True,
                 mlp_act_func_type=ActivationFuncType.GELU,
                 training_mp_size=1,
                 bigscience_bloom=False,
                 max_out_tokens=1024,
                 min_out_tokens=1,
                 enable_qkv_quantization=False,
                 use_mup=False,
                 scale_attn_by_inverse_layer_idx=False,
                 return_single_tuple=False,
                 set_empty_params=False,
                 transposed_mode=False,
                 use_triton=False,
                 triton_autotune=False):
        super(DeepSpeedInferenceConfig,
              self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads,
                             num_hidden_layers)
        self.dtype = dtype
        self.pre_layer_norm = pre_layer_norm
        self.norm_type = norm_type
        self.local_rank = local_rank
        self.stochastic_mode = stochastic_mode
        self.epsilon = layer_norm_eps
        self.mp_size = mp_size
        self.scale_attention = scale_attention
        self.triangular_masking = triangular_masking
        self.local_attention = local_attention
        self.window_size = window_size
        self.rotary_dim = rotary_dim
        self.rotate_half = rotate_half
        self.rotate_every_two = rotate_every_two
        self.return_tuple = return_tuple
        self.mlp_after_attn = mlp_after_attn
        self.mlp_act_func_type = mlp_act_func_type
        self.specialized_mode = False
        self.training_mp_size = training_mp_size
        self.bigscience_bloom = bigscience_bloom
        self.max_out_tokens = max_out_tokens
        self.min_out_tokens = min_out_tokens
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.enable_qkv_quantization = enable_qkv_quantization
        self.use_mup = use_mup
        self.return_single_tuple = return_single_tuple
        self.set_empty_params = set_empty_params
        self.transposed_mode = transposed_mode
        self.use_triton = use_triton
        self.triton_autotune = triton_autotune
    @classmethod
    def from_dict(cls, json_object):
        # Start from the default configuration and overwrite it with the provided fields.
        config = DeepSpeedInferenceConfig()
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        # Read a JSON config file from disk and build the config from the parsed dict.
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))
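

# Minimal usage sketch (illustrative only, with placeholder values rather than recommended
# settings): a config can be built directly from keyword arguments, or populated from a
# dict / JSON file via the classmethods above.
if __name__ == "__main__":
    # Direct construction; intermediate_size falls back to 4 * hidden_size when not given.
    cfg = DeepSpeedInferenceConfig(hidden_size=1024, heads=16, num_hidden_layers=24)
    assert cfg.intermediate_size == 4 * cfg.hidden_size

    # Construction from a plain dict, e.g. the result of json.loads on a config file;
    # any keys not present keep their default values.
    cfg_from_dict = DeepSpeedInferenceConfig.from_dict({
        "hidden_size": 2048,
        "heads": 32,
        "max_out_tokens": 2048,
    })
    print(cfg_from_dict.hidden_size, cfg_from_dict.max_out_tokens)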