'''Copyright The Microsoft DeepSpeed Team'''

from .base import *
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from ..policy import TransformerPolicy


class DS_GPT2Container(BaseTransformerContainer):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module


class HFGPT2LayerPolicy(TransformerPolicy):
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # HuggingFace GPT2 uses a convolutional (Conv1D) layer instead of a linear layer
        super().__init__(inference, linear_layer=False)
        self.client_module = client_module
        try:
            import transformers
            HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
        except (ImportError, AttributeError):
            # transformers is not installed or is too old to provide GPT2Block
            HFGPT2LayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        return self.client_module.attn.embed_dim, \
               self.client_module.attn.num_heads

    def attention(self):
        return self.client_module.attn.c_attn.weight, \
               self.client_module.attn.c_attn.bias, \
               self.client_module.attn.c_proj.weight, \
               self.client_module.attn.c_proj.bias

    def mlp(self):
        return self.client_module.mlp.c_fc.weight, \
               self.client_module.mlp.c_fc.bias, \
               self.client_module.mlp.c_proj.weight, \
               self.client_module.mlp.c_proj.bias

    def layernorm(self):
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
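

# Illustrative usage sketch (assumes the `transformers` package is installed; not
# part of the DeepSpeed injection path itself): the policy simply exposes the raw
# GPT2Block parameters that DS_GPT2Container later copies into DeepSpeedGPTInference.
if __name__ == "__main__":
    from transformers import GPT2Config
    from transformers.models.gpt2.modeling_gpt2 import GPT2Block

    block = GPT2Block(GPT2Config())
    policy = HFGPT2LayerPolicy(block, inference=True)

    hidden_size, num_heads = policy.get_hidden_heads()
    qkv_w, qkv_b, attn_out_w, attn_out_b = policy.attention()
    print(f"hidden_size={hidden_size}, num_heads={num_heads}, "
          f"qkv weight shape={tuple(qkv_w.shape)}")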