'''Copyright The Microsoft DeepSpeed Team'''

from .base import *
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from ..policy import TransformerPolicy


class DS_GPT2Container(BaseTransformerContainer):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module


class HFGPT2LayerPolicy(TransformerPolicy):
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # HuggingFace GPT2 uses convolutional layer instead of linear layer
        super().__init__(inference, linear_layer=False)
        self.client_module = client_module
        try:
            import transformers
            HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
        except:
            HFGPT2LayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        return self.client_module.attn.embed_dim, \
               self.client_module.attn.num_heads

    def attention(self):
        return self.client_module.attn.c_attn.weight, \
               self.client_module.attn.c_attn.bias, \
               self.client_module.attn.c_proj.weight, \
               self.client_module.attn.c_proj.bias

    def mlp(self):
        return self.client_module.mlp.c_fc.weight, \
               self.client_module.mlp.c_fc.bias, \
               self.client_module.mlp.c_proj.weight, \
               self.client_module.mlp.c_proj.bias

    def layernorm(self):
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
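

# Minimal usage sketch, assuming the HuggingFace `transformers` package is
# installed and the "gpt2" checkpoint is reachable (both are assumptions, not
# requirements of this module). It only illustrates what the policy accessors
# above return for a single GPT2Block; in normal use the policy is consumed by
# DeepSpeed's kernel-injection machinery rather than called directly.
if __name__ == "__main__":
    from transformers import GPT2Model

    model = GPT2Model.from_pretrained("gpt2")
    policy = HFGPT2LayerPolicy(model.h[0])

    hidden_size, num_heads = policy.get_hidden_heads()
    qkvw, qkvb, outw, outb = policy.attention()

    # GPT2's c_attn is a Conv1D, so the fused QKV weight is stored untransposed
    # as [hidden, 3 * hidden] -- the reason __init__ passes linear_layer=False
    # to TransformerPolicy.
    assert qkvw.shape == (hidden_size, 3 * hidden_size)
    print(f"hidden_size={hidden_size}, num_heads={num_heads}")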