# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os
import json
import argparse
import torch

from deepspeed.pipe import PipelineModule, LayerSpec
from deepspeed.moe.layer import MoE
from deepspeed.accelerator import get_accelerator
import deepspeed.comm as dist


class SimpleModel(torch.nn.Module):

    def __init__(self, hidden_dim, empty_grad=False, nlayers=1):
        super(SimpleModel, self).__init__()
        self.linears = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for _ in range(nlayers)])
        if empty_grad:
            self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
        self.empty_grad = empty_grad

    def forward(self, x, y):
        if len(self.linears) == 1:
            x = self.linears[0](x)
        else:
            # Mix each layer's output with an earlier layer's, so some modules
            # are applied more than once per forward pass.
            for i, l in enumerate(self.linears):
                x = self.linears[i // 2](x) + l(x)
        return self.cross_entropy_loss(x, y)
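

# Hedged usage sketch (the helper name and dimensions below are illustrative,
# not part of the test suite): SimpleModel.forward returns a scalar loss, so a
# test loop can call backward() directly on its return value.
def _example_simple_model_loss():
    model = SimpleModel(hidden_dim=8, nlayers=2)
    x = torch.randn(4, 8)           # (batch, hidden_dim)
    y = torch.randint(0, 8, (4,))   # class labels in [0, hidden_dim)
    loss = model(x, y)
    loss.backward()
    return loss.item()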


class SimpleFrozenModel(torch.nn.Module):

    def __init__(self, hidden_dim, empty_grad=False):
        super(SimpleFrozenModel, self).__init__()
        self.linears = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for _ in range(2)])
        if empty_grad:
            self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
        self.empty_grad = empty_grad
        # Freeze the first layer, so tests can mix trainable and frozen parameters.
        self.linears[0].weight.requires_grad = False
        self.linears[0].bias.requires_grad = False

    def forward(self, x, y):
        if len(self.linears) == 1:
            x = self.linears[0](x)
        else:
            for i, l in enumerate(self.linears):
                x = self.linears[i // 2](x) + l(x)
        return self.cross_entropy_loss(x, y)
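

# Hedged sketch (illustrative helper name): only the second linear layer of
# SimpleFrozenModel should report trainable parameters, since the first layer
# was frozen in __init__.
def _example_frozen_parameters():
    model = SimpleFrozenModel(hidden_dim=8)
    # Expected: ['linears.1.weight', 'linears.1.bias']
    return [name for name, p in model.named_parameters() if p.requires_grad]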


class Curriculum_SimpleModel(SimpleModel):

    def __init__(self, hidden_dim, empty_grad=False):
        super(Curriculum_SimpleModel, self).__init__(hidden_dim, empty_grad)

    def forward(self, x, y, **kwargs):
        seqlen = kwargs.get('curriculum_seqlen', None)
        loss = super(Curriculum_SimpleModel, self).forward(x, y)
        return loss, seqlen


class SimpleMoEModel(torch.nn.Module):

    def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False):
        super(SimpleMoEModel, self).__init__()
        self.linear1 = torch.nn.Linear(hidden_dim, hidden_dim)
        expert = torch.nn.Sequential(torch.nn.Linear(hidden_dim, hidden_dim),
                                     torch.nn.Linear(hidden_dim, hidden_dim))
        # Use two MoE layers built from the same expert module to check the
        # implications of sharing a single storage.
        self.moe_1 = MoE(hidden_size=hidden_dim,
                         expert=expert,
                         ep_size=ep_size,
                         use_residual=use_residual,
                         num_experts=num_experts,
                         k=1)
        # Interleave MoE modules with dense layers to create an opportunity for
        # gradients to be merged in the ZeRO stage 2 average_tensor reduce bucket.
        self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.moe_2 = MoE(hidden_size=hidden_dim,
                         expert=expert,
                         ep_size=ep_size,
                         use_residual=use_residual,
                         num_experts=num_experts,
                         k=1)
        self.linear3 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, y):
        hidden = self.linear1(x)
        output, _, _ = self.moe_1(hidden)
        output = self.linear2(output)
        output, _, _ = self.moe_2(output)
        output = self.linear3(output)
        hidden = hidden + output          # residual connection around the MoE stack
        sentence_embed = hidden.mean(1)   # pool over the sequence dimension
        return self.cross_entropy_loss(sentence_embed, y)
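

# Hedged sketch of driving SimpleMoEModel. The MoE layers create
# expert-parallel process groups, so this assumes an initialized distributed
# environment (e.g. a single process with MASTER_ADDR/MASTER_PORT set, or a run
# under the deepspeed launcher); the helper name and sizes are illustrative.
def _example_moe_forward():
    import deepspeed
    deepspeed.init_distributed()
    model = SimpleMoEModel(hidden_dim=16, num_experts=2, ep_size=1)
    x = torch.randn(4, 8, 16)       # (batch, seq_len, hidden); forward pools over seq_len
    y = torch.randint(0, 16, (4,))  # one label per sequence
    return model(x, y)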


class SimplePRMoEModel(torch.nn.Module):

    def __init__(self, hidden_dim, num_experts=2, ep_size=1, use_residual=False):
        super(SimplePRMoEModel, self).__init__()
        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
        linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = MoE(hidden_size=hidden_dim,
                           expert=linear2,
                           ep_size=ep_size,
                           use_residual=use_residual,
                           num_experts=num_experts,
                           k=1)
        linear3 = torch.nn.Linear(hidden_dim, hidden_dim)
        # Pyramid structure: the deeper MoE layer has twice as many experts.
        self.linear3 = MoE(hidden_size=hidden_dim,
                           expert=linear3,
                           ep_size=ep_size,
                           use_residual=use_residual,
                           num_experts=int(2 * num_experts),
                           k=1)
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, y):
        hidden = self.linear(x)
        output, _, _ = self.linear2(hidden)
        output, _, _ = self.linear3(output)
        hidden = hidden + output          # residual connection
        sentence_embed = hidden.mean(1)   # pool over the sequence dimension
        return self.cross_entropy_loss(sentence_embed, y)


class UnusedParametersModel(SimpleModel):

    def __init__(self, hidden_dim, empty_grad=False):
        super().__init__(hidden_dim, empty_grad)
        # Deliberately never used in forward(), to exercise unused-parameter handling.
        self.unused_linear = torch.nn.Linear(hidden_dim, hidden_dim)


class LinearStack(torch.nn.Module):

    def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.input_layer = torch.nn.Linear(in_features=self.input_dim, out_features=self.hidden_dim)
        self.layers = torch.nn.ModuleList([
            torch.nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim, bias=False)
            for _ in range(num_layers)
        ])
        self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, out_features=self.output_dim)
        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, y):
        x = self.input_layer(x)
        for layer in self.layers:
            x = layer(x)
        x = self.output_layer(x)
        # Note: returns logits rather than a loss; cross_entropy_loss is kept
        # for parity with the other test models but is not applied here.
        return x


class LinearStackPipe(PipelineModule):

    def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4, **kwargs):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        layers = []
        layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim))
        for _ in range(self.num_layers):
            layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.hidden_dim, bias=False))
        layers.append(lambda x: x)  # identity stage, to test non-module callables in the pipeline
        layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim))

        super().__init__(layers=layers, loss_fn=torch.nn.CrossEntropyLoss(), **kwargs)
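

# Hedged sketch (values are illustrative): PipelineModule partitions the
# LayerSpec list across pipeline stages at construction time, which requires an
# initialized distributed backend; with a single process, num_stages=1 keeps
# every layer local.
def _example_pipe_module():
    import deepspeed
    deepspeed.init_distributed()
    return LinearStackPipe(num_layers=4, num_stages=1)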


class SimpleOptimizer(torch.optim.Optimizer):

    def __init__(self, params, lr=0.11072018):
        defaults = dict(lr=lr)
        super(SimpleOptimizer, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SimpleOptimizer, self).__setstate__(state)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                # Plain SGD update; add_(Tensor, alpha=...) replaces the
                # deprecated add_(Number, Tensor) overload.
                p.data.add_(d_p, alpha=-group['lr'])

        return loss
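

# Hedged sketch: SimpleOptimizer is plain SGD, so a single step moves each
# parameter by -lr * grad. The helper name and values are illustrative.
def _example_simple_optimizer_step():
    w = torch.nn.Parameter(torch.ones(3))
    opt = SimpleOptimizer([w], lr=0.1)
    w.sum().backward()  # grad of the sum is all ones
    opt.step()          # each element becomes 1 - 0.1 * 1 = 0.9
    return w.detach()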


class HybridStateOptimizer(torch.optim.Optimizer):

    def __init__(self, params, lr=0.11072018):
        defaults = dict(lr=lr)
        super(HybridStateOptimizer, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(HybridStateOptimizer, self).__setstate__(state)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                # Keep both a Python int and a tensor in the per-parameter
                # state, to exercise mixed optimizer-state handling.
                state = self.state[p]
                if len(state) == 0:
                    state['integer_step'] = 0
                    state['tensor_step'] = torch.zeros(1, device=p.device)

                d_p = p.grad.data
                p.data.add_(d_p, alpha=-group['lr'])
                state['integer_step'] += 1
                state['tensor_step'] += 1

        return loss
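

# Hedged sketch (illustrative helper name): after one step, each parameter's
# state holds both a Python int and a tensor, which is exactly the mix that
# optimizer-state save/load tests need to round-trip.
def _example_hybrid_state():
    w = torch.nn.Parameter(torch.ones(2))
    opt = HybridStateOptimizer([w], lr=0.1)
    w.sum().backward()
    opt.step()
    state = opt.state[w]
    return state['integer_step'], state['tensor_step']  # (1, tensor([1.]))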


class PLD_SimpleModel(SimpleModel):

    def __init__(self, hidden_dim, empty_grad=False):
        super(PLD_SimpleModel, self).__init__(hidden_dim, empty_grad)

    def forward(self, x, y, **kwargs):
        # Progressive layer drop arguments are accepted (and currently unused),
        # so the engine can pass them through during PLD tests.
        pld = kwargs.get('progressive_layer_drop', False)
        theta = kwargs.get('pld_theta', 1.0)
        loss = super(PLD_SimpleModel, self).forward(x, y)
        return loss


def random_dataset(total_samples, hidden_dim, device, dtype=torch.half):
    train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype)
    train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim)
    train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
    return train_dataset


def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half):
    batch_size = model.train_micro_batch_size_per_gpu()
    train_dataset = random_dataset(total_samples, hidden_dim, device, dtype=dtype)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    return train_loader
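

# Hedged sketch: random_dataloader needs a DeepSpeed engine (it queries
# train_micro_batch_size_per_gpu()), but random_dataset composes with a plain
# DataLoader for engine-free tests. The helper name and values are illustrative.
def _example_random_batches():
    dataset = random_dataset(total_samples=16, hidden_dim=8, device='cpu', dtype=torch.float)
    loader = torch.utils.data.DataLoader(dataset, batch_size=4)
    return next(iter(loader))  # [data of shape (4, 8), labels of shape (4,)]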


def sequence_dataloader(model, total_samples, hidden_dim, device, seq_len: int = 32, dtype=torch.half):
    batch_size = model.train_micro_batch_size_per_gpu()
    train_data = torch.randn(total_samples, seq_len, hidden_dim, device=device, dtype=dtype)
    train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim)
    train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    return train_loader


def create_config_from_dict(tmpdir, config_dict):
    config_path = os.path.join(tmpdir, 'temp_config.json')
    with open(config_path, 'w') as fd:
        json.dump(config_dict, fd)
    return config_path


def create_deepspeed_args():
    parser = argparse.ArgumentParser()
    args = parser.parse_args(args='')
    args.deepspeed = True
    if dist.is_initialized():
        # We assume up to one full node executing unit tests
        assert dist.get_world_size() <= get_accelerator().device_count()
        args.local_rank = dist.get_rank()
    return args


def args_from_dict(tmpdir, config_dict):
    args = create_deepspeed_args()
    config_path = create_config_from_dict(tmpdir, config_dict)
    args.deepspeed_config = config_path
    return args
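

# Hedged end-to-end sketch of how these helpers typically combine in a unit
# test: write a temp JSON config, build args, then hand both to
# deepspeed.initialize. The config values and function name are illustrative
# assumptions, not fixtures defined in this file.
def _example_test_setup(tmpdir):
    import deepspeed
    config_dict = {
        "train_micro_batch_size_per_gpu": 2,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 1e-3
            }
        }
    }
    args = args_from_dict(tmpdir, config_dict)
    model = SimpleModel(hidden_dim=8)
    engine, _, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters())
    data_loader = random_dataloader(engine, total_samples=8, hidden_dim=8, device=engine.device, dtype=torch.float)
    for batch in data_loader:
        loss = engine(batch[0], batch[1])
        engine.backward(loss)
        engine.step()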