- import torch
- import deepspeed.comm as dist
- import deepspeed
- import pytest
- from deepspeed.ops.adam import FusedAdam
- from .common import distributed_test
- from deepspeed.ops.op_builder import CPUAdamBuilder
- from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args, SimpleMoEModel, sequence_dataloader
- from .util import required_torch_version
- try:
- from apex import amp # noqa: F401
- _amp_available = True
- except ImportError:
- _amp_available = False
- amp_available = pytest.mark.skipif(not _amp_available,
- reason="apex/amp is not installed")
- def test_lamb_fp32_grad_clip(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Lamb",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1, 2])
- def _test_lamb_fp32_grad_clip(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device,
- dtype=torch.float)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim)
- def test_lamb_fp16_basic(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Lamb",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0,
- "fp16": {
- "enabled": True
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1, 2])
- def _test_lamb_fp16_basic(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
- def test_lamb_fp16_empty_grad(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Lamb",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0,
- "fp16": {
- "enabled": True
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim, empty_grad=True)
- @distributed_test(world_size=[2])
- def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
- def test_adam_fp32_empty_grad(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0,
- "fp16": {
- "enabled": False
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim, empty_grad=True)
- @distributed_test(world_size=[2])
- def _test_adam_fp32_empty_grad(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device,
- dtype=torch.float)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adam_fp32_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
- def test_adamw_fp16_basic(tmpdir):
- config_dict = {
- "train_batch_size": 1,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1])
- def _test_adamw_fp16_basic(args, model, hidden_dim):
- optimizer = torch.optim.AdamW(params=model.parameters())
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer)
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
- def test_unfused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch):
- if not required_torch_version():
- pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
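- # Replacement for the optimizer's unscale_and_clip_grads: it all-gathers the computed gradient norm and asserts every rank arrived at the same value (MoE grad norms must agree across ranks).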
- def mock_unscale_and_clip_grads(total_norm, apply_scale=True):
- torch_norm_tensor = torch.cuda.FloatTensor([total_norm])
- all_gather_results = [
- torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())
- ]
- dist.all_gather(all_gather_results, torch_norm_tensor)
- assert len(set([x.item() for x in all_gather_results])) == 1
- return 1.0
- @distributed_test(world_size=[2])
- def _test_unfused_fp16_optimizer(args, hidden_dim):
- # initialize MoE
- model = SimpleMoEModel(hidden_dim, ep_size=2)
- optimizer = torch.optim.AdamW(params=model.parameters())
- engine, optimizer, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer,
- dist_init_required=False)
- monkeypatch.setattr(optimizer,
- 'unscale_and_clip_grads',
- mock_unscale_and_clip_grads)
- data_loader = sequence_dataloader(model=engine,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=engine.device)
- for n, batch in enumerate(data_loader):
- loss = engine(batch[0], batch[1])
- engine.backward(loss)
- engine.step()
- _test_unfused_fp16_optimizer(args=args, hidden_dim=hidden_dim)
- def test_fused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch):
- if not required_torch_version():
- pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
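- # Same gradient-norm consistency check as above, with the extra grads_groups_flat argument that the fused fp16 optimizer passes along.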
- def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True):
- torch_norm_tensor = torch.cuda.FloatTensor([total_norm])
- all_gather_results = [
- torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())
- ]
- dist.all_gather(all_gather_results, torch_norm_tensor)
- assert len(set([x.item() for x in all_gather_results])) == 1
- return 1.0
- @distributed_test(world_size=[2])
- def _test_fused_fp16_optimizer(args, hidden_dim):
- # initialize MoE
- model = SimpleMoEModel(hidden_dim, ep_size=2)
- # optimizer = torch.optim.AdamW(params=model.parameters())
- optimizer = FusedAdam(params=model.parameters())
- engine, optimizer, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer,
- dist_init_required=False)
- monkeypatch.setattr(optimizer,
- 'unscale_and_clip_grads',
- mock_unscale_and_clip_grads)
- data_loader = sequence_dataloader(model=engine,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=engine.device)
- for n, batch in enumerate(data_loader):
- loss = engine(batch[0], batch[1])
- engine.backward(loss)
- engine.step()
- _test_fused_fp16_optimizer(args=args, hidden_dim=hidden_dim)
- @pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)])
- def test_lamb_optimizer_gradnorm_for_moe(tmpdir, monkeypatch, fused_lamb_legacy: bool):
- if not required_torch_version():
- pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True
- },
- "optimizer": {
- "type": "Lamb",
- "params": {
- "lr": 0.00015
- }
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
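- # Gradient-norm consistency check again, this time for the Lamb optimizer built from the config; the parametrization exercises both the legacy fused path and the default path.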
- def mock_unscale_and_clip_grads(total_norm, apply_scale=True):
- torch_norm_tensor = torch.cuda.FloatTensor([total_norm])
- all_gather_results = [
- torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())
- ]
- dist.all_gather(all_gather_results, torch_norm_tensor)
- assert len(set([x.item() for x in all_gather_results])) == 1
- return 1.0
- @distributed_test(world_size=[2])
- def _test_lamb_legacy_optimizer_step(args, hidden_dim, fused_lamb_legacy):
- # initialize MoE
- model = SimpleMoEModel(hidden_dim, ep_size=2)
- engine, optimizer, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters(),
- dist_init_required=False)
- monkeypatch.setattr(optimizer,
- 'unscale_and_clip_grads',
- mock_unscale_and_clip_grads)
- optimizer.fused_lamb_legacy = fused_lamb_legacy
- data_loader = sequence_dataloader(model=engine,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=engine.device)
- for n, batch in enumerate(data_loader):
- loss = engine(batch[0], batch[1])
- engine.backward(loss)
- engine.step()
- _test_lamb_legacy_optimizer_step(args=args,
- hidden_dim=hidden_dim,
- fused_lamb_legacy=fused_lamb_legacy)
- def test_dict_config_adamw_fp16_basic():
- config = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}}
- args = create_deepspeed_args()
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
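- # Same as test_adamw_fp16_basic, but the config is passed to deepspeed.initialize as a dict instead of a JSON file referenced via args.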
- @distributed_test(world_size=[1])
- def _test_adamw_fp16_basic(args, model, hidden_dim, config):
- optimizer = torch.optim.AdamW(params=model.parameters())
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer,
- config=config)
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim, config=config)
- def test_adamw_fp16_empty_grad(tmpdir):
- config_dict = {
- "train_batch_size": 1,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1])
- def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
- optimizer = torch.optim.AdamW(params=model.parameters())
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer)
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
- @pytest.mark.parametrize('zero_stage, use_cpu_offload',
- [(1, False), (2, False), (2, True), (3, False), (3, True)])
- def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
- if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
- pytest.skip("cpu-adam is not compatible")
- config_dict = {
- "train_batch_size": 1,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "scheduler": {
- "type": "OneCycle",
- "params": {
- "cycle_first_step_size": 16000,
- "cycle_first_stair_count": 8000,
- "decay_step_size": 16000,
- "cycle_min_lr": 1e-06,
- "cycle_max_lr": 3e-05,
- "decay_lr_rate": 1e-07,
- "cycle_min_mom": 0.85,
- "cycle_max_mom": 0.99,
- "decay_mom_rate": 0.0
- }
- },
- "fp16": {
- "enabled": True
- },
- "zero_optimization": {
- "stage": zero_stage,
- "cpu_offload": use_cpu_offload
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- @distributed_test(world_size=[1])
- def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim):
- model = SimpleModel(hidden_dim)
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adam_fp16_zero_onecycle_compatibility(args=args,
- zero_stage=zero_stage,
- hidden_dim=hidden_dim)
- @pytest.mark.parametrize('zero_stage, use_cpu_offload',
- [(1, False), (2, False), (2, True), (3, False), (3, True)])
- def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
- if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
- pytest.skip("cpu-adam is not compatible")
- config_dict = {
- "train_batch_size": 4,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "fp16": {
- "enabled": True,
- "loss_scale": 138.
- },
- "zero_optimization": {
- "stage": zero_stage,
- "cpu_offload": use_cpu_offload
- }
- }
- args = args_from_dict(tmpdir, config_dict)
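- # With a fixed fp16 loss_scale in the config, the engine should use static (not dynamic) loss scaling; the asserts below verify the configured scale of 138 is picked up.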
- @distributed_test(world_size=2)
- def _test_zero_static_scale(args, zero_stage, hidden_dim):
- # hidden_dim is passed in so that both DP-aligned and unaligned sizes are covered (see the calls below).
- model = SimpleModel(hidden_dim)
- model, optim, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- # Ensure the static scaler is configured.
- assert optim.dynamic_loss_scale == False
- assert optim.loss_scaler.loss_scale == 138.
- # Now run a few training steps to make sure things work.
- data_loader = random_dataloader(model=model,
- total_samples=10,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- # Test when hidden_dim is not aligned with the world size.
- _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9)
- # Test when hidden_dim is aligned with the world size.
- _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=10)
- def test_zero_static_scale_deprecated_format(tmpdir):
- config_dict = {
- "train_batch_size": 4,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "fp16": {
- "enabled": True,
- "loss_scale": 138.
- },
- "zero_optimization": {
- "stage": 1
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- @distributed_test(world_size=2)
- def _test_zero_static_scale(args):
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- model, optim, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- # Ensure the static scaler is configured.
- assert optim.dynamic_loss_scale == False
- assert optim.loss_scaler.loss_scale == 138.
- # Now run a few training steps to make sure things work.
- data_loader = random_dataloader(model=model,
- total_samples=10,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_zero_static_scale(args)
- @pytest.mark.parametrize('zero_stage, use_cpu_offload',
- [(1, False), (2, False), (2, True), (3, False), (3, True)])
- def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
- if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
- pytest.skip("cpu-adam is not compatible")
- config_dict = {
- "train_batch_size": 4,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True,
- },
- "zero_optimization": {
- "stage": zero_stage,
- "cpu_offload": use_cpu_offload
- },
- "zero_allow_untested_optimizer": False
- }
- args = args_from_dict(tmpdir, config_dict)
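- # SimpleOptimizer is not a ZeRO-supported optimizer and zero_allow_untested_optimizer is False, so initialization is expected to fail with an AssertionError.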
- @distributed_test(world_size=[1])
- def _test_zero_allow_untested_optimizer(args, zero_stage):
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- optimizer = SimpleOptimizer(model.parameters())
- with pytest.raises(AssertionError):
- model, optim, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer,
- model_parameters=model.parameters())
- _test_zero_allow_untested_optimizer(args, zero_stage)
- @pytest.mark.parametrize('zero_stage, use_cpu_offload',
- [(1, False), (2, False), (2, True), (3, False), (3, True)])
- def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
- if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
- pytest.skip("cpu-adam is not compatible")
- if zero_stage == 3:
- pytest.skip("skip for now")
- config_dict = {
- "train_micro_batch_size_per_gpu": 1,
- "gradient_accumulation_steps": 1,
- "fp16": {
- "enabled": True,
- "initial_scale_power": 8
- },
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "zero_optimization": {
- "stage": zero_stage,
- "cpu_offload": use_cpu_offload,
- "reduce_bucket_size": 100,
- "allgather_bucket_size": 100
- }
- }
- args = args_from_dict(tmpdir, config_dict)
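- # With 3 ranks and a model that has only 2 parameters, at least one rank ends up with an empty partition (see the assert below); tiny bucket sizes keep the reduce/allgather buckets small.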
- @distributed_test(world_size=[3])
- def _test_zero_empty_partition(args, zero_stage):
- hidden_dim = 1
- model = SimpleModel(hidden_dim)
- # Ensure model has 2 parameters, to cause empty partition with DP=3
- assert len(list(model.parameters())) == 2
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- # Now run a training step to make sure things work.
- data_loader = random_dataloader(model=model,
- total_samples=1,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_zero_empty_partition(args=args, zero_stage=zero_stage)
- @amp_available
- def test_adam_amp_basic(tmpdir):
- config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}}
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1])
- def _test_adam_amp_basic(args, model, hidden_dim):
- optimizer = torch.optim.Adam(params=model.parameters())
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer)
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
- @amp_available
- def test_lamb_amp_basic(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Lamb",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0,
- "amp": {
- "enabled": True,
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1, 2])
- def _test_lamb_amp_basic(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
- @amp_available
- def test_adam_amp_o2(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0,
- "amp": {
- "enabled": True,
- "opt_level": "O2"
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1, 2])
- def _test_adam_amp_o2(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim)
- @amp_available
- def test_adam_amp_o2_empty_grad(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0,
- "amp": {
- "enabled": True,
- "opt_level": "O2"
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[2])
- def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_adam_amp_o2_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
- @pytest.mark.parametrize('zero_stage, optimizer_constructor',
- [(1, FusedAdam), (2, torch.optim.Adam), (2, FusedAdam), (3, torch.optim.Adam), (3, FusedAdam)])
- def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True
- },
- "zero_optimization": {
- "stage": zero_stage
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
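- # No training loop here: the test only checks that deepspeed.initialize accepts a client-constructed optimizer (torch Adam or FusedAdam) under each ZeRO stage.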
- @distributed_test(world_size=[1])
- def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
- model = SimpleModel(hidden_dim)
- client_optimizer = optimizer_constructor(params=model.parameters())
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=client_optimizer)
- _test_zero_supported_client_optimizer(args=args,
- zero_stage=zero_stage,
- optimizer_constructor=optimizer_constructor)
- def test_zero2_reduce_scatter_off(tmpdir):
- config_dict = {
- "train_batch_size": 2,
- "steps_per_print": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "gradient_clipping": 1.0,
- "zero_optimization": {
- "stage": 2,
- "contiguous_gradients": True,
- "allgather_bucket_size": 2000000000,
- "reduce_bucket_size": 200000000,
- "overlap_comm": False,
- "reduce_scatter": False
- },
- "fp16": {
- "enabled": True
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[2])
- def _helper(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _helper(args=args, model=model, hidden_dim=hidden_dim)
- @pytest.mark.parametrize('adam_type, torch_impl',
- [('Adam', True), ('Adam', False), ('AdamW', True), ('AdamW', False)])
- def test_fp16_adam_types(tmpdir, adam_type, torch_impl):
- config_dict = {
- "train_batch_size": 1,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True,
- "initial_scale_power": 10
- },
- "optimizer": {
- "type": adam_type,
- "torch_adam": torch_impl,
- "params": {
- "lr": 0.00015
- }
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
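- # Parametrized over Adam and AdamW, with torch_adam toggling which implementation backs the configured optimizer.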
- @distributed_test(world_size=[1])
- def _test_fp16_adam_types(args, model, hidden_dim):
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=10,
- hidden_dim=hidden_dim,
- device=model.device)
- for _, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim)
- def test_zero3_lazyscatter(tmpdir):
- config_dict = {
- "train_batch_size": 1,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True,
- "initial_scale_power": 10
- },
- "optimizer": {
- "type": "AdamW",
- "params": {
- "lr": 0.00015
- }
- },
- "zero_optimization": {
- "stage": 3
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
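- # The model is constructed inside the distributed test so that ZeRO stage 3 can scatter (partition) its parameters at initialize time.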
- @distributed_test(world_size=[1])
- def _go(args):
- model = SimpleModel(hidden_dim)
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- model_parameters=model.parameters())
- data_loader = random_dataloader(model=model,
- total_samples=10,
- hidden_dim=hidden_dim,
- device=model.device)
- for _, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _go(args=args)
- @pytest.mark.parametrize('stage', [1, 2, 3])
- def test_zero_empty_grad(tmpdir, stage):
- config_dict = {
- "train_batch_size": 1,
- "steps_per_print": 1,
- "fp16": {
- "enabled": True
- },
- "zero_optimization": {
- "stage": stage
- }
- }
- args = args_from_dict(tmpdir, config_dict)
- hidden_dim = 10
- model = SimpleModel(hidden_dim)
- @distributed_test(world_size=[1])
- def _go(args, model, hidden_dim):
- optimizer = torch.optim.Adam(model.parameters())
- model, _, _, _ = deepspeed.initialize(args=args,
- model=model,
- optimizer=optimizer)
- data_loader = random_dataloader(model=model,
- total_samples=50,
- hidden_dim=hidden_dim,
- device=model.device)
- for n, batch in enumerate(data_loader):
- loss = model(batch[0], batch[1])
- model.backward(loss)
- model.step()
- _go(args=args, model=model, hidden_dim=hidden_dim)