test_he_llama.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os
import torch
import pytest
import deepspeed
from deepspeed.ops.op_builder import OpBuilder
from unit.common import DistributedTest
from deepspeed.accelerator import get_accelerator

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)

rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
    pytest.skip("skip inference tests on rocm for now", allow_module_level=True)


@pytest.mark.seq_inference
@pytest.mark.parametrize("batch_size", [1, 2], ids=["bsz=1", "bsz=2"])
@pytest.mark.parametrize("model_name", ["huggyllama/llama-7b"])
class TestHybridEngineLlama(DistributedTest):
    world_size = 1

    def _generate(self, model, tokenizer, prompt):
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        tokens = tokenizer.batch_encode_plus(prompt, return_tensors="pt", padding=True)
        for t in tokens:
            if torch.is_tensor(tokens[t]):
                tokens[t] = tokens[t].to(f'{get_accelerator().device_name()}:{local_rank}')
        # Greedy decoding (do_sample=False) keeps outputs deterministic and comparable across runs.
        #output = model.generate(**tokens, do_sample=False, max_length=100)
        output = model.generate(tokens.input_ids, do_sample=False, max_length=100)
        outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
        return outputs

    def get_model(self, model_name):
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        model_config = AutoConfig.from_pretrained(model_name)
        model_config.dropout = 0.0
        model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config)
        # Make the model smaller so we can run it on a single GPU in CI
        _ = [model.model.layers.pop(-1) for _ in range(8)]
        model = model.half()
        model = model.to(f'{get_accelerator().device_name()}:{local_rank}')
        return model

    def get_tokenizer(self, model_name):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

    def get_prompt(self, batch_size):
        if batch_size == 1:
            prompt = ["Microsoft is in Washington"]
        elif batch_size == 2:
            prompt = ["DeepSpeed is", "Microsoft is in Washington"]
        else:
            raise NotImplementedError(f"batch_size {batch_size} not implemented")
        return prompt

    def test_correctness(self, batch_size, model_name):
        # Compare baseline HF generation against hybrid-engine generation,
        # including after a train()/eval() round-trip.
        pytest.skip("skip test for now, will fix in follow-up PR")
        model = self.get_model(model_name)
        tokenizer = self.get_tokenizer(model_name)
        prompt = self.get_prompt(batch_size)
        base_out = self._generate(model, tokenizer, prompt)

        ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}}
        model, *_ = deepspeed.initialize(model=model, config=ds_config)

        model.eval()
        ds1_out = self._generate(model, tokenizer, prompt)
        assert base_out == ds1_out, f"base_out: {base_out}, ds1_out: {ds1_out}"

        model.train()
        model.eval()
        ds2_out = self._generate(model, tokenizer, prompt)
        assert base_out == ds2_out

    def test_functionality(self, batch_size, model_name):
        # Check that generation stays consistent across a train()/eval() mode
        # switch when running under the hybrid engine.
        model = self.get_model(model_name)
        tokenizer = self.get_tokenizer(model_name)
        prompt = self.get_prompt(batch_size)

        ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}}
        model, *_ = deepspeed.initialize(model=model, config=ds_config)

        model.eval()
        ds1_out = self._generate(model, tokenizer, prompt)

        model.train()
        model.eval()
        ds2_out = self._generate(model, tokenizer, prompt)

        assert ds1_out == ds2_out, f"ds1_out: {ds1_out}, ds2_out: {ds2_out}"
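

# Usage note (an assumption about local setup, not part of the original file):
# these tests are collected by pytest from the DeepSpeed unit-test tree, e.g.
#
#   pytest -m seq_inference test_he_llama.py
#
# DistributedTest spawns `world_size` worker processes per test, so an
# accelerator visible to the local rank is required, plus enough memory to
# hold the truncated fp16 llama-7b checkpoint.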