test_he_llama.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os
import torch
import pytest
import deepspeed
from deepspeed.ops.op_builder import OpBuilder
from unit.common import DistributedTest
from deepspeed.accelerator import get_accelerator

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)

from deepspeed.ops.op_builder import InferenceBuilder

if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
    pytest.skip("skip inference tests on rocm for now", allow_module_level=True)
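
# These tests exercise DeepSpeed's Hybrid Engine on a truncated Llama model: a
# Hugging Face causal LM is wrapped with deepspeed.initialize() and its greedy
# generation output is compared across eval()/train() mode switches.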

@pytest.mark.seq_inference
@pytest.mark.parametrize("batch_size", [1, 2], ids=["bsz=1", "bsz=2"])
@pytest.mark.parametrize("model_name", ["huggyllama/llama-7b"])
class TestHybridEngineLlama(DistributedTest):
    world_size = 1

    def _generate(self, model, tokenizer, prompt):
        # Tokenize the prompt batch, move the tensors to the local accelerator
        # device, and decode the greedy generation output back to text.
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        tokens = tokenizer.batch_encode_plus(prompt, return_tensors="pt", padding=True)
        for t in tokens:
            if torch.is_tensor(tokens[t]):
                tokens[t] = tokens[t].to(f'{get_accelerator().device_name()}:{local_rank}')
        #output = model.generate(**tokens, do_sample=False, max_length=100)
        output = model.generate(tokens.input_ids, do_sample=False, max_length=100)
        outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
        return outputs

    def get_model(self, model_name):
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        model_config = AutoConfig.from_pretrained(model_name)
        model_config.dropout = 0.0
        model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config)
        # Make the model smaller so we can run it on a single GPU in CI
        _ = [model.model.layers.pop(-1) for _ in range(8)]
        model = model.half()
        model = model.to(f'{get_accelerator().device_name()}:{local_rank}')
        return model

    def get_tokenizer(self, model_name):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The Llama tokenizer ships without a pad token; reuse EOS for padded batches.
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

    def get_prompt(self, batch_size):
        if batch_size == 1:
            prompt = ["Microsoft is in Washington"]
        elif batch_size == 2:
            prompt = ["DeepSpeed is", "Microsoft is in Washington"]
        else:
            raise NotImplementedError(f"batch_size {batch_size} not implemented")
        return prompt

    def test_correctness(self, batch_size, model_name):
        # Compare Hybrid Engine generation against the unwrapped HF baseline,
        # both right after deepspeed.initialize() and after a train()/eval() toggle.
        pytest.skip("skip test for now, will fix in follow-up PR")
        model = self.get_model(model_name)
        tokenizer = self.get_tokenizer(model_name)
        prompt = self.get_prompt(batch_size)
        base_out = self._generate(model, tokenizer, prompt)
        ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}}
        model, *_ = deepspeed.initialize(model=model, config=ds_config)
        model.eval()
        ds1_out = self._generate(model, tokenizer, prompt)
        assert base_out == ds1_out, f"base_out: {base_out}, ds1_out: {ds1_out}"
        model.train()
        model.eval()
        ds2_out = self._generate(model, tokenizer, prompt)
        assert base_out == ds2_out

    def test_functionality(self, batch_size, model_name):
        # Only check that generation stays stable across a train()/eval() round
        # trip under the Hybrid Engine; no comparison against the HF baseline.
        model = self.get_model(model_name)
        tokenizer = self.get_tokenizer(model_name)
        prompt = self.get_prompt(batch_size)
        ds_config = {"train_batch_size": 1, "fp16": {"enabled": True}, "hybrid_engine": {"enabled": True}}
        model, *_ = deepspeed.initialize(model=model, config=ds_config)
        model.eval()
        ds1_out = self._generate(model, tokenizer, prompt)
        model.train()
        model.eval()
        ds2_out = self._generate(model, tokenizer, prompt)
        assert ds1_out == ds2_out, f"ds1_out: {ds1_out}, ds2_out: {ds2_out}"
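
# Assuming the standard DeepSpeed unit-test layout, this module can be run on
# its own with something like `pytest -m seq_inference test_he_llama.py`; the
# DistributedTest base class takes care of launching the test in a distributed
# context (here a single rank, world_size = 1).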