test_model_profiling.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. # Copyright (c) Microsoft Corporation.
  2. # SPDX-License-Identifier: Apache-2.0
  3. # DeepSpeed Team
  4. import os
  5. import time
  6. import pytest
  7. import torch
  8. import deepspeed
  9. from transformers import pipeline
  10. from unit.common import DistributedTest
  11. from deepspeed.accelerator import get_accelerator
  12. from deepspeed.ops.op_builder import InferenceBuilder
  13. if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
  14. pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
  15. @pytest.mark.inference
  16. @pytest.mark.parametrize("use_cuda_events", [True, False])
  17. @pytest.mark.parametrize("enable_cuda_graph", [True, False])
  18. class TestModelProfiling(DistributedTest):
  19. world_size = 1
  20. def test(self, enable_cuda_graph, use_cuda_events):
  21. task = "fill-mask"
  22. model = "bert-base-cased"
  23. dtype = torch.float16
  24. query = "I am a [MASK] model"
  25. local_rank = int(os.getenv("LOCAL_RANK", "0"))
  26. world_size = int(os.getenv("WORLD_SIZE", "1"))
  27. pipe = pipeline(task, model, framework="pt", device=get_accelerator().device_name(local_rank))
  28. pipe.model = deepspeed.init_inference(pipe.model,
  29. dtype=dtype,
  30. mp_size=world_size,
  31. replace_with_kernel_inject=True,
  32. enable_cuda_graph=enable_cuda_graph)
  33. pipe.model.profile_model_time(use_cuda_events=use_cuda_events)
  34. e2e_times = []
  35. model_times = []
  36. for _ in range(10):
  37. get_accelerator().synchronize()
  38. start = time.perf_counter_ns()
  39. r = pipe(query)
  40. get_accelerator().synchronize()
  41. end = time.perf_counter_ns()
  42. e2e_times.append((end - start) / 1e6) # convert ns to ms
  43. model_times.extend(pipe.model.model_times())
  44. for e2e_t, model_t in zip(e2e_times, model_times):
  45. assert e2e_t >= model_t