test_model_profiling.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os
import time
import pytest
import torch
import deepspeed
from transformers import pipeline
from unit.common import DistributedTest
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import InferenceBuilder

# Skip the entire module when the DeepSpeed inference kernels cannot be
# built/loaded on this system — the test below relies on kernel injection.
if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)

# Skip when the accelerator cannot run fp16: the test forces dtype=torch.float16.
if torch.half not in get_accelerator().supported_dtypes():
    pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)
  17. @pytest.mark.inference
  18. @pytest.mark.parametrize("use_cuda_events", [True, False])
  19. @pytest.mark.parametrize("enable_cuda_graph", [True, False])
  20. class TestModelProfiling(DistributedTest):
  21. world_size = 1
  22. def test(self, enable_cuda_graph, use_cuda_events):
  23. task = "fill-mask"
  24. model = "bert-base-cased"
  25. dtype = torch.float16
  26. query = "I am a [MASK] model"
  27. local_rank = int(os.getenv("LOCAL_RANK", "0"))
  28. world_size = int(os.getenv("WORLD_SIZE", "1"))
  29. pipe = pipeline(task, model, framework="pt", device=get_accelerator().device_name(local_rank))
  30. pipe.model = deepspeed.init_inference(pipe.model,
  31. dtype=dtype,
  32. mp_size=world_size,
  33. replace_with_kernel_inject=True,
  34. enable_cuda_graph=enable_cuda_graph)
  35. pipe.model.profile_model_time(use_cuda_events=use_cuda_events)
  36. e2e_times = []
  37. model_times = []
  38. for _ in range(10):
  39. get_accelerator().synchronize()
  40. start = time.perf_counter_ns()
  41. r = pipe(query)
  42. get_accelerator().synchronize()
  43. end = time.perf_counter_ns()
  44. e2e_times.append((end - start) / 1e6) # convert ns to ms
  45. model_times.extend(pipe.model.model_times())
  46. for e2e_t, model_t in zip(e2e_times, model_times):
  47. assert e2e_t >= model_t