# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
- import os
- import time
- import pytest
- import torch
- import deepspeed
- from transformers import pipeline
- from unit.common import DistributedTest
- from deepspeed.accelerator import get_accelerator
- from deepspeed.ops.op_builder import InferenceBuilder
- if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
- pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
- if torch.half not in get_accelerator().supported_dtypes():
- pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)
- @pytest.mark.inference
- @pytest.mark.parametrize("use_cuda_events", [True, False])
- @pytest.mark.parametrize("enable_cuda_graph", [True, False])
- class TestModelProfiling(DistributedTest):
- world_size = 1
- def test(self, enable_cuda_graph, use_cuda_events):
- task = "fill-mask"
- model = "bert-base-cased"
- dtype = torch.float16
- query = "I am a [MASK] model"
- local_rank = int(os.getenv("LOCAL_RANK", "0"))
- world_size = int(os.getenv("WORLD_SIZE", "1"))
- pipe = pipeline(task, model, framework="pt", device=get_accelerator().device_name(local_rank))
- pipe.model = deepspeed.init_inference(pipe.model,
- dtype=dtype,
- mp_size=world_size,
- replace_with_kernel_inject=True,
- enable_cuda_graph=enable_cuda_graph)
- pipe.model.profile_model_time(use_cuda_events=use_cuda_events)
- e2e_times = []
- model_times = []
- for _ in range(10):
- get_accelerator().synchronize()
- start = time.perf_counter_ns()
- r = pipe(query)
- get_accelerator().synchronize()
- end = time.perf_counter_ns()
- e2e_times.append((end - start) / 1e6) # convert ns to ms
- model_times.extend(pipe.model.model_times())
- for e2e_t, model_t in zip(e2e_times, model_times):
- assert e2e_t >= model_t