# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
"""
This script tests the performance of the DS4Sci_EvoformerAttention op.
To run the script,
1. Clone the CUTLASS repo. E.g. git clone https://github.com/NVIDIA/cutlass.git
2. Specify the CUTLASS_PATH environment variable. E.g. export CUTLASS_PATH=$(pwd)/cutlass
3. Run the script. E.g. python DS4Sci_EvoformerAttention_bench.py
"""
import contextlib
from typing import List

import torch
from torch.nn import functional as F

from deepspeed.ops.deepspeed4science import DS4Sci_EvoformerAttention
from deepspeed.accelerator import get_accelerator


def attention_reference(
        q_input: torch.Tensor,  # [*, Dim_Q, H, C_hid]
        k_input: torch.Tensor,  # [*, Dim_Q, H, C_hid]
        v_input: torch.Tensor,  # [*, Dim_Q, H, C_hid]
        biases: List[torch.Tensor],
        sm_scale: float) -> torch.Tensor:
    # Original shape: [*, Dim_Q, H, C_hid] -> Transpose to: [*, H, Dim_Q, C_hid]
    q = q_input.transpose(-2, -3)
    k = k_input.transpose(-2, -3)
    v = v_input.transpose(-2, -3)

    # Transpose k to shape [*, H, C_hid, Dim_Q]
    k_t = k.transpose(-1, -2)

    # Scaled attention scores: [*, H, Dim_Q, C_hid] x [*, H, C_hid, Dim_Q] -> [*, H, Dim_Q, Dim_Q]
    a = torch.matmul(q, k_t) * sm_scale

    for b in biases:
        a += b

    a = F.softmax(a, dim=-1)

    # [*, H, Dim_Q, Dim_Q] x [*, H, Dim_Q, C_hid] -> [*, H, Dim_Q, C_hid]
    a_v = torch.matmul(a, v)

    # Transpose back to [*, Dim_Q, H, C_hid]
    o = a_v.transpose(-2, -3)

    return o


dtype = torch.float16

N = 256
heads = 4
dim = 32
seq_len = 256


@contextlib.contextmanager
def cuda_timer(res_list):
    start = get_accelerator().Event(enable_timing=True)
    end = get_accelerator().Event(enable_timing=True)
    start.record()
    yield
    end.record()
    get_accelerator().synchronize()
    # Event.elapsed_time reports milliseconds
    res_list.append(start.elapsed_time(end))


def benchmark():
    ours_fw = []
    ours_bw = []
    baseline_fw = []
    baseline_bw = []
    for batch in range(1, 17):
        Q = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda", requires_grad=True)
        K = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda", requires_grad=True)
        V = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda", requires_grad=True)
        bias1 = torch.randn(batch, N, 1, 1, seq_len, dtype=dtype, device="cuda", requires_grad=False)
        bias2 = torch.randn(batch, 1, heads, seq_len, seq_len, dtype=dtype, device="cuda", requires_grad=True)
        # warm up
        DS4Sci_EvoformerAttention(Q, K, V, [bias1, bias2])
        with cuda_timer(ours_fw):
            out = DS4Sci_EvoformerAttention(Q, K, V, [bias1, bias2])
        d_out = torch.rand_like(out)
        with cuda_timer(ours_bw):
            out.backward(d_out)
        # warm up
        attention_reference(Q, K, V, [bias1, bias2], 1 / (dim**0.5))
        with cuda_timer(baseline_fw):
            ref_out = attention_reference(Q, K, V, [bias1, bias2], 1 / (dim**0.5))
        with cuda_timer(baseline_bw):
            ref_out.backward(d_out)

    print("batch size\tours (FW) [ms]\tbaseline (FW) [ms]\tours (BW) [ms]\tbaseline (BW) [ms]")
    for i in range(len(ours_fw)):
        print(f"{i+1}\t{ours_fw[i]}\t{baseline_fw[i]}\t{ours_bw[i]}\t{baseline_bw[i]}")


benchmark()
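

# A minimal correctness sanity check to accompany the timings above; it is a
# sketch, not part of the original benchmark. It compares the fused
# DS4Sci_EvoformerAttention output against the attention_reference baseline
# (the kernel applies the 1/sqrt(dim) scale internally, so the scale is passed
# only to the reference). The function name and the fp16 tolerance below are
# assumptions for illustration, not values from the DeepSpeed test suite.
def sanity_check(batch=1):
    Q = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda")
    K = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda")
    V = torch.randn(batch, N, seq_len, heads, dim, dtype=dtype, device="cuda")
    bias1 = torch.randn(batch, N, 1, 1, seq_len, dtype=dtype, device="cuda")
    bias2 = torch.randn(batch, 1, heads, seq_len, seq_len, dtype=dtype, device="cuda")
    out = DS4Sci_EvoformerAttention(Q, K, V, [bias1, bias2])
    ref = attention_reference(Q, K, V, [bias1, bias2], 1 / (dim**0.5))
    # Loose absolute tolerance (assumed) to absorb fp16 accumulation differences
    assert torch.allclose(out, ref, atol=2e-2, rtol=0), "kernel output diverges from reference"


sanity_check()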