29 | 29 | print(f"is_causal: {is_causal}")
30 | 30 | for seq_len in {1024, 2048, 4096, 8192, 16384, 32768}:
31 | 31 |     flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
32 |    | -   q = torch.randn(batch, head, seq_len, headdim).half().cuda()
33 |    | -   k = torch.randn(batch, head, seq_len, headdim).half().cuda()
34 |    | -   v = torch.randn(batch, head, seq_len, headdim).half().cuda()
   | 32 | +   q = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
   | 33 | +   k = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
   | 34 | +   v = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
35 | 35 |     for i in range(5): sdpa(q, k, v, is_causal=is_causal)
36 | 36 |     torch.cuda.synchronize()
37 | 37 |     _, time = benchmark_forward(sdpa, q, k, v, is_causal=is_causal, repeats=100, verbose=False, desc='Triton')
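For context on the flops line above: the attention forward pass is two batched matmuls, QK^T and then PV, each costing 2 * batch * head * seq_len * seq_len * headdim FLOPs, which gives the factor of 4; with a causal mask only about half of the score matrix is computed, hence the integer division by 2. A quick numeric check, with placeholder shapes (batch, head, and headdim are defined elsewhere in the script and assumed here):

    # Sanity check of the FLOP count with assumed placeholder shapes.
    batch, head, headdim, seq_len = 4, 32, 128, 4096
    full = 4 * head * batch * headdim * seq_len * seq_len  # QK^T plus PV, non-causal
    causal = full // 2                                     # causal mask roughly halves the work
    print(f"{full / 1e12:.3f} TFLOP (full), {causal / 1e12:.3f} TFLOP (causal)")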
41 | 41 | print(f"is_causal: {is_causal}")
42 | 42 | for seq_len in {1024, 2048, 4096, 8192, 16384, 32768}:
43 | 43 |     flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
44 |    | -   q = torch.randn(batch, head, seq_len, headdim).half().cuda()
45 |    | -   k = torch.randn(batch, head, seq_len, headdim).half().cuda()
46 |    | -   v = torch.randn(batch, head, seq_len, headdim).half().cuda()
   | 44 | +   q = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
   | 45 | +   k = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
   | 46 | +   v = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
47 | 47 |     for i in range(5): sdpa(q, k, v, is_causal=is_causal)
48 | 48 |     torch.cuda.synchronize()
49 | 49 |     _, time = benchmark_forward(sdpa, q, k, v, is_causal=is_causal, repeats=100, verbose=False, desc='Triton')
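Putting it together, a minimal self-contained sketch of how the changed allocation sits inside the benchmark loop. Everything not shown in the hunks is an assumption: sdpa is taken to be torch.nn.functional.scaled_dot_product_attention, torch.utils.benchmark.Timer stands in for the repository's benchmark_forward helper, the outer loop over is_causal is only implied by the print statement, and the shapes are small placeholders.

    import torch
    from torch.nn.functional import scaled_dot_product_attention as sdpa  # assumed sdpa
    from torch.utils import benchmark

    batch, head, headdim = 1, 32, 128  # placeholder shapes, not taken from the diff

    for is_causal in (False, True):
        print(f"is_causal: {is_causal}")
        for seq_len in (1024, 2048, 4096, 8192, 16384, 32768):
            # Two matmuls (QK^T and PV) at 2*N*N*d FLOPs each; causal masking halves the work.
            flops = 4 * head * batch * headdim * seq_len * seq_len // (2 if is_causal else 1)
            # Allocate directly in fp16 on the GPU, matching the "+" lines above.
            q = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
            k = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
            v = torch.randn(batch, head, seq_len, headdim, dtype=torch.float16, device="cuda")
            for _ in range(5):  # warmup, as in the diff
                sdpa(q, k, v, is_causal=is_causal)
            torch.cuda.synchronize()
            t = benchmark.Timer(
                stmt="sdpa(q, k, v, is_causal=is_causal)",
                globals={"sdpa": sdpa, "q": q, "k": k, "v": v, "is_causal": is_causal},
            ).timeit(100)  # 100 timed runs, mirroring repeats=100 in the diff
            print(f"seq_len={seq_len}: {flops / t.mean * 1e-12:.2f} TFLOP/s")

On the change itself: passing dtype and device to torch.randn creates the tensors directly in fp16 on the GPU, whereas .half().cuda() first materializes a float32 tensor on the CPU, casts it, and then copies it to the device.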