Revert "finish building"

l1cacheDell · gzy19990617 · commit 754830b28c87 · 2025-02-20T23:55:26.000+08:00
This reverts commit b1ce0a4.
diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h
@@ -40,8 +40,8 @@
   } else if (head_dim == 128) {                                 \
     constexpr int HEAD_DIM = 128;                               \
     __VA_ARGS__                                                 \
-  } else if (head_dim == 256) {                                 \
-    constexpr int HEAD_DIM = 256;                               \
+  } else if (head_dim == 192) {                                 \
+    constexpr int HEAD_DIM = 192;                               \
     __VA_ARGS__                                                 \
   } else {                                                      \
     std::ostringstream err_msg;                                 \
diff --git a/patch_test/test_dsk.py b/patch_test/test_dsk.py
@@ -60,7 +60,7 @@ def precision_cmp_torch(t1: torch.Tensor, t2: torch.Tensor):
 torch.cuda.synchronize()
 
 sim, l1, max_diff = precision_cmp_torch(o_torch_fa2.transpose(2, 1), o_sa)
-print(f"Sim and Diff of Sage Attn & torch SDPA: {sim}, {max_diff}")
+print(f"Sim and Diff of Sage Attn: {sim}, {max_diff}")
 
 sim, l1, max_diff = precision_cmp_torch(o_torch_fa2, o_torch_sdpa)
-print(f"Sim and Diff of Flash Attn & torch SDPA: {sim}, {max_diff}")
+print(f"Sim and Diff of Flash Attn: {sim}, {max_diff}")
diff --git a/sageattention/core.py b/sageattention/core.py
@@ -925,7 +925,7 @@ def sageattn_qk_int8_pv_fp8_cuda_dsk_sm90(
     torch.cuda.set_device(v.device)
 
     _tensor_layout = 0 if tensor_layout == "NHD" else 1
-    _is_causal = 1 if is_causal else 0
+    _is_caual = 1 if is_causal else 0
     _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
     _return_lse = 1 if return_lse else 0
 
@@ -939,11 +939,11 @@ def sageattn_qk_int8_pv_fp8_cuda_dsk_sm90(
         q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
         k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
         v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
-    elif head_dim_og > 128 and head_dim_og < 256:
-        q = torch.nn.functional.pad(q, (0, 256 - head_dim_og))
-        k = torch.nn.functional.pad(k, (0, 256 - head_dim_og))
+    elif head_dim_og > 128 and head_dim_og < 192:
+        q = torch.nn.functional.pad(q, (0, 192 - head_dim_og))
+        k = torch.nn.functional.pad(k, (0, 192 - head_dim_og))
         # v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
-    elif head_dim_og > 256:
+    elif head_dim_og > 192:
         raise ValueError(f"Unsupported head_dim: {head_dim_og}")
     
     assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, "Last dim of qkv must be contiguous."
@@ -977,10 +977,10 @@ def sageattn_qk_int8_pv_fp8_cuda_dsk_sm90(
             v = torch.cat([v, torch.zeros(v.size(0), v_pad_len, v.size(2), v.size(3), dtype=v.dtype, device=v.device)], dim=1)
 
     v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False)
-    q_int8_nope, q_int8_pe, _ = torch.split(q_int8, [128, 64, 64], dim=-1)
-    k_int8_nope, k_int8_pe, _ = torch.split(k_int8, [128, 64, 64], dim=-1)
+    q_int8_nope, q_int8_pe = torch.split(q_int8, [128, 64], dim=-1)
+    k_int8_nope, k_int8_pe = torch.split(k_int8, [128, 64], dim=-1)
 
-    lse = _qattn_sm90.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_dsk_sm90(q_int8_nope, k_int8_nope, q_int8_pe, k_int8_pe, v_fp8, o, q_scale, k_scale, v_scale, _tensor_layout, _is_causal, _qk_quant_gran, sm_scale, _return_lse)
+    lse = _qattn_sm90.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_dsk_sm90(q_int8_nope, k_int8_nope, q_int8_pe, k_int8_pe, v_fp8, o, q_scale, k_scale, v_scale, _tensor_layout, _is_caual, _qk_quant_gran, sm_scale, _return_lse)
 
     head_dim_og = v.shape[-1]
     o = o[..., :head_dim_og]