
Commit f3c6ec2

KuuCi and Vincent Chen authored
Bump FA2 to 2.7.4.post1 (#1728)
Co-authored-by: Vincent Chen <[email protected]>
1 parent a0ae025 commit f3c6ec2

File tree

3 files changed: +5 additions, -5 deletions


llmfoundry/models/layers/blocks.py

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ def apply_ffn(
         if not self.use_pad_tok_in_ffn and attention_mask is not None:
             assert unpad_input is not None
             attention_mask = self.slice_attention_mask(attention_mask, seq_len)
-            m, indices, _, _ = unpad_input(m, attention_mask)
+            m, indices, *_ = unpad_input(m, attention_mask)
         n = self.ffn(m)
         if not self.use_pad_tok_in_ffn and attention_mask is not None:
             assert pad_input is not None
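
The motivation, as I read the flash-attn 2.7.0 release notes (not stated in this commit): `bert_padding.unpad_input` gained a fifth trailing return value in 2.7.x, so starred unpacking accepts both the old 4-tuple and the new 5-tuple. A minimal self-contained sketch of the pattern, using stand-in tuples rather than the real library:

# Stand-in return tuples mimicking the two flash-attn signatures
# (placeholder values, not real tensors):
ret_v26 = ('hidden', 'indices', 'cu_seqlens', 512)             # flash-attn <= 2.6.x
ret_v27 = ('hidden', 'indices', 'cu_seqlens', 512, 'seqused')  # flash-attn >= 2.7.0

for ret in (ret_v26, ret_v27):
    # `*_` absorbs zero or more trailing values, so the same
    # unpacking works for both arities.
    hidden, indices, *_ = ret
    assert (hidden, indices) == ('hidden', 'indices')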

llmfoundry/models/mpt/modeling_mpt.py

Lines changed: 3 additions & 3 deletions
@@ -289,15 +289,15 @@ def gen_flash_attn_padding_info(
         query_padding_mask = attention_mask_in_length
         unpadding_function = bert_padding.unpad_input_for_concatenated_sequences

-    _, indices_q, cu_seqlens_q, max_seqlen_q = unpadding_function(
+    _, indices_q, cu_seqlens_q, max_seqlen_q, *_ = unpadding_function(
         torch.empty(bsz, S, 1, device=device),
         query_padding_mask,
     )
-    _, indices_k, cu_seqlens_k, max_seqlen_k = unpadding_function(
+    _, indices_k, cu_seqlens_k, max_seqlen_k, *_ = unpadding_function(
         torch.empty(bsz, past_key_len + S, 1, device=device),
         key_padding_mask,
     )
-    _, indices_v, _, _ = unpadding_function(
+    _, indices_v, *_ = unpadding_function(
         torch.empty(bsz, past_key_len + S, 1, device=device),
         key_padding_mask,
     )
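
To make the returned fields concrete, here is a minimal sketch (my own illustration, not flash-attn's implementation) of what an unpadding function derives from a boolean padding mask: the flattened indices of non-pad tokens, the cumulative sequence boundaries `cu_seqlens`, and the max sequence length:

import torch

def sketch_unpad_info(padding_mask):
    # Tokens per sequence, e.g. [3, 2] for the mask below.
    seqlens = padding_mask.sum(dim=-1, dtype=torch.int32)
    # Flattened positions of the non-pad tokens, e.g. [0, 1, 2, 4, 5].
    indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
    # Cumulative boundaries with a leading 0, e.g. [0, 3, 5].
    cu_seqlens = torch.nn.functional.pad(
        torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, int(seqlens.max())

# Batch of 2, seq len 4; True marks real (non-pad) tokens.
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]], dtype=torch.bool)
indices, cu_seqlens, max_seqlen = sketch_unpad_info(mask)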

setup.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@

 # Flash 2 group kept for backwards compatibility
 extra_deps['gpu-flash2'] = [
-    'flash-attn==2.6.3',
+    'flash-attn==2.7.4.post1',
 ]

 extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])
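
Since 'gpu' is a deep copy of 'gpu-flash2', both extras groups resolve to the same flash-attn pin. Assuming the distribution name llm-foundry (standard setuptools extras syntax; not shown in this diff):

pip install 'llm-foundry[gpu]'         # resolves flash-attn==2.7.4.post1
pip install 'llm-foundry[gpu-flash2]'  # legacy alias, same pin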
