We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 3c450ef · commit 10b02e0 — Copy full SHA for 10b02e0
torchtune/models/llama3_2_vision/_convert_weights.py
@@ -148,8 +148,10 @@ def llama3_vision_tune_to_meta(
148
149
# Calculate fusion_interval: layer interval where cross attention layers are fused
150
num_layers = max(_layer_num(k) for k in state_dict if "layers" in k) + 1
151
- num_fusion_layers = (
152
- max(_layer_num(k) for k in state_dict if "cross_attention_layers" in k) + 1
+ # Get the number of unique fusion layers.
+ # Keys have the form decoder.fusion_layer.i. ... where i is the layer number
153
+ num_fusion_layers = len(
154
+ set([k.split(".")[2] for k in state_dict if "fusion_layer" in k])
155
)
156
assert (
157
num_layers % num_fusion_layers == 0
0 commit comments