linkedin · manojks1999 · Jan 19, 2025
diff --git a/benchmark/scripts/benchmark_rope.py b/benchmark/scripts/benchmark_rope.py
@@ -30,7 +30,13 @@ def bench_speed_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput
     seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x
 
     head_dim = hidden_size // num_q_heads
-    rotary_emb = LlamaRotaryEmbedding(head_dim, device=device)
+
+    config = LlamaConfig(
+        max_position_embeddings=512,
+        head_dim=head_dim,
+    )
+
+    rotary_emb = LlamaRotaryEmbedding(config=config, device=device)
     q = torch.randn(
         (1, seq_len, num_q_heads, head_dim),
         device=device,
@@ -105,7 +111,13 @@ def bench_memory_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutpu
     seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x
 
     head_dim = hidden_size // num_q_heads
-    rotary_emb = LlamaRotaryEmbedding(head_dim, device=device)
+
+    config = LlamaConfig(
+        max_position_embeddings=512,
+        head_dim=head_dim,
+    )
+
+    rotary_emb = LlamaRotaryEmbedding(config=config, device=device)
     q = torch.randn(
         (1, seq_len, num_q_heads, head_dim),
         device=device,

diff --git a/test/transformers/test_rope.py b/test/transformers/test_rope.py
@@ -4,6 +4,7 @@
 from test.utils import supports_bfloat16
 from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+from transformers.models.llama.configuration_llama import LlamaConfig
 
 from liger_kernel.ops.rope import LigerRopeFunction
 from liger_kernel.transformers.functional import liger_rope
@@ -57,7 +58,13 @@ def test_correctness(
     atol,
     rtol,
 ):
-    rotary_emb = LlamaRotaryEmbedding(head_dim, device=device)
+
+    config = LlamaConfig(
+        max_position_embeddings=512,
+        head_dim=head_dim,
+    )
+
+    rotary_emb = LlamaRotaryEmbedding(config=config, device=device)
 
     _tensor_q = torch.randn((bsz, seq_len, num_q_heads, head_dim), device=device).transpose(1, 2).to(dtype)
 
@@ -133,7 +140,12 @@ def test_functional_correctness(
     k1 = _k.clone().requires_grad_(True)
     k2 = _k.clone().requires_grad_(True)
 
-    rotary_emb = LlamaRotaryEmbedding(head_dim, device=device)
+    config = LlamaConfig(
+        max_position_embeddings=512,
+        head_dim=head_dim,
+    )
+
+    rotary_emb = LlamaRotaryEmbedding(config=config, device=device)
 
     pos_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0)
     if expand_position_ids: