databricks · rithwik-db · Mar 12, 2025 · Mar 7, 2025 · Mar 10, 2025 · Mar 11, 2025
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -21,14 +21,14 @@ jobs:
       fail-fast: false
       matrix:
         include:
-        - name: "python3.11-pytorch2.5.1-gpus1"
+        - name: "python3.11-pytorch2.6.0-gpus1"
           gpu_num: 1
           python_version: 3.11
-          container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
-        - name: "python3.11-pytorch2.5.1-gpus2"
+          container: mosaicml/pytorch:2.6.0_cu124-python3.11-ubuntu22.04
+        - name: "python3.11-pytorch2.6.0-gpus2"
           gpu_num: 2
           python_version: 3.11
-          container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
+          container: mosaicml/pytorch:2.6.0_cu124-python3.11-ubuntu22.04
     steps:
     - name: Run PR GPU tests
       uses: mosaicml/ci-testing/.github/actions/[email protected]

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -32,7 +32,7 @@ repos:
     additional_dependencies:
     - toml
 - repo: https://github.com/hadialqattan/pycln
-  rev: v2.1.2
+  rev: v2.5.0
   hooks:
   - id: pycln
     args: [. --all]

diff --git a/megablocks/layers/arguments.py b/megablocks/layers/arguments.py
@@ -8,6 +8,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
+import triton
 
 import megablocks.grouped_gemm_util as grouped_gemm
 
@@ -73,6 +74,11 @@ class Arguments:
     moe_zloss_in_fp32: bool = False
 
     def __post_init__(self):
+        # Sparse MLP is not supported with triton >=3.2.0
+        # TODO: Remove this once sparse is supported with triton >=3.2.0 (NOTE: string comparison; '3.10.0' sorts below '3.2.0')
+        if self.__getattribute__('mlp_impl') == 'sparse' and triton.__version__ >= '3.2.0':
+            raise ValueError('Sparse MLP is not supported with triton >=3.2.0')
+
         if self.__getattribute__('mlp_impl') == 'grouped':
             grouped_gemm.assert_grouped_gemm_is_available()
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@
 
 # build requirements
 [build-system]
-requires = ["setuptools < 70.0.0", "torch >= 2.5.1, < 2.5.2"]
+requires = ["setuptools < 70.0.0", "torch >= 2.6.0, < 2.6.1"]
 build-backend = "setuptools.build_meta"
 
 # Pytest

diff --git a/setup.py b/setup.py
@@ -62,8 +62,8 @@
 install_requires = [
     'numpy>=1.21.5,<2.1.0',
     'packaging>=21.3.0,<24.2',
-    'torch>=2.5.1,<2.5.2',
-    'triton>=2.1.0',
+    'torch>=2.6.0,<2.6.1',
+    'triton>=3.2.0,<3.3.0',
     'stanford-stk==0.7.1',
 ]
 

diff --git a/tests/layers/dmoe_test.py b/tests/layers/dmoe_test.py
@@ -5,6 +5,7 @@
 
 import pytest
 import torch
+import triton
 
 from megablocks import grouped_gemm_util as gg
 from megablocks.layers.arguments import Arguments
@@ -53,6 +54,11 @@ def construct_moes(
     mlp_impl: str = 'sparse',
     moe_zloss_weight: float = 0,
 ):
+    # Tests that use the sparse MLP are skipped if triton >=3.2.0 is installed since sparse is not supported
+    # TODO: Remove this once sparse is supported with triton >=3.2.0
+    if mlp_impl == 'sparse' and triton.__version__ >= '3.2.0':
+        pytest.skip('Sparse MLP is not supported with triton >=3.2.0')
+
     init_method = partial(torch.nn.init.normal_, mean=0.0, std=0.1)
     args = Arguments(
         hidden_size=hidden_size,

diff --git a/tests/layers/glu_test.py b/tests/layers/glu_test.py
@@ -6,6 +6,7 @@
 import pytest
 import stk
 import torch
+import triton
 
 from megablocks.layers import dmlp_registry
 from megablocks.layers.arguments import Arguments
@@ -23,6 +24,11 @@ def construct_dmoe_glu(
     mlp_impl: str = 'sparse',
     memory_optimized_mlp: bool = False,
 ):
+    # Tests that use the sparse MLP are skipped if triton >=3.2.0 is installed since sparse is not supported
+    # TODO: Remove this once sparse is supported with triton >=3.2.0
+    if mlp_impl == 'sparse' and triton.__version__ >= '3.2.0':
+        pytest.skip('Sparse MLP is not supported with triton >=3.2.0')
+
     init_method = partial(torch.nn.init.normal_, mean=0.0, std=0.1)
     args = Arguments(
         hidden_size=hidden_size,

diff --git a/tests/layers/moe_test.py b/tests/layers/moe_test.py
@@ -5,6 +5,7 @@
 
 import pytest
 import torch
+import triton
 
 from megablocks.layers.arguments import Arguments
 from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
@@ -41,6 +42,11 @@ def construct_moe(
     moe_top_k: int = 1,
     moe_zloss_weight: float = 0,
 ):
+    # All tests are skipped if triton >=3.2.0 is installed since sparse is not supported
+    # TODO: Remove this once sparse is supported with triton >=3.2.0
+    if triton.__version__ >= '3.2.0':
+        pytest.skip('Sparse MLP is not supported with triton >=3.2.0')
+
     init_method = partial(torch.nn.init.normal_, mean=0.0, std=0.1)
     args = Arguments(
         hidden_size=hidden_size,