
Commit 3ecb4e3

Authored by msinnha1, fzyzcjy, merrymercy, vhain, and qingquansong
rebase sglang to tag v0.4.5.post1 (sgl-project#13)
* Support with_stack and record_shapes in profiler (sgl-project#4740) Co-authored-by: Lianmin Zheng <[email protected]>
* test: reduce `mem_fraction_static` for gemma3 vision test (sgl-project#4840)
* Fix CI tests (sgl-project#4853)
* Fix fa3 cuda graph page_size > 1 precision and page_size=1 speed (sgl-project#4855)
* Revert "get the python version from env (sgl-project#4729)" (sgl-project#4863)
* [Feature] add multi-rank support for Lora (sgl-project#4492) Co-authored-by: rudy152 <[email protected]>
* Clean up `import vllm` in quantization/__init__.py (sgl-project#4834)
* Fix wrong variable name when stopping memory profile (sgl-project#4772)
* [Feat] support deepgemm for cmake (sgl-project#4864)
* Make torch compile configurable for biased_grouped_topk (sgl-project#4749)
* update sgl-kernel test ci (sgl-project#4866)
* fix sampling issue (sgl-project#4871)
* bump sgl-kernel 0.0.5.post4 (sgl-project#4768)
* fix sgl-kernel cu118 build (sgl-project#4872)
* [Feature] Support FA3 backend for MLA (sgl-project#4831)
* upgrade sgl-kernel 0.0.5.post4 (sgl-project#4873)
* update torch compile doc (sgl-project#4874)
* bump v0.4.4.post3 (sgl-project#4878)
* Fix BadRequestError wrong arguments and remove openai dependency (sgl-project#4882)
* Improve stack trace of retry errors (sgl-project#4845)
* Tiny fix doc error (sgl-project#4795)
* [Docs] Update DeepGEMM at README.md (sgl-project#4886)
* Update CODEOWNERS (sgl-project#4889)
* Delete test_deep_gemm.py (sgl-project#4891)
* Add deepseek style fused moe group gate selection kernel (sgl-project#4530)
* quick fix: add default for new kernel (sgl-project#4898)
* remove setup for sgl-kernel (sgl-project#4899)
* [Misc] Clean m.def and add Development Tips (sgl-project#4890)
* fix allreduce test (sgl-project#4909)
* Support page size > 1 + eagle (sgl-project#4908)
* Fix retract for page size > 1 (sgl-project#4914)
* [Feature] use pytest for sgl-kernel (sgl-project#4896)
* fix bmm fp8 (sgl-project#4926)
* Fix the timeout for unit-test-2-gpu in pr-test.yml (sgl-project#4927)
* Fix 2-gpu CI test and suppress some warnings (sgl-project#4930)
* [feat] add fa3 in sgl-kernel (sgl-project#4902) Co-authored-by: Sleepcoo <[email protected]>
* Fix sglang frontend's incorrect dependency on torch (sgl-project#4931)
* [Fix] avoid stream sync and torch compile in prefill for fa3 backend (sgl-project#4932)
* cleanup sgl-kernel (sgl-project#4933)
* [Fix] Improve Lora tests and reduce CI runtime (sgl-project#4925)
* Fix DeepSeek bug causing 2.2% MMLU drop when TP!=DP (sgl-project#4883) Co-authored-by: ch-wan <[email protected]>
* [Fix] Add torch compile for torch.clamp back (sgl-project#4936)
* Fix oom error for large page size (sgl-project#4913) Co-authored-by: Lianmin Zheng <[email protected]>
* [feat] interface for platforms abstraction (sgl-project#4928)
* [Fix] revert clean m.def for cudagraph (sgl-project#4944)
* refactor: multimodal data (sgl-project#4754)
* bump sgl-kernel v0.0.6 (sgl-project#4950)
* [Build] Fix cuda12.8 build error in nvfp4_scaled_mm_kernels.cu (sgl-project#4953)
* use fa3 in sgl-kernel (sgl-project#4954)
* Revert PR 4764 & 4813 related to R1 RoPE (sgl-project#4959)
* [Feature] Support DeepEP Low Latency (sgl-project#4767) Co-authored-by: sleepcoo <[email protected]> Co-authored-by: laixinn <[email protected]> Co-authored-by: ch-wan <[email protected]>
* update bench_serving (sgl-project#4958)
* Prevent memory leak of retract_decode when page_size > 1 (sgl-project#4977)
* [VLM RLHF] Take Image input for verl vlm rollout (sgl-project#4915) Signed-off-by: Xinyuan Tong <[email protected]> Co-authored-by: GeLee <[email protected]>
* Large page size aligned hierarchical caching (sgl-project#4581)
* bug fix for hicache host eviction (sgl-project#4989)
* sgl scaled_fp8_quant support output padding (sgl-project#4861)
* Add Eagle Speculative Decoding to FA3 Backend (sgl-project#4951) Co-authored-by: hebiao064 <[email protected]> Co-authored-by: Baizhou Zhang <[email protected]> Co-authored-by: zcnrex <[email protected]>
* Update tokenizer_manager.py (sgl-project#5008)
* [sgl-kernel] per token group quant support COLUMN MAJOR (sgl-project#4817)
* update cutlass tag (sgl-project#5011)
* Feature/revise docs ci (sgl-project#5009)
* fix: fix illegal cuda memory access at fused_moe_kernel (sgl-project#4727) Co-authored-by: yuethe <[email protected]>
* [Build] Support build sgl-kernel with ccache (sgl-project#5020)
* fix deepgemm as well (sgl-project#5030)
* try to fix ci oserror (sgl-project#5024)
* Replace enable_flashinfer_mla argument with attention_backend (sgl-project#5005)
* Small refactor DeepEPMode to clean up code a bit (sgl-project#4992)
* [Fix] fix fa3 build at cu118 (sgl-project#5036)
* Revert "Replace enable_flashinfer_mla argument with attention_backend" (sgl-project#5048)
* bump sgl-kernel v0.0.7 (sgl-project#5046)
* update eagle-3 docs (sgl-project#4796) Co-authored-by: Yifan Zhang <[email protected]>
* Add LlavaLlamaForCausaLM in MultiModal Processors (sgl-project#5039) Co-authored-by: Ravi Theja Desetty <[email protected]>
* Update the retry count (sgl-project#5051)
* upgrade sgl-kernel v0.0.7 (sgl-project#5049)
* [2/3] fix dsv3 awq issue (sgl-project#4625) Co-authored-by: 晟海 <[email protected]> Co-authored-by: laixinn <[email protected]>
* Feature/revise docs ci (sgl-project#5056)
* Add H20 fused MoE kernel tuning configs for DeepSeek V3/R1 (sgl-project#5057)
* [fix] remove `cuda_device_count_stateless` (sgl-project#5060)
* Small refactor DeepEPDispatcher into subclasses (sgl-project#4994)
* Support async DeepEP by splitting into two stages (sgl-project#4995)
* Cleanup unused resources after DeepEP operation (sgl-project#4996)
* Add DeepSeek V3/R1 shared experts fusion (sgl-project#4918)
* [deepep] fix: shared experts are not initialized when shared experts fusion is enabled (sgl-project#5072)
* fix dummy-load deepseekv2 (sgl-project#4535)
* support sgl-kernel on blackwell (sgl-project#5074)
* FA3 Spec Decoding to support top k = 1 and add cuda graph support (sgl-project#5050) Co-authored-by: Qingquan Song <[email protected]> Co-authored-by: Chunan Zeng <[email protected]>
* [Revision] Replace enable_flashinfer_mla argument with attention_backend (sgl-project#5052)
* upgrade transformers 4.51.0 (sgl-project#5088)
* sgl-kernel transfer custom allreduce from trt kernel to vllm kernel (sgl-project#5079)
* bump sgl-kernel 0.0.8 (sgl-project#5089)
* python transfer custom allreduce from trt kernel to vllm kernel (sgl-project#5080)
* bump v0.4.4.post4 (sgl-project#5091)
* Fix: Reduce the number of document ci attempts to avoid long ci running (sgl-project#5097) Co-authored-by: shuaills <[email protected]>
* Add Llama4 support (sgl-project#5092) Co-authored-by: Cheng Wan <[email protected]> Co-authored-by: fzyzcjy <[email protected]> Co-authored-by: ispobock <[email protected]>
* Fix refactor error - fp8.py (sgl-project#5106) Co-authored-by: Lianmin Zheng <[email protected]>
* bump v0.4.5 (sgl-project#5117)
* [ci] fix llama4 ci error (sgl-project#5126)
* Refactor and Optimize FA3 Code (sgl-project#5090) Co-authored-by: Qingquan Song <[email protected]>
* Add Llama4 user guide (sgl-project#5133) Co-authored-by: Cheng Wan <[email protected]>
* [Misc] Use pytest.mark.skipif in sgl-kernel test (sgl-project#5137)
* feat: disable grammar restrictions within reasoning sections (sgl-project#4984) Co-authored-by: tianhaoyu <[email protected]> Co-authored-by: DarkSharpness <[email protected]>
* [modelopt] automatically inspect if model is ModelOpt quantized and set quantization method (sgl-project#5145)
* [AMD] Fix missing per_token_group_quant_fp8 for ROCm (sgl-project#5140)
* fix multimodal hash feature (sgl-project#5083)
* Fix run time error in ROCm platform (sgl-project#5147) Co-authored-by: wunhuang <[email protected]> Co-authored-by: root <[email protected]>
* [FA3 Feature] Support multi modal Llama-3.2-11B-Vision-Instruct (sgl-project#5103)
* Add unit test on page_size > 1 and mla and integration test for Flash Attention 3 (sgl-project#4760)
* Use public model for FA3 speculative decode testing (sgl-project#5152)
* Add dummy grok test to amd CI. (sgl-project#5115)
* fix empty_cache error in pt_weights_iterator (sgl-project#5151) Co-authored-by: dangkai.dk <[email protected]>
* Fix torch compile errors (sgl-project#5158)
* Fix loading KV quantization scale; Enable modelopt kv cache (sgl-project#4686) Co-authored-by: qingquansong <[email protected]>
* [PD] Fix unclosed prefill connection warning of mini_lb (sgl-project#5155) Signed-off-by: Shangming Cai <[email protected]>
* Add optimized native kernels in sgl-kernel (sgl-project#5150) Co-authored-by: Chunyuan WU <[email protected]> Co-authored-by: YanbingJiang <[email protected]> Co-authored-by: blzheng <[email protected]>
* [PD] Simplify mini LB (sgl-project#4911) Co-authored-by: Liangsheng Yin <[email protected]>
* Small improvement of native api docs (sgl-project#5139) Co-authored-by: zhaochenyang20 <[email protected]>
* [feat&refactor] Enhance multimodal input support with refactor io_struct (sgl-project#4938) Signed-off-by: Xinyuan Tong <[email protected]>
* Support 2x8xH100 for Llama 4 (sgl-project#5159)
* FP4 weight loading and inference (2/2) (sgl-project#3972)
* Fix multimodal hashing error (sgl-project#5174)
* Tiny disable model that does not work (sgl-project#5175)
* [Bugfix] Fix index out of bounds in local attention with large sequences (sgl-project#5173)
* [Fix] DeepEP Compatibility with Low Latency (sgl-project#5068) Co-authored-by: ch-wan <[email protected]>
* docs: remove the use of Downward API for LWS_WORKER_INDEX (sgl-project#5110) Signed-off-by: Kay Yan <[email protected]>
* feat: add DeepGEMM build warning (sgl-project#5176) Co-authored-by: grimoire <[email protected]>
* fix: use DeepEPDispatcher on CUDA (sgl-project#5180)
* [DeepEP] fix: import buffer error (sgl-project#5179)
* Let `bench_one_batch` support `enable_dp_attention` (sgl-project#4058)
* [Misc] clean up vllm in sgl-kernel test (sgl-project#5189)
* Fix ci test "test_eval_fp8_accuracy" failed (sgl-project#5185) Co-authored-by: wunhuang <[email protected]>
* Optimize topk operation in llama4 (sgl-project#5128)
* Support Llama4 fp8 inference (sgl-project#5194) Co-authored-by: laixinn <[email protected]> Co-authored-by: sleepcoo <[email protected]> Co-authored-by: zhyncs <[email protected]>
* [ci] fix ci test fused_moe op (sgl-project#5102)
* model: support mllama4 (sgl-project#5144)
* update grok test (sgl-project#5171)
* sgl-kernel use cutlass latest version for fp8 blockwise gemm (sgl-project#5207)
* Add H20 dtype fp8_w8a8 fused MoE kernel tuning configs for DeepSeek V3/R1 (sgl-project#5196)
* fix: log warning when disable cuda graph (sgl-project#5209)
* [metrics] Add in queue metrics (sgl-project#4444)
* Fix DeepSeek error when using DeepEP mode (sgl-project#5190)
* reduce moe_align_block_size_kernel small batch mode overhead (sgl-project#5086)
* [PD] Support KV transfer with mooncake (sgl-project#4880) Signed-off-by: Shangming Cai <[email protected]> Co-authored-by: Shangming Cai <[email protected]> Co-authored-by: Xuchun Shang <[email protected]> Co-authored-by: shangmingc <[email protected]>
* [PD] Add get_contiguous_buf_infos interface for MLATokenToKVPool (sgl-project#5204)
* Update deps for mllama4 (sgl-project#5215)
* Fix deepseek-v3 with torch.compile in PyTorch 2.6. (sgl-project#5213)
* ROCm sgl-kernel: compatible to later torch (sgl-project#5167)
* [Misc] Clean sgl-kernel test (sgl-project#5216)
* Update Makefile / build script to avoid installing incompatible torch dependency (sgl-project#5245)
* Fix torch.compile cacheing (sgl-project#5259) Co-authored-by: zhyncs <[email protected]>
* ROCm/AITER CK_MoE: update 2-stage kernels & support both Activations (sgl-project#5228)
* Optimize attention in llama4 (sgl-project#5127)
* Optimize GPU memory usage in FlashAttentionBackend's strided indexing (sgl-project#5262) Co-authored-by: ch-wan <[email protected]>
* Support `--enable-llama4-multimodal` (sgl-project#5254)
* [fix] fix mrope positions not picked up (sgl-project#5265)
* doc: nested loop code for offline engine (sgl-project#5244)
* fix: examples for token_in_token_out_vlm (sgl-project#5193)
* Fix a 404 link in send_request.ipynb (sgl-project#5280) Signed-off-by: windsonsea <[email protected]>
* fix: enable fp4 compilation on cu128 (sgl-project#5286)
* feat: add cu128 identifier for sgl-kernel (sgl-project#5287)
* chore: relax the torch version restriction for sgl-kernel compilation (sgl-project#5288)
* chore: bump sgl-kernel v0.0.8.post1 (sgl-project#5289)
* [PD] fix: skip warmup request in disaggregation mode to prevent crash on timeout (sgl-project#5292)
* [Docs] Supported Model Docs - Major restructuring (sgl-project#5290) Co-authored-by: zhaochenyang20 <[email protected]>
* fix: update update_wheel_index for cu128 (sgl-project#5300)
* [Docs] Remove the older supported docs section (sgl-project#5301)
* remove moe_align_block_size torch.zeros in small batch/expert mode (sgl-project#5298)
* feat: add blackwell Dockerfile (sgl-project#5302)
* feat: add blackwell workflow (sgl-project#5303)
* fix: use fa3 unit test on hopper only (sgl-project#5304)
* misc: update blackwell Dockerfile (sgl-project#5306)
* fix: remove cublas_grouped_gemm (sgl-project#5307)
* fix: update flash attn (sgl-project#5308)
* fix: use deepgemm only on hopper (sgl-project#5310)
* [VLM] Adopt fast image processor by default (sgl-project#5065)
* Adjust ci test threshold (sgl-project#5271)
* Blackwell Cutlass MLA kernel (sgl-project#5142)
* misc: cleanup 3rdparty (sgl-project#5311)
* update variable naming and comments for rocm (sgl-project#5299)
* Fix w8a8_int8 model shared experts fusion load weights error (sgl-project#5120)
* Add flash_attn_varlen_func to sgl-kernel (sgl-project#5315)
* Fix fa3 window size setup (sgl-project#5316)
* chore: bump sgl-kernel v0.0.8.post2 (sgl-project#5317)
* feat: use fa3 mla by default on hopper (sgl-project#5210) Co-authored-by: yundai424 <[email protected]> Co-authored-by: hebiao064 <[email protected]>
* Fix: docs/backend/structured_outputs.ipynb (sgl-project#4884)
* Delete python/sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=… (sgl-project#5321)
* refine fused_moe tuning docs (sgl-project#5294)
* Support server based rollout in Verlengine (sgl-project#4848) Co-authored-by: Jin Pan <[email protected]> Co-authored-by: Chayenne <[email protected]> Co-authored-by: Jinn <[email protected]>
* [Feat] Add sparse attn to sgl-kernel (sgl-project#5327)
* fix: solve cu118 issue for cutlass mla (sgl-project#5331)
* chore: bump sgl-kernel v0.0.8.post3 (sgl-project#5332)
* ci: update release node (sgl-project#5333)
* fix: determine if flashinfer is installed (sgl-project#5336)
* feat: adapt merge_state (sgl-project#5337)
* misc: update sagemaker Dockerfile (sgl-project#5341)
* Fix: Ensure tensors for dist.broadcast match NCCL backend device (sgl-project#5322)
* docs: update adoption and sponsorship list with Oracle (sgl-project#5343)
* chore: upgrade sgl-kernel 0.0.8.post3 (sgl-project#5342)
* Fix typo: infight -> inflight (sgl-project#5357)
* [PD] Add transfer backend abstraction (sgl-project#5328)
* fix MLATokenToKVPoolHost get_size_per_token bug (sgl-project#5161) Co-authored-by: AniZpZ <[email protected]>
* fix sgl-project#5322 (sgl-project#5359)
* feat: update experiment_runner (sgl-project#5360)
* [DeepEP] Reduce routed scaling overhead (sgl-project#5277) Co-authored-by: Cheng Wan <[email protected]>
* Free metadata_buffer_index after transfer finished (sgl-project#5364)
* Fix DeepSeek DP Attention + torch compile (sgl-project#5367) Co-authored-by: ispobock <[email protected]>
* Support for Qwen2.5-VL Model in bitsandbytes Format (sgl-project#5003)
* Fix PD disaggregation bugs (sgl-project#5326)
* [PD Bug] fix MLA get_contiguous_buf_infos error (sgl-project#5384)
* [perf] experimental enhance fp8 per-tensor quant (sgl-project#5370)
* Apply deepseek cuda rope (sgl-project#5385) Co-authored-by: Yineng Zhang <[email protected]>
* apply fused moe gate in ds v3/r1 (sgl-project#5371) Co-authored-by: Yineng Zhang <[email protected]>
* fix: update test config (sgl-project#5392)
* [Fix] Turn off DeepGEMM by default (sgl-project#5263)
* minor clean up of sgl-kernel/CMakeLists.txt (sgl-project#5393)
* Add A800 shared experts fused MoE kernel tuning configs for DeepSeek V3/R1 (sgl-project#5368)
* Add H20 dtype fp8_w8a8 shared experts fused MoE kernel tuning configs for DeepSeek V3/R1 (sgl-project#5291) Co-authored-by: ximing.wxm <[email protected]>
* [fix/misc] remove duplicate row in deepseek v2 model (sgl-project#5279)
* chore: upgrade DeepGEMM (sgl-project#5395)
* fix: update pr-test-sgl-kernel (sgl-project#5399)
* kernel: support slightly faster merge_state_v2 cuda kernel (sgl-project#5381)
* chore: bump sgl-kernel 0.0.9 (sgl-project#5400)
* chore: upgrade sgl-kernel 0.0.9 (sgl-project#5401)
* Tiny fix DeepseekScalingRotaryEmbedding always use forward_native (sgl-project#5406)
* Fix bench_serving with random-ids (sgl-project#5214)
* [misc] fix ci flaky case (sgl-project#5352)
* [FIX] Fix concatenation error in capture_bs when open --disable-cuda-graph-padding and without MTP (sgl-project#5412)
* Support dynamic connection and TP 16 (sgl-project#5351) Co-authored-by: luoyuan.luo <[email protected]>
* Fix broadcast use cuda device lead to memory capacity unbalanced (sgl-project#5416)
* [PD] Fix dynamic port support and MLA buffer for Mooncake (sgl-project#5415) Signed-off-by: Shangming Cai <[email protected]> Co-authored-by: ybyang <[email protected]>
* Distinguish bootstrap key only in decode server (sgl-project#5422)
* [PD] Remove unused bootstrap param and fix port table type (sgl-project#5423)
* [minor] cleanup cmakelists.txt (sgl-project#5420)
* bugfix: fix merge_state_v2 cuda graph (sgl-project#5419)
* chore: bump sgl-kernel v0.0.9.post1 (sgl-project#5430)
* fix: solve release issue (sgl-project#5434)
* BLackwell cutlass mla: Add check for bad page size/block num combinations (sgl-project#5431)
* feat: update model_specific_adjustment (sgl-project#5344) Co-authored-by: hebiao064 <[email protected]>
* chore: upgrade sgl-kernel 0.0.9.post1 (sgl-project#5436)
* Fix ignore_eos parameter when loading a chat template (sgl-project#5264)
* add attention backend supporting matrix in the doc (sgl-project#5211) Co-authored-by: Stefan He <[email protected]>
* Support BNB quantization for llama/mllama (sgl-project#5038) Co-authored-by: Yuhao Yang <[email protected]>
* [Docs] Update start/install.md (sgl-project#5398)
* [Minor] Move torch.compile patch to a better place (sgl-project#5397)
* [Bug fix] need record start time in pd mode (sgl-project#5425)
* Support MHA with chunked prefix cache for DeepSeek chunked prefill (sgl-project#5113)
* chore: bump v0.4.5.post1 (sgl-project#5445)
* Revert "[SW-226289] rebase sglang to tag v0.4.5 (sgl-project#12)". This reverts commit 0eac714.

--------

Signed-off-by: Xinyuan Tong <[email protected]>
Signed-off-by: Shangming Cai <[email protected]>
Signed-off-by: Kay Yan <[email protected]>
Signed-off-by: windsonsea <[email protected]>
Co-authored-by: fzyzcjy <[email protected]>
Co-authored-by: Lianmin Zheng <[email protected]>
Co-authored-by: Juwan Yoo <[email protected]>
Co-authored-by: Qingquan Song <[email protected]>
Co-authored-by: Yineng Zhang <[email protected]>
Co-authored-by: chaobo jia <[email protected]>
Co-authored-by: rudy152 <[email protected]>
Co-authored-by: Fr4nk1in <[email protected]>
Co-authored-by: yinfan98 <[email protected]>
Co-authored-by: Baizhou Zhang <[email protected]>
Co-authored-by: Ke Bao <[email protected]>
Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: Adarsh Shirawalmath <[email protected]>
Co-authored-by: Sleepcoo <[email protected]>
Co-authored-by: SEPLOS <[email protected]>
Co-authored-by: ch-wan <[email protected]>
Co-authored-by: Zhiqiang Xie <[email protected]>
Co-authored-by: JieXin Liang <[email protected]>
Co-authored-by: Mick <[email protected]>
Co-authored-by: Yuhong Guo <[email protected]>
Co-authored-by: Jinyan Chen <[email protected]>
Co-authored-by: laixinn <[email protected]>
Co-authored-by: XinyuanTong <[email protected]>
Co-authored-by: GeLee <[email protected]>
Co-authored-by: Xiaoyu Zhang <[email protected]>
Co-authored-by: hebiao064 <[email protected]>
Co-authored-by: zcnrex <[email protected]>
Co-authored-by: Kaiyu Yang <[email protected]>
Co-authored-by: renxin <[email protected]>
Co-authored-by: saltyfish66 <[email protected]>
Co-authored-by: yuethe <[email protected]>
Co-authored-by: simveit <[email protected]>
Co-authored-by: Yifan Zhang <[email protected]>
Co-authored-by: Ravi Theja <[email protected]>
Co-authored-by: Ravi Theja Desetty <[email protected]>
Co-authored-by: AniZpZ <[email protected]>
Co-authored-by: 晟海 <[email protected]>
Co-authored-by: Tommy Yang <[email protected]>
Co-authored-by: Cheng Wan <[email protected]>
Co-authored-by: inkcherry <[email protected]>
Co-authored-by: mlmz <[email protected]>
Co-authored-by: shuaills <[email protected]>
Co-authored-by: Chang Su <[email protected]>
Co-authored-by: fzyzcjy <[email protected]>
Co-authored-by: HAI <[email protected]>
Co-authored-by: tianhaoyu <[email protected]>
Co-authored-by: DarkSharpness <[email protected]>
Co-authored-by: Yun Dai <[email protected]>
Co-authored-by: Hubert Lu <[email protected]>
Co-authored-by: huangtingwei <[email protected]>
Co-authored-by: kk <[email protected]>
Co-authored-by: wunhuang <[email protected]>
Co-authored-by: root <[email protected]>
Co-authored-by: Yubo Wang <[email protected]>
Co-authored-by: saienduri <[email protected]>
Co-authored-by: DangKai <[email protected]>
Co-authored-by: dangkai.dk <[email protected]>
Co-authored-by: shangmingc <[email protected]>
Co-authored-by: Ma Mingfei <[email protected]>
Co-authored-by: Chunyuan WU <[email protected]>
Co-authored-by: YanbingJiang <[email protected]>
Co-authored-by: blzheng <[email protected]>
Co-authored-by: Byron Hsu <[email protected]>
Co-authored-by: Liangsheng Yin <[email protected]>
Co-authored-by: zhaochenyang20 <[email protected]>
Co-authored-by: Trevor Morris <[email protected]>
Co-authored-by: Kay Yan <[email protected]>
Co-authored-by: grimoire <[email protected]>
Co-authored-by: HandH1998 <[email protected]>
Co-authored-by: Zhaoyang Hao <[email protected]>
Co-authored-by: Teng Ma <[email protected]>
Co-authored-by: Shangming Cai <[email protected]>
Co-authored-by: Xuchun Shang <[email protected]>
Co-authored-by: Richard Zou <[email protected]>
Co-authored-by: Elfie Guo <[email protected]>
Co-authored-by: Michael Yao <[email protected]>
Co-authored-by: Yusong Gao <[email protected]>
Co-authored-by: Zhaoyi Li <[email protected]>
Co-authored-by: lambert0312 <[email protected]>
Co-authored-by: tianlian yi <[email protected]>
Co-authored-by: Jin Pan <[email protected]>
Co-authored-by: Jinn <[email protected]>
Co-authored-by: yulei <[email protected]>
Co-authored-by: Yongtong Wu <[email protected]>
Co-authored-by: yhyang201 <[email protected]>
Co-authored-by: ybyang <[email protected]>
Co-authored-by: Ximingwang-09 <[email protected]>
Co-authored-by: ximing.wxm <[email protected]>
Co-authored-by: Yangcheng Li <[email protected]>
Co-authored-by: DefTruth <[email protected]>
Co-authored-by: Yuan Luo <[email protected]>
Co-authored-by: luoyuan.luo <[email protected]>
Co-authored-by: ybyang <[email protected]>
Co-authored-by: mRSun15 <[email protected]>
Co-authored-by: ryang <[email protected]>
Co-authored-by: Yuhao Yang <[email protected]>
1 parent: 0eac714 · commit: 3ecb4e3

File tree: 254 files changed (+19973 / -3716 lines)


.github/workflows/pr-test-amd.yml

Lines changed: 52 additions & 4 deletions
@@ -7,12 +7,14 @@ on:
       - "python/sglang/**"
       - "test/**"
       - "sgl-kernel/**"
+      - ".github/workflows/pr-test-amd.yml"
   pull_request:
     branches: [ main ]
     paths:
       - "python/sglang/**"
       - "test/**"
       - "sgl-kernel/**"
+      - ".github/workflows/pr-test-amd.yml"
   workflow_dispatch:

 concurrency:
@@ -36,12 +38,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
+          docker pull lmsysorg/sglang:v0.4.5-rocm630
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.3.post4-rocm630
+            lmsysorg/sglang:v0.4.5-rocm630

       - name: Install dependencies
         run: |
@@ -53,6 +55,10 @@ jobs:
           docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
           docker exec -w /human-eval ci_sglang pip install -e .

+          docker exec -w / ci_sglang mkdir -p /dummy-grok
+          mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
+          docker cp ./dummy-grok ci_sglang:/
+
       - name: Evaluate Accuracy
         timeout-minutes: 20
         run: |
@@ -76,12 +82,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
+          docker pull lmsysorg/sglang:v0.4.5-rocm630
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.3.post4-rocm630
+            lmsysorg/sglang:v0.4.5-rocm630

       - name: Install dependencies
         run: |
@@ -98,6 +104,48 @@ jobs:
         run: |
           docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py

+  bench-test-2-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    runs-on: linux-mi300-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup docker
+        run: |
+          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
+          if [ -f "/etc/podinfo/gha-render-devices" ]; then
+            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+          else
+            DEVICE_FLAG="--device /dev/dri"
+          fi
+          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
+            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
+            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
+            -w /sglang-checkout --name ci_sglang \
+            lmsysorg/sglang:v0.4.5-rocm630
+
+      - name: Install dependencies
+        run: |
+          docker exec ci_sglang pip install --upgrade pip
+          docker exec ci_sglang pip uninstall sgl-kernel -y || true
+          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
+          docker exec ci_sglang pip install -e "python[dev_hip]"
+
+          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
+          docker exec -w /human-eval ci_sglang pip install -e .
+
+          docker exec -w / ci_sglang mkdir -p /dummy-grok
+          mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
+          docker cp ./dummy-grok ci_sglang:/
+
+      - name: Evaluate Benchmark
+        timeout-minutes: 20
+        run: |
+          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_dummy_grok_models.py
+
   finish:
     if: always()
     needs: [

.github/workflows/pr-test-sgl-kernel.yml

Lines changed: 14 additions & 7 deletions
@@ -35,9 +35,14 @@ jobs:
     runs-on: sgl-kernel-build-node
     strategy:
       matrix:
-        python-version: ['3.9']
-        cuda-version: ['12.4']
-
+        include:
+          - python-version: '3.9'
+            cuda-version: '11.8'
+          - python-version: '3.9'
+            cuda-version: '12.4'
+          - python-version: '3.9'
+            cuda-version: '12.8'
+
     name: Build Wheel (CUDA ${{ matrix.cuda-version }})
     steps:
       - name: Cleanup
         run: |
@@ -52,13 +57,15 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}

-      - name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
+      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
+        if: github.event_name != 'push' || (matrix.cuda-version != '11.8' && matrix.cuda-version != '12.8')
        run: |
          cd sgl-kernel
          chmod +x ./build.sh
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

-      - name: Upload artifacts
+      - name: Upload artifacts (only for CUDA 12.4)
+        if: ${{ matrix.cuda-version == '12.4' }}
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
@@ -81,7 +88,7 @@ jobs:
       - name: Install
         run: |
           bash scripts/ci_install_dependency.sh
-          pip3 install torch==2.5.1 && pip3 install pytest && pip3 install vllm==0.7.2
+          pip3 install torch==2.5.1 && pip3 install pytest
           pip3 uninstall sgl-kernel -y || true
           pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
           pip3 list | grep sgl-kernel
@@ -128,7 +135,7 @@ jobs:
           pip3 uninstall sgl-kernel -y

   finish:
-    needs: [unit-test, mla-test, lint]
+    needs: [unit-test, mla-test, lint, build-wheels]
     runs-on: ubuntu-latest
     steps:
       - name: Check all dependent job statuses
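
Taken together, the two new `if` guards mean the three explicit matrix entries behave differently by trigger: pull requests build all three wheels, push events build only the CUDA 12.4 wheel, and only the CUDA 12.4 wheel is uploaded as an artifact. A sketch of the post-change `strategy` block, reconstructed from the hunks above (surrounding keys assumed unchanged):

# Reconstructed post-change matrix (from the diff above); each `include`
# entry is one explicit Python/CUDA combination rather than a cross-product.
strategy:
  matrix:
    include:
      - python-version: '3.9'
        cuda-version: '11.8'
      - python-version: '3.9'
        cuda-version: '12.4'
      - python-version: '3.9'
        cuda-version: '12.8'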

.github/workflows/pr-test.yml

Lines changed: 0 additions & 2 deletions
@@ -187,8 +187,6 @@ jobs:
         timeout-minutes: 10
         run: |
           cd test/srt
-          USE_VLLM_CUSTOM_ALLREDUCE=1 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1

       - name: Benchmark single latency + torch.compile (TP=2)
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+name: Build Blackwell Docker Image
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  build-dev:
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          docker-images: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: false
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push Blackwell Image
+        run: |
+          docker build . -f docker/Dockerfile.blackwell -t lmsysorg/sglang:blackwell --no-cache
+          docker push lmsysorg/sglang:blackwell
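
Because this new workflow declares `workflow_dispatch` alongside the nightly cron, the image build can also be started by hand. A sketch using the GitHub CLI, assuming an authenticated `gh`; the workflow is addressed by its display name here since the file's path is not shown in this view:

# Hypothetical manual trigger of the nightly Blackwell image build.
gh workflow run "Build Blackwell Docker Image" --repo sgl-project/sglang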

.github/workflows/release-pypi-kernel.yml

Lines changed: 0 additions & 44 deletions
This file was deleted.

.github/workflows/release-whl-kernel-cu128.yml renamed to .github/workflows/release-whl-kernel-cu118.yml

Lines changed: 4 additions & 4 deletions
@@ -1,4 +1,4 @@
-name: Release SGLang Kernel Wheel (cu128)
+name: Release SGLang Kernel Wheel (cu118)

 on:
   workflow_dispatch:
@@ -14,11 +14,11 @@ on:
 jobs:
   build-wheels:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: ubuntu-latest
+    runs-on: sgl-kernel-release-node
     strategy:
       matrix:
         python-version: ['3.9']
-        cuda-version: ['12.8']
+        cuda-version: ['11.8']

     steps:
       - uses: actions/checkout@v4
@@ -80,7 +80,7 @@ jobs:
           WHL_TOKEN: ${{ secrets.WHL_TOKEN }}

       - name: Update wheel index
-        run: python3 scripts/update_kernel_whl_index.py --cuda 128
+        run: python3 scripts/update_kernel_whl_index.py

       - name: Push wheel index
         run: |

.github/workflows/release-whl-kernel.yml

Lines changed: 45 additions & 11 deletions
@@ -1,25 +1,59 @@
-name: Release SGLang Kernel Wheel (cu118)
+name: Release SGLang Kernels

 on:
-  workflow_dispatch:
-    inputs:
-      tag_name:
-        type: string
   push:
     branches:
       - main
     paths:
       - sgl-kernel/python/sgl_kernel/version.py
+  workflow_dispatch:
+    inputs:
+      tag_name:
+        type: string
+        required: false
+
+concurrency:
+  group: release-sglang-kernels-${{ github.ref }}
+  cancel-in-progress: true

 jobs:
-  build-wheels:
+  build-cu124:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: ubuntu-latest
+    runs-on: sgl-kernel-release-node
     strategy:
       matrix:
         python-version: ['3.9']
-        cuda-version: ['11.8']
+        cuda-version: ['12.4']
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: 'recursive'
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}

+      - name: Build wheels
+        run: |
+          cd sgl-kernel
+          chmod +x ./build.sh
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
+
+      - name: Upload to PyPI
+        working-directory: sgl-kernel
+        run: |
+          pip install twine
+          python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+
+  build-cu128:
+    if: github.repository == 'sgl-project/sglang'
+    needs: build-cu124
+    runs-on: sgl-kernel-release-node
+    strategy:
+      matrix:
+        python-version: ['3.9']
+        cuda-version: ['12.8']
     steps:
       - uses: actions/checkout@v4
         with:
@@ -30,7 +64,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}

-      - name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
+      - name: Build wheels
         run: |
           cd sgl-kernel
           chmod +x ./build.sh
@@ -43,7 +77,7 @@ jobs:
           path: sgl-kernel/dist/*

   release:
-    needs: build-wheels
+    needs: build-cu128
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -80,7 +114,7 @@ jobs:
           WHL_TOKEN: ${{ secrets.WHL_TOKEN }}

       - name: Update wheel index
-        run: python3 scripts/update_kernel_whl_index.py
+        run: python3 scripts/update_kernel_whl_index.py --cuda 128

       - name: Push wheel index
         run: |
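
Pieced together from the hunks, the renamed workflow now chains three jobs: the CUDA 12.4 wheel builds first and is pushed straight to PyPI, the CUDA 12.8 wheel builds next and is uploaded as workflow artifacts, and the release job runs last and updates the cu128 wheel index. A skeleton of the resulting job graph (a sketch only; step bodies elided and unchanged sections assumed intact):

# Sketch of the post-change job ordering, not the full file.
jobs:
  build-cu124:            # builds the CUDA 12.4 wheel, uploads it to PyPI via twine
    if: github.repository == 'sgl-project/sglang'
    runs-on: sgl-kernel-release-node
  build-cu128:            # builds the CUDA 12.8 wheel, uploads workflow artifacts
    needs: build-cu124
    runs-on: sgl-kernel-release-node
  release:                # creates the release and updates the cu128 wheel index
    needs: build-cu128
    runs-on: ubuntu-latest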

.github/workflows/vllm-dependency-test.yml

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ jobs:
       run: |
         bash scripts/ci_install_dependency.sh
         pip install "vllm>=0.6.4.post1,<=0.7.2"
+        pip install "bitsandbytes>=0.44.0"

     - name: Run VLLM dependency tests
       timeout-minutes: 60

.gitmodules

Lines changed: 0 additions & 4 deletions
@@ -1,4 +0,0 @@
-[submodule "sgl-kernel/3rdparty/flashinfer"]
-	path = sgl-kernel/3rdparty/flashinfer
-	url = https://github.com/sgl-project/flashinfer.git
-	branch = sgl-kernel

README.md

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
