
Commit a19e2e2

PullRequest: 52 sgl_20250610_sync_tag047
Merge branch 'sgl_20250610_sync_tag047' of [email protected]:Theta/SGLang.git into main

https://code.alipay.com/Theta/SGLang/pull_requests/52

Reviewed-by: 剑川 <[email protected]>

* [Bugfix] Fix slice operation when chunk size mismatch (sgl-project#6697)
* [Bugfix] Fix ChatCompletion endpoint of mini_lb when stream is set (sgl-project#6703)
* [CI] Fix setup of disaggregation with different tp (sgl-project#6706)
* [PD] Remove Unnecessary Exception Handling for FastQueue.get() (sgl-project#6712)
* Fuse routed_scaling_factor in DeepSeek (sgl-project#6710)
* Overlap two kernels in DeepSeek with communication (sgl-project#6711)
* Minor refactor two-batch overlap (sgl-project#6682)
* Speed up when having padding tokens two-batch overlap (sgl-project#6668)
* [Feature] Support Flashinfer fp8 blockwise GEMM kernel on Blackwell (sgl-project#6479)
* Fix LoRA bench (sgl-project#6719)
* temp
* Fix PP for Qwen3 MoE (sgl-project#6709)
* [feat] triton kernel for get_last_loc (sgl-project#6676)
* [fix] more mem for draft_extend cuda_graph (sgl-project#6726)
* [PD] bug fix: Update status if nixl receiver sends a dummy req. (sgl-project#6720)
* Tune memory arguments on B200 (sgl-project#6718)
* Add DeepSeek-R1-0528 function call chat template (sgl-project#6725)
* refactor(tool call): Fix BaseFormatDetector tool_index issue and refactor `parse_streaming_increment` (sgl-project#6715)
* Add draft extend CUDA graph for Triton backend (sgl-project#6705)
* refactor apply_w8a8_block_fp8_linear in fp (sgl-project#6545)
* [PD] Support completion endpoint (sgl-project#6729)
* PD Rust LB (PO2) (sgl-project#6437)
* Super tiny enable sole usage of expert distribution metrics and update doc (sgl-project#6680)
* Support picking variants of EPLB algorithms (sgl-project#6728)
* Support tuning DeepEP configs (sgl-project#6742)
* [test] add ut and bm for get_last_loc (sgl-project#6746)
* Fix mem_fraction_static for AMD CI (sgl-project#6748)
* [fix][RL] Fix DeepSeekV3ForCausalLM.post_load_weights for multiple update weight (sgl-project#6265)
* Improve EPLB logical to physical dispatch map (sgl-project#6727)
* Update DeepSeek-R1-0528 function call chat template (sgl-project#6765)
* [PD] Optimize time out logic and add env var doc for mooncake (sgl-project#6761)
* Fix aiohttp 'Chunk too big' in bench_serving (sgl-project#6737)
* Support sliding window in triton backend (sgl-project#6509)
* Fix shared experts fusion error (sgl-project#6289)
* Fix one bug in the grouped-gemm triton kernel (sgl-project#6772)
* update llama4 chat template and pythonic parser (sgl-project#6679)
* feat(tool call): Enhance Llama32Detector for improved JSON parsing in non-stream (sgl-project#6784)
* Support token-level quantization for EP MoE (sgl-project#6782)
* Temporarily lower mmlu threshold for triton sliding window backend (sgl-project#6785)
* ci: relax test_function_call_required (sgl-project#6786)
* Add intel_amx backend for Radix Attention for CPU (sgl-project#6408)
* Fix incorrect LoRA weight loading for fused gate_up_proj (sgl-project#6734)
* fix(PD-disaggregation): Can not get local ip (sgl-project#6792)
* [FIX] mmmu bench serving result display error (sgl-project#6525) (sgl-project#6791)
* Bump torch to 2.7.0 (sgl-project#6788)
* chore: bump sgl-kernel v0.1.5 (sgl-project#6794)
* Improve profiler and integrate profiler in bench_one_batch_server (sgl-project#6787)
* chore: upgrade sgl-kernel v0.1.5 (sgl-project#6795)
* [Minor] Always append newline after image token when parsing chat message (sgl-project#6797)
* Update CI tests for Llama4 models (sgl-project#6421)
* [Feat] Enable PDL automatically on Hopper architecture (sgl-project#5981)
* chore: update blackwell docker (sgl-project#6800)
* misc: cache is_hopper_arch (sgl-project#6799)
* Remove contiguous before Flashinfer groupwise fp8 gemm (sgl-project#6804)
* Correctly abort the failed grammar requests & Improve the handling of abort (sgl-project#6803)
* [EP] Add cuda kernel for moe_ep_pre_reorder (sgl-project#6699)
* Add draft extend CUDA graph for flashinfer backend (sgl-project#6805)
* Refactor CustomOp to avoid confusing bugs (sgl-project#5382)
* Tiny log prefill time (sgl-project#6780)
* Tiny fix EPLB assertion about rebalancing period and recorder window size (sgl-project#6813)
* Add simple utility to dump tensors for debugging (sgl-project#6815)
* Fix profiles do not have consistent names (sgl-project#6811)
* Speed up rebalancing when using non-static dispatch algorithms (sgl-project#6812)
* [1/2] Add Kernel support for Cutlass based Fused FP4 MoE (sgl-project#6093)
* [Router] Fix k8s Service Discovery (sgl-project#6766)
* Add CPU optimized kernels for topk and rope fusions (sgl-project#6456)
* fix new_page_count_next_decode (sgl-project#6671)
* Fix wrong weight reference in dynamic EPLB (sgl-project#6818)
* Minor add metrics to expert location updater (sgl-project#6816)
* [Refactor] Rename `n_share_experts_fusion` as `num_fused_shared_experts` (sgl-project#6735)
* [FEAT] Add transformers backend support (sgl-project#5929)
* [fix] recover auto-dispatch for rmsnorm and rope (sgl-project#6745)
* fix ep_moe_reorder kernel bugs (sgl-project#6858)
* [Refactor] Multimodal data processing for VLM (sgl-project#6659)
* Decoder-only Scoring API (sgl-project#6460)
* feat: add dp-rank to KV events (sgl-project#6852)
* Set `num_fused_shared_experts` as `num_shared_experts` when shared_experts fusion is not disabled (sgl-project#6736)
* Fix one missing arg in DeepEP (sgl-project#6878)
* Support LoRA in TestOpenAIVisionServer and fix fused kv_proj loading bug (sgl-project#6861)
* support 1 shot allreduce in 1-node and 2-node using mscclpp (sgl-project#6277)
* Fix Qwen3MoE missing token padding optimization (sgl-project#6820)
* Tiny update error hints (sgl-project#6846)
* Support layerwise rebalancing experts (sgl-project#6851)
* Tiny allow profiler API to auto create directory (sgl-project#6865)
* Support Blackwell DeepEP docker images (sgl-project#6868)
* [EP] Add cuda kernel for moe_ep_post_reorder (sgl-project#6837)
* [theta]merge 0605
* oai: fix openAI client error with single request via batch api (sgl-project#6170)
* [PD] Fix potential perf spike caused by tracker gc and optimize doc (sgl-project#6764)
* Use deepgemm instead of triton for fused_qkv_a_proj_with_mqa (sgl-project#6890)
* [CUTLASS-FP4-MOE] Introduce CutlassMoEParams class for easy initialization of Cutlass Grouped Gemms Metadata (sgl-project#6887)
* bugfix(OAI): Fix image_data processing for jinja chat templates (sgl-project#6877)
* [CPU] enable CI for PRs, add Dockerfile and auto build task (sgl-project#6458)
* AITER backend extension and workload optimizations (sgl-project#6838)
* [theta]merge
* [theta]merge
* [Feature] Support Flashinfer fmha on Blackwell (sgl-project#6930)
* Fix a bug in abort & Improve docstrings for abort (sgl-project#6931)
* Tiny support customize DeepEP max dispatch tokens per rank (sgl-project#6934)
* Sync the changes on cuda graph runners (sgl-project#6932)
* [PD] Optimize transfer queue forward logic for dummy rank (sgl-project#6922)
* [Refactor] image data process in bench_serving (sgl-project#6879)
* [fix] logical_to_all_physical_map index 256 is out of bounds in EP parallel (sgl-project#6767)
* Add triton fused moe kernel config for E=257 on B200 (sgl-project#6939)
* [sgl-kernel] update deepgemm (sgl-project#6942)
* chore: bump sgl-kernel v0.1.6 (sgl-project#6943)
* Minor compile fused topk (sgl-project#6944)
* [Bugfix] pipeline parallelism and Eagle Qwen2 (sgl-project#6910)
* Tiny re-introduce profile id logging (sgl-project#6912)
* Add triton version as a fused_moe_triton config search key to avoid performance decrease in different Triton versions (sgl-project#5955)
* reduce torch.zeros overhead in moe align block size kernel (sgl-project#6369)
* chore: upgrade sgl-kernel v0.1.6 (sgl-project#6945)
* add fbgemm moe grouped gemm kernel benchmark (sgl-project#6924)
* [Docker] Add docker file for SGL Router (sgl-project#6915)
* Disabling mixed chunked prefill when eagle is enabled (sgl-project#6874)
* Add canary for EPLB rebalancing (sgl-project#6895)
* Refactor global_server_args_dict (sgl-project#6866)
* Fuse routed scaling factor in topk_reduce kernel (sgl-project#6220)
* Update server timeout time in AMD CI (sgl-project#6953)
* [misc] add is_cpu() (sgl-project#6950)
* Add H20 fused MoE kernel tuning configs for DeepSeek-R1/V3 (sgl-project#6885)
* Add a CUDA kernel for fusing mapping and weighted sum for MoE (sgl-project#6916)
* chore: bump sgl-kernel v0.1.6.post1 (sgl-project#6955)
* chore: upgrade sgl-kernel v0.1.6.post1 (sgl-project#6957)
* [DeepseekR1-FP4] Add Support for nvidia/DeepSeekR1-FP4 model (sgl-project#6853)
* Revert "Fuse routed scaling factor in topk_reduce kernel (sgl-project#6220)" (sgl-project#6968)
* [AMD] Add more tests to per-commit-amd (sgl-project#6926)
* chore: bump sgl-kernel v0.1.7 (sgl-project#6963)
* Slightly improve the sampler to skip unnecessary steps (sgl-project#6956)
* rebase h20 fused_moe config (sgl-project#6966)
* Fix CI and triton moe Configs (sgl-project#6974)
* Remove unnecessary kernels of num_token_non_padded (sgl-project#6965)
* Extend cuda graph capture bs for B200 (sgl-project#6937)
* Fuse routed scaling factor in deepseek (sgl-project#6970)
* Sync cuda graph runners (sgl-project#6976)
* Fix draft extend ut stability with flush cache (sgl-project#6979)
* Fix triton sliding window test case (sgl-project#6981)
* Fix expert distribution dumping causes OOM (sgl-project#6967)
* Minor remove one kernel for DeepSeek (sgl-project#6977)
* [perf][sgl-kernel] extend cutlass_mla_decode to support num_head < 128 (sgl-project#6929)
* Enable more unit tests for AMD CI (sgl-project#6983)
* Use torch.compile to fuse flash attention decode metadata preparation (sgl-project#6973)
* Eliminate stream sync to speed up LoRA batch init (sgl-project#6960)
* support qwen3 embedding (sgl-project#6990)
* Fix torch profiler bugs for bench_offline_throughput.py (sgl-project#6557)
* chore: upgrade flashinfer v0.2.6.post1 jit (sgl-project#6958)
* cleanup tmp dir (sgl-project#7007)
* chore: update pr test xeon (sgl-project#7008)
* Fix cutlass MLA gets almost zero accuracy (sgl-project#6998)
* Update amd nightly models CI (sgl-project#6992)
* feat: add direct routing strategy to DP worker (sgl-project#6884)
* Fallback to lower triton version for unfound fused moe configs (sgl-project#7013)
* Fix torchvision version for Blackwell (sgl-project#7015)
* Simplify prepare_extend_after_decode (sgl-project#6987)
* Migrate to assertEqual (sgl-project#6741)
* Fix torch version in blackwell dockerfile (sgl-project#7017)
* chore: update pr test xeon (sgl-project#7018)
* Update default settings for blackwell (sgl-project#7023)
* Support both approximate and exact expert distribution collection (sgl-project#6964)
* Add decode req pool (sgl-project#6980)
* [theta]merge 0610
* [theta]merge 0610
* [CI] Add CI workflow for sgl-router docker build (sgl-project#7027)
* Fix fused_moe triton configs (sgl-project#7029)
* CPU: map changes from developing branch in sgl-kernel (sgl-project#6833)
* chore: bump v0.4.7 (sgl-project#7038)
* Update README.md (sgl-project#7040)
1 parent 3f6db06 commit a19e2e2

File tree

462 files changed: +26024 −3128 lines

(Large commits have some content hidden by default; some file names below are not shown.)

.github/workflows/pr-test-amd.yml

Lines changed: 30 additions & 4 deletions
```diff
@@ -72,7 +72,7 @@ jobs:
       - name: Evaluate accuracy (TP=2)
         timeout-minutes: 30
         run: |
-          bash scripts/amd_ci_exec.sh python3 test_moe_eval_accuracy_large.py
+          bash scripts/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
 
   mla-test-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -220,8 +220,10 @@ jobs:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
     strategy:
+      fail-fast: false
       matrix:
         runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+        part: [0, 1, 2, 3, 4, 5]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -238,7 +240,7 @@ jobs:
       - name: Run test
         timeout-minutes: 40
         run: |
-          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd
+          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 6
 
   unit-test-backend-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -264,6 +266,30 @@ jobs:
       run: |
         bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
 
+  unit-test-backend-4-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      matrix:
+        runner: [linux-mi300-gpu-4]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 40
+        run: |
+          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-4-gpu-amd
+
   unit-test-backend-8-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
@@ -284,9 +310,9 @@ jobs:
       run: bash scripts/amd_ci_install_dependency.sh
 
       - name: Run test
-        timeout-minutes: 40
+        timeout-minutes: 60
         run: |
-          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd
+          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
 
   finish:
     if: always()
```
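The interesting change here is the suite sharding: `part: [0, 1, 2, 3, 4, 5]` in the matrix plus `--auto-partition-id ${{ matrix.part }} --auto-partition-size 6` splits the per-commit suite across six runners per GPU type. A minimal sketch of how such deterministic partitioning can work (hypothetical helper; the real logic lives in `run_suite.py` and may instead balance by estimated runtime):

```python
# Hypothetical sketch of CI suite auto-partitioning, not the actual
# run_suite.py implementation. Round-robin over a stable ordering so
# every job computes the same split without any coordination.
def auto_partition(files: list[str], part_id: int, num_parts: int) -> list[str]:
    assert 0 <= part_id < num_parts
    return [f for i, f in enumerate(sorted(files)) if i % num_parts == part_id]

suite = ["test_moe.py", "test_lora.py", "test_mla.py", "test_eagle.py"]  # made-up names
for part in range(6):
    print(part, auto_partition(suite, part, 6))
```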

.github/workflows/pr-test-sgl-kernel.yml

Lines changed: 2 additions & 3 deletions
```diff
@@ -36,8 +36,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: '3.9'
-            cuda-version: '11.8'
           - python-version: '3.9'
             cuda-version: '12.4'
           - python-version: '3.9'
@@ -88,7 +86,7 @@ jobs:
       - name: Install
         run: |
           bash scripts/ci_install_dependency.sh
-          pip3 install torch==2.6.0 torchvision && pip3 install pytest
+          pip3 install torch==2.7.1 torchvision && pip3 install pytest
           pip3 uninstall sgl-kernel -y || true
           pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
           pip3 list | grep sgl-kernel
@@ -120,6 +118,7 @@ jobs:
       - name: Install
         run: |
           bash scripts/ci_install_dependency.sh
+          pip3 install torch==2.7.1 torchvision
           pip3 uninstall sgl-kernel -y || true
           pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
           pip3 list | grep sgl-kernel
```

.github/workflows/pr-test-xeon.yml

Lines changed: 81 additions & 0 deletions
New file:

```yaml
name: PR Test (Xeon)

on:
  pull_request:
    branches:
      - main
  workflow_dispatch:

concurrency:
  group: pr-test-xeon
  cancel-in-progress: false

jobs:
  build-test:
    if: github.event_name == 'pull_request'
    runs-on: sgl-kernel-release-node
    strategy:
      matrix:
        build_type: ['all']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Build and Push
        run: |
          version=$(cat python/sglang/version.py | cut -d'"' -f2)
          tag=v${version}-xeon

          docker build . -f docker/Dockerfile.xeon -t sglang_xeon --no-cache

      - name: Run container
        run: |
          docker run -dt \
            -v ${{ github.workspace }}:/sglang-checkout/ --ipc=host \
            --name ci_sglang_xeon \
            sglang_xeon

      - name: Install Dependency
        timeout-minutes: 20
        run: |
          docker exec ci_sglang_xeon bash -c "python3 -m pip install --upgrade pip"
          docker exec ci_sglang_xeon pip uninstall sgl-kernel -y || true
          docker exec -w /sglang-checkout/sgl-kernel ci_sglang_xeon bash -c "cp pyproject_cpu.toml pyproject.toml && pip install -v ."
          docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c "pip install -e "python[all_cpu]""
          docker exec ci_sglang_xeon bash -c "python3 -m pip install pytest expecttest"

      - name: Check AMX Support
        id: check_amx
        timeout-minutes: 5
        run: |
          docker exec -w /sglang-checkout/ ci_sglang_xeon \
            bash -c "python3 -c 'import torch; import sgl_kernel; assert torch._C._cpu._is_amx_tile_supported(); assert hasattr(torch.ops.sgl_kernel, \"convert_weight_packed\"); '"
        continue-on-error: true

      - name: Run UT Cases
        if: steps.check_amx.outcome == 'success'
        timeout-minutes: 20
        run: |
          docker exec -w /sglang-checkout/ ci_sglang_xeon \
            bash -c "cd ./test/srt && python3 run_suite.py --suite per-commit-cpu"

      - name: Cleanup container
        if: always()
        run: |
          docker rm -f ci_sglang_xeon || true

  finish:
    if: always()
    needs: [build-test]
    runs-on: ubuntu-24.04
    steps:
      - name: Check all dependent job statuses
        run: |
          results=(${{ join(needs.*.result, ' ') }})
          for result in "${results[@]}"; do
            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
              echo "Job failed with result: $result"
              exit 1
            fi
          done
          echo "All jobs completed successfully"
          exit 0
```
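The AMX probe runs with `continue-on-error: true`, and the UT step is gated on `steps.check_amx.outcome == 'success'`, so runners without AMX skip the suite instead of failing the workflow. The probe pulled out of the workflow into a standalone script (same torch/sgl_kernel calls as above; the printed verdict is added here):

```python
# Standalone version of the CI AMX probe above. The two checks mirror the
# workflow step: AMX tile support in the CPU, and the packed-weight kernel
# registered by sgl_kernel.
import torch
import sgl_kernel  # noqa: F401  (importing registers torch.ops.sgl_kernel.*)

has_amx = torch._C._cpu._is_amx_tile_supported()
has_kernel = hasattr(torch.ops.sgl_kernel, "convert_weight_packed")
print(f"AMX tile support: {has_amx}; convert_weight_packed available: {has_kernel}")
```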

.github/workflows/release-docker-deepep.yml

Lines changed: 14 additions & 3 deletions
```diff
@@ -9,6 +9,17 @@ jobs:
   build-dev:
     if: ${{ github.repository == 'sgl-project/sglang' }}
     runs-on: ubuntu-22.04
+
+    strategy:
+      matrix:
+        variant:
+          - base: lmsysorg/sglang:latest
+            tag: deepep
+          - base: lmsysorg/sglang:dev
+            tag: dev-deepep
+          - base: lmsysorg/sglang:blackwell
+            tag: blackwell-deepep
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -30,7 +41,7 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
-      - name: Build and Push DeepEP Image
+      - name: Build and Push Docker Image
         run: |
-          docker build . -f docker/Dockerfile.deepep -t lmsysorg/sglang:deepep --no-cache
-          docker push lmsysorg/sglang:deepep
+          docker build . -f docker/Dockerfile.deepep --build-arg BASE_IMAGE=${{ matrix.variant.base }} -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache
+          docker push lmsysorg/sglang:${{ matrix.variant.tag }}
```
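With the matrix in place, one workflow run now produces three images instead of one, and `Dockerfile.deepep` presumably starts from a parameterized `BASE_IMAGE` build-arg. A small sketch of the build/push pairs the matrix expands to (variant values copied from the workflow above):

```python
# Sketch: the three build/push command pairs generated by the matrix above.
variants = [
    {"base": "lmsysorg/sglang:latest", "tag": "deepep"},
    {"base": "lmsysorg/sglang:dev", "tag": "dev-deepep"},
    {"base": "lmsysorg/sglang:blackwell", "tag": "blackwell-deepep"},
]
for v in variants:
    print(
        f"docker build . -f docker/Dockerfile.deepep "
        f"--build-arg BASE_IMAGE={v['base']} -t lmsysorg/sglang:{v['tag']} --no-cache"
    )
    print(f"docker push lmsysorg/sglang:{v['tag']}")
```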

.github/workflows/release-docker-dev-deepep.yml

Lines changed: 0 additions & 36 deletions
This file was deleted.
(new file; name hidden in this view)

Lines changed: 30 additions & 0 deletions
```yaml
name: Release SGLang Router Docker Image

on:
  push:
    branches:
      - main
    paths:
      - "sgl-router/py_src/sglang_router/version.py"
  workflow_dispatch:

jobs:
  publish:
    if: github.repository == 'sgl-project/sglang'
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and Push
        run: |
          version=$(cat sgl-router/py_src/sglang_router/version.py | cut -d'"' -f2)
          tag=v${version}

          docker build . -f docker/Dockerfile.router -t lmsysorg/sglang-router:${tag} --no-cache
          docker push lmsysorg/sglang-router:${tag}
```
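Both release workflows derive the image tag from a `version.py` with `cut -d'"' -f2`: splitting on double quotes makes field 2 the version literal. A Python equivalent, assuming a conventional `__version__ = "..."` file (the version value in the comment is hypothetical):

```python
# Python equivalent of: version=$(cat .../version.py | cut -d'"' -f2)
# '__version__ = "0.1.4"\n'.split('"') -> ['__version__ = ', '0.1.4', '\n']
from pathlib import Path

text = Path("sgl-router/py_src/sglang_router/version.py").read_text()
version = text.split('"')[1]
print(f"lmsysorg/sglang-router:v{version}")
```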
(new file; name hidden in this view)

Lines changed: 35 additions & 0 deletions
```yaml
name: Release Docker Images

on:
  push:
    branches:
      - main
    paths:
      - "python/sglang/version.py"
  workflow_dispatch:

jobs:
  publish:
    if: github.repository == 'sgl-project/sglang'
    runs-on: ubuntu-24.04
    environment: 'prod'
    strategy:
      matrix:
        build_type: ['all']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and Push
        run: |
          version=$(cat python/sglang/version.py | cut -d'"' -f2)
          tag=v${version}-xeon

          docker build . -f docker/Dockerfile.xeon -t lmsysorg/sglang:${tag} --no-cache
          docker push lmsysorg/sglang:${tag}
```

.github/workflows/vllm-dependency-test.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -30,7 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm==0.8.4"
+          pip install "vllm==0.9.0.1"
           pip install "bitsandbytes>=0.44.0"
 
       - name: Run VLLM dependency tests
```

.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -233,3 +233,5 @@ compile_commands.json
 
 # Rust lib
 Cargo.lock
+
+lmms-eval
```

Makefile

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ format: check-deps ## Format modified Python files using isort and black
 FILES_TO_UPDATE = docker/Dockerfile.rocm \
                   python/pyproject.toml \
                   python/sglang/version.py \
-                  docs/developer/setup_github_runner.md \
+                  docs/references/setup_github_runner.md \
                   docs/start/install.md \
                   benchmark/deepseek_v3/README.md
```

README.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -12,7 +12,7 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+| [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -44,7 +44,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -63,7 +63,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, serving trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
```
benchmark/deepseek_v3/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -33,7 +33,7 @@ Add [performance optimization options](#performance-optimization-options) as nee
 
 ```bash
 # Installation
-pip install "sglang[all]>=0.4.6.post5"
+pip install "sglang[all]>=0.4.7"
 
 # Launch
 python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
````
