diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
index ca910dccb0d06..604a88393fd95 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
@@ -191,6 +191,14 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
                     })
                     .try_into()
                     .unwrap(),
+                _ if idx_ty.is_simd()
+                    && matches!(
+                        idx_ty.simd_size_and_type(fx.tcx).1.kind(),
+                        ty::Uint(ty::UintTy::U32)
+                    ) =>
+                {
+                    idx_ty.simd_size_and_type(fx.tcx).0.try_into().unwrap()
+                }
                 _ => {
                     fx.tcx.dcx().span_err(
                         span,
@@ -213,6 +221,8 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
 
             let total_len = lane_count * 2;
 
+            // FIXME: this is a terrible abstraction-breaking hack.
+            // Find a way to reuse `immediate_const_vector` from `codegen_ssa` instead.
             let indexes = {
                 use rustc_middle::mir::interpret::*;
                 let idx_const = match &idx.node {
diff --git a/compiler/rustc_codegen_gcc/src/builder.rs b/compiler/rustc_codegen_gcc/src/builder.rs
index 47b378cc1cd82..6ba678e2e7c65 100644
--- a/compiler/rustc_codegen_gcc/src/builder.rs
+++ b/compiler/rustc_codegen_gcc/src/builder.rs
@@ -1923,15 +1923,11 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
         v2: RValue<'gcc>,
         mask: RValue<'gcc>,
     ) -> RValue<'gcc> {
-        let struct_type = mask.get_type().is_struct().expect("mask should be of struct type");
-
         // TODO(antoyo): use a recursive unqualified() here.
         let vector_type = v1.get_type().unqualified().dyncast_vector().expect("vector type");
         let element_type = vector_type.get_element_type();
         let vec_num_units = vector_type.get_num_units();
 
-        let mask_num_units = struct_type.get_field_count();
-        let mut vector_elements = vec![];
         let mask_element_type = if element_type.is_integral() {
             element_type
         } else {
@@ -1942,19 +1938,39 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
             #[cfg(not(feature = "master"))]
             self.int_type
         };
-        for i in 0..mask_num_units {
-            let field = struct_type.get_field(i as i32);
-            vector_elements.push(self.context.new_cast(
-                self.location,
-                mask.access_field(self.location, field).to_rvalue(),
-                mask_element_type,
-            ));
-        }
+
+        let mut mask_elements = if let Some(vector_type) = mask.get_type().dyncast_vector() {
+            let mask_num_units = vector_type.get_num_units();
+            let mut mask_elements = vec![];
+            for i in 0..mask_num_units {
+                let index = self.context.new_rvalue_from_long(self.cx.type_u32(), i as _);
+                mask_elements.push(self.context.new_cast(
+                    self.location,
+                    self.extract_element(mask, index).to_rvalue(),
+                    mask_element_type,
+                ));
+            }
+            mask_elements
+        } else {
+            let struct_type = mask.get_type().is_struct().expect("mask should be of struct type");
+            let mask_num_units = struct_type.get_field_count();
+            let mut mask_elements = vec![];
+            for i in 0..mask_num_units {
+                let field = struct_type.get_field(i as i32);
+                mask_elements.push(self.context.new_cast(
+                    self.location,
+                    mask.access_field(self.location, field).to_rvalue(),
+                    mask_element_type,
+                ));
+            }
+            mask_elements
+        };
+        let mask_num_units = mask_elements.len();
 
         // NOTE: the mask needs to be the same length as the input vectors, so add the missing
         // elements in the mask if needed.
         for _ in mask_num_units..vec_num_units {
-            vector_elements.push(self.context.new_rvalue_zero(mask_element_type));
+            mask_elements.push(self.context.new_rvalue_zero(mask_element_type));
         }
 
         let result_type = self.context.new_vector_type(element_type, mask_num_units as u64);
@@ -1998,7 +2014,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
 
         let new_mask_num_units = std::cmp::max(mask_num_units, vec_num_units);
         let mask_type = self.context.new_vector_type(mask_element_type, new_mask_num_units as u64);
-        let mask = self.context.new_rvalue_from_vector(self.location, mask_type, &vector_elements);
+        let mask = self.context.new_rvalue_from_vector(self.location, mask_type, &mask_elements);
         let result = self.context.new_rvalue_vector_perm(self.location, v1, v2, mask);
 
         if vec_num_units != mask_num_units {
diff --git a/compiler/rustc_codegen_gcc/src/intrinsic/simd.rs b/compiler/rustc_codegen_gcc/src/intrinsic/simd.rs
index 8da1df3be1534..96a833ccaf2b6 100644
--- a/compiler/rustc_codegen_gcc/src/intrinsic/simd.rs
+++ b/compiler/rustc_codegen_gcc/src/intrinsic/simd.rs
@@ -353,19 +353,24 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
     }
 
     if name == sym::simd_shuffle {
-        // Make sure this is actually an array, since typeck only checks the length-suffixed
+        // Make sure this is actually an array or SIMD vector, since typeck only checks the length-suffixed
         // version of this intrinsic.
-        let n: u64 = match *args[2].layout.ty.kind() {
+        let idx_ty = args[2].layout.ty;
+        let n: u64 = match idx_ty.kind() {
             ty::Array(ty, len) if matches!(*ty.kind(), ty::Uint(ty::UintTy::U32)) => {
                 len.try_eval_target_usize(bx.cx.tcx, ty::ParamEnv::reveal_all()).unwrap_or_else(
                     || span_bug!(span, "could not evaluate shuffle index array length"),
                 )
             }
-            _ => return_error!(InvalidMonomorphization::SimdShuffle {
-                span,
-                name,
-                ty: args[2].layout.ty
-            }),
+            _ if idx_ty.is_simd()
+                && matches!(
+                    idx_ty.simd_size_and_type(bx.cx.tcx).1.kind(),
+                    ty::Uint(ty::UintTy::U32)
+                ) =>
+            {
+                idx_ty.simd_size_and_type(bx.cx.tcx).0
+            }
+            _ => return_error!(InvalidMonomorphization::SimdShuffle { span, name, ty: idx_ty }),
         };
         require_simd!(ret_ty, InvalidMonomorphization::SimdReturn { span, name, ty: ret_ty });
 
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index f5558723d11bf..5d32ef0d9d65f 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -1279,19 +1279,24 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
     }
 
     if name == sym::simd_shuffle {
-        // Make sure this is actually an array, since typeck only checks the length-suffixed
+        // Make sure this is actually an array or SIMD vector, since typeck only checks the length-suffixed
         // version of this intrinsic.
-        let n: u64 = match args[2].layout.ty.kind() {
+        let idx_ty = args[2].layout.ty;
+        let n: u64 = match idx_ty.kind() {
             ty::Array(ty, len) if matches!(ty.kind(), ty::Uint(ty::UintTy::U32)) => {
                 len.try_eval_target_usize(bx.cx.tcx, ty::ParamEnv::reveal_all()).unwrap_or_else(
                     || span_bug!(span, "could not evaluate shuffle index array length"),
                 )
             }
-            _ => return_error!(InvalidMonomorphization::SimdShuffle {
-                span,
-                name,
-                ty: args[2].layout.ty
-            }),
+            _ if idx_ty.is_simd()
+                && matches!(
+                    idx_ty.simd_size_and_type(bx.cx.tcx).1.kind(),
+                    ty::Uint(ty::UintTy::U32)
+                ) =>
+            {
+                idx_ty.simd_size_and_type(bx.cx.tcx).0
+            }
+            _ => return_error!(InvalidMonomorphization::SimdShuffle { span, name, ty: idx_ty }),
         };
 
         let (out_len, out_ty) = require_simd!(ret_ty, SimdReturn);
diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index 221724d7b4ae9..5982819809937 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -232,7 +232,7 @@ extern "rust-intrinsic" {
     ///
     /// `T` must be a vector.
     ///
-    /// `U` must be a **const** array of `i32`s. This means it must either refer to a named
+    /// `U` must be a **const** array or vector of `u32`s. This means it must either refer to a named
     /// const or be given as an inline const expression (`const { ... }`).
     ///
     /// `V` must be a vector with the same element type as `T` and the same length as `U`.
diff --git a/tests/ui/simd/shuffle.rs b/tests/ui/simd/shuffle.rs
index 09926d95557cd..dc0d688284e3c 100644
--- a/tests/ui/simd/shuffle.rs
+++ b/tests/ui/simd/shuffle.rs
@@ -6,15 +6,20 @@
 #![allow(incomplete_features)]
 #![feature(adt_const_params)]
 
+use std::marker::ConstParamTy;
+
 extern "rust-intrinsic" {
     fn simd_shuffle<T, I, U>(a: T, b: T, i: I) -> U;
 }
 
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, ConstParamTy, PartialEq, Eq)]
 #[repr(simd)]
 struct Simd<T, const N: usize>([T; N]);
 
-pub unsafe fn __shuffle_vector16<const IDX: [u32; 16], T, U>(x: T, y: T) -> U {
+unsafe fn __shuffle_vector16<const IDX: [u32; 16], T, U>(x: T, y: T) -> U {
+    simd_shuffle(x, y, IDX)
+}
+unsafe fn __shuffle_vector16_v2<const IDX: Simd<u32, 16>, T, U>(x: T, y: T) -> U {
     simd_shuffle(x, y, IDX)
 }
 
@@ -30,6 +35,17 @@ fn main() {
         let y: Simd<u8, 2> = simd_shuffle(a, b, I2);
         assert_eq!(y.0, [1, 5]);
     }
+    // Test that we can also use a SIMD vector instead of a normal array for the shuffle.
+    const I1_SIMD: Simd<u32, 4> = Simd([0, 2, 4, 6]);
+    const I2_SIMD: Simd<u32, 2> = Simd([1, 5]);
+    unsafe {
+        let x: Simd<u8, 4> = simd_shuffle(a, b, I1_SIMD);
+        assert_eq!(x.0, [0, 2, 4, 6]);
+
+        let y: Simd<u8, 2> = simd_shuffle(a, b, I2_SIMD);
+        assert_eq!(y.0, [1, 5]);
+    }
+
     // Test that an indirection (via an unnamed constant)
     // through a const generic parameter also works.
     // See https://github.com/rust-lang/rust/issues/113500 for details.
@@ -42,4 +58,11 @@ fn main() {
             Simd<u8, 16>,
         >(a, b);
     }
+    unsafe {
+        __shuffle_vector16_v2::<
+            { Simd([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]) },
+            Simd<u8, 16>,
+            Simd<u8, 16>,
+        >(a, b);
+    }
 }