diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 38fa77e41bb53..abeb011ae7c76 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -117,6 +117,14 @@ else()
   set(CMAKE_CXX_EXTENSIONS NO)
 endif()
 
+# Warn when no GPU runtimes target enables "openmp" (no device library is built).
+if(NOT "openmp" IN_LIST RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES AND
+   NOT "openmp" IN_LIST RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES)
+  message(WARNING "Building the offloading runtime with no device library. See "
+                  "https://openmp.llvm.org/SupportAndFAQ.html#q-how-to-build-an-openmp-gpu-offload-capable-compiler "
+                  "for more information.")
+endif()
+
 # Set the path of all resulting libraries to a unified location so that it can
 # be used for testing.
 set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@@ -371,7 +379,6 @@ add_subdirectory(tools/offload-tblgen)
 
 # Build offloading plugins and device RTLs if they are available.
 add_subdirectory(plugins-nextgen)
-add_subdirectory(DeviceRTL)
 add_subdirectory(tools)
 add_subdirectory(docs)
 
diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake
index 0236f5f0b6987..7f9b5e1d26971 100644
--- a/offload/cmake/caches/AMDGPUBot.cmake
+++ b/offload/cmake/caches/AMDGPUBot.cmake
@@ -19,3 +19,9 @@ set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE
 
 set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
-set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
+# Must be a CACHE entry; a plain variable set in a cache script is discarded.
+set(CLANG_DEFAULT_RTLIB "compiler-rt" CACHE STRING "")
+
+# Build the OpenMP device runtime for AMDGPU as a separate runtimes target in
+# addition to the default (host) runtimes build.
+set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "openmp" CACHE STRING "")
diff --git a/offload/cmake/caches/Offload.cmake b/offload/cmake/caches/Offload.cmake
index 5533a6508f5d5..3747a1d3eb299 100644
--- a/offload/cmake/caches/Offload.cmake
+++ b/offload/cmake/caches/Offload.cmake
@@ -5,5 +5,5 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
 set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda CACHE STRING "") 
 set(RUNTIMES_nvptx64-nvidia-cuda_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/NVPTX.cmake" CACHE STRING "")
 set(RUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES "${CMAKE_SOURCE_DIR}/../libcxx/cmake/caches/AMDGPU.cmake" CACHE STRING "")
-set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
-set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc;openmp;libcxx;libcxxabi" CACHE STRING "")
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index ab34851d8961c..3b8dbbc78b375 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -88,6 +88,14 @@ else()
   set(CMAKE_CXX_EXTENSIONS NO)
 endif()
 
+# Configure checks need extra flags when the compiler targets a GPU directly.
+if("${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
+  string(APPEND CMAKE_REQUIRED_FLAGS " -nogpulib")
+elseif("${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
+  string(APPEND CMAKE_REQUIRED_FLAGS
+         " -flto -c -Wno-unused-command-line-argument")
+endif()
+
 # Check and set up common compiler flags.
 include(config-ix)
 include(HandleOpenMPOptions)
@@ -122,35 +130,41 @@ else()
   get_clang_resource_dir(LIBOMP_HEADERS_INSTALL_PATH SUBDIR include)
 endif()
 
-# Build host runtime library, after LIBOMPTARGET variables are set since they are needed
-# to enable time profiling support in the OpenMP runtime.
-add_subdirectory(runtime)
-
-set(ENABLE_OMPT_TOOLS ON)
-# Currently tools are not tested well on Windows or MacOS X.
-if (APPLE OR WIN32)
-  set(ENABLE_OMPT_TOOLS OFF)
-endif()
+# Use the current compiler target to determine the appropriate runtime to build.
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
+  add_subdirectory(device)
+else()
+  # Build host runtime library, after LIBOMPTARGET variables are set since they
+  # are needed to enable time profiling support in the OpenMP runtime.
+  add_subdirectory(runtime)
+
+  set(ENABLE_OMPT_TOOLS ON)
+  # Currently tools are not tested well on Windows or MacOS X.
+  if (APPLE OR WIN32)
+    set(ENABLE_OMPT_TOOLS OFF)
+  endif()
 
-option(OPENMP_ENABLE_OMPT_TOOLS "Enable building ompt based tools for OpenMP."
-       ${ENABLE_OMPT_TOOLS})
-if (OPENMP_ENABLE_OMPT_TOOLS)
-  add_subdirectory(tools)
-endif()
+  option(OPENMP_ENABLE_OMPT_TOOLS "Enable building ompt based tools for OpenMP."
+         ${ENABLE_OMPT_TOOLS})
+  if (OPENMP_ENABLE_OMPT_TOOLS)
+    add_subdirectory(tools)
+  endif()
 
-# Propagate OMPT support to offload
-if(NOT ${OPENMP_STANDALONE_BUILD})
-  set(LIBOMP_HAVE_OMPT_SUPPORT ${LIBOMP_HAVE_OMPT_SUPPORT} PARENT_SCOPE)
-  set(LIBOMP_OMP_TOOLS_INCLUDE_DIR ${LIBOMP_OMP_TOOLS_INCLUDE_DIR} PARENT_SCOPE)
-endif()
+  # Propagate OMPT support to offload
+  if(NOT ${OPENMP_STANDALONE_BUILD})
+    set(LIBOMP_HAVE_OMPT_SUPPORT ${LIBOMP_HAVE_OMPT_SUPPORT} PARENT_SCOPE)
+    set(LIBOMP_OMP_TOOLS_INCLUDE_DIR ${LIBOMP_OMP_TOOLS_INCLUDE_DIR} PARENT_SCOPE)
+  endif()
 
-option(OPENMP_MSVC_NAME_SCHEME "Build dll with MSVC naming scheme." OFF)
+  option(OPENMP_MSVC_NAME_SCHEME "Build dll with MSVC naming scheme." OFF)
 
-# Build libompd.so
-add_subdirectory(libompd)
+  # Build libompd.so
+  add_subdirectory(libompd)
 
-# Build documentation
-add_subdirectory(docs)
+  # Build documentation
+  add_subdirectory(docs)
 
-# Now that we have seen all testsuites, create the check-openmp target.
-construct_check_openmp_target()
+  # Now that we have seen all testsuites, create the check-openmp target.
+  construct_check_openmp_target()
+endif()
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
new file mode 100644
index 0000000000000..619890863ca0c
--- /dev/null
+++ b/openmp/device/CMakeLists.txt
@@ -0,0 +1,112 @@
+# Build the OpenMP GPU device runtime. All sources are compiled to LLVM bitcode
+# and merged by the linker's LTO pass into libomptarget-<arch>.bc, which is also
+# wrapped into the 'ompdevice' static library.
+
+# Ensure the compiler is a valid clang when building the GPU target.
+set(req_ver "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
+if(LLVM_VERSION_MAJOR AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND
+   "${CMAKE_CXX_COMPILER_VERSION}" VERSION_EQUAL "${req_ver}"))
+  message(FATAL_ERROR "Cannot build GPU device runtime. CMake compiler "
+                      "'${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}' "
+                      " is not 'Clang ${req_ver}'.")
+endif()
+
+set(src_files
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Allocator.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Configuration.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Debug.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Kernel.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/LibC.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Mapping.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Misc.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Parallelism.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Profiling.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Reduction.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/State.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Synchronization.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Tasking.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceUtils.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
+)
+
+# Flags applied to every device source file.
+list(APPEND compile_options -flto)
+list(APPEND compile_options -fvisibility=hidden)
+list(APPEND compile_options -nogpulib)
+list(APPEND compile_options -nostdlibinc)
+list(APPEND compile_options -fno-rtti)
+list(APPEND compile_options -fno-exceptions)
+list(APPEND compile_options -fconvergent-functions)
+list(APPEND compile_options -Wno-unknown-cuda-version)
+if(LLVM_DEFAULT_TARGET_TRIPLE)
+  list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
+endif()
+
+# We disable the slp vectorizer during the runtime optimization to avoid
+# vectorized accesses to the shared state. Generally, those are "good" but
+# the optimizer pipeline (esp. Attributor) does not fully support vectorized
+# instructions yet and we end up missing out on way more important constant
+# propagation. That said, we will run the vectorizer again after the runtime
+# has been linked into the user program.
+list(APPEND compile_options "SHELL:-mllvm -vectorize-slp=false")
+
+# Select the output name and architecture specific flags. These must be appended
+# to compile_options, the only list handed to target_compile_options() below.
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
+  set(target_name "amdgpu")
+  list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none")
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
+       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
+  set(target_name "nvptx")
+  list(APPEND compile_options --cuda-feature=+ptx63)
+else()
+  message(FATAL_ERROR "Cannot build GPU device runtime. The target triple is "
+                      "neither 'amdgcn' nor 'nvptx'.")
+endif()
+
+# Trick to combine these into a bitcode file via the linker's LTO pass.
+add_executable(libompdevice ${src_files})
+set_target_properties(libompdevice PROPERTIES
+  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  LINKER_LANGUAGE CXX
+  BUILD_RPATH ""
+  INSTALL_RPATH ""
+  RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
+
+# If the user built with the GPU C library enabled we will use that instead.
+if(LIBOMPTARGET_GPU_LIBC_SUPPORT)
+  target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+endif()
+target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+
+target_include_directories(libompdevice PRIVATE
+                           ${CMAKE_CURRENT_SOURCE_DIR}/include
+                           ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+                           ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
+target_compile_options(libompdevice PRIVATE ${compile_options})
+target_link_options(libompdevice PRIVATE
+                    "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+if(LLVM_DEFAULT_TARGET_TRIPLE)
+  target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+endif()
+install(TARGETS libompdevice
+        PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+        DESTINATION ${OPENMP_INSTALL_LIBDIR})
+
+# Expose the linked bitcode file as an imported object library so it can be
+# archived below.
+add_library(ompdevice.all_objs OBJECT IMPORTED)
+set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
+             ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-${target_name}.bc)
+
+# Archive all the object files generated above into a static library
+add_library(ompdevice STATIC)
+add_dependencies(ompdevice libompdevice)
+set_target_properties(ompdevice PROPERTIES
+  ARCHIVE_OUTPUT_DIRECTORY "${OPENMP_INSTALL_LIBDIR}"
+  ARCHIVE_OUTPUT_NAME ompdevice
+  LINKER_LANGUAGE CXX
+)
+target_link_libraries(ompdevice PRIVATE ompdevice.all_objs)
+install(TARGETS ompdevice ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
diff --git a/offload/DeviceRTL/include/Allocator.h b/openmp/device/include/Allocator.h
similarity index 100%
rename from offload/DeviceRTL/include/Allocator.h
rename to openmp/device/include/Allocator.h
diff --git a/offload/DeviceRTL/include/Configuration.h b/openmp/device/include/Configuration.h
similarity index 100%
rename from offload/DeviceRTL/include/Configuration.h
rename to openmp/device/include/Configuration.h
diff --git a/offload/DeviceRTL/include/Debug.h b/openmp/device/include/Debug.h
similarity index 100%
rename from offload/DeviceRTL/include/Debug.h
rename to openmp/device/include/Debug.h
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
similarity index 100%
rename from offload/DeviceRTL/include/DeviceTypes.h
rename to openmp/device/include/DeviceTypes.h
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/openmp/device/include/DeviceUtils.h
similarity index 100%
rename from offload/DeviceRTL/include/DeviceUtils.h
rename to openmp/device/include/DeviceUtils.h
diff --git a/offload/DeviceRTL/include/Interface.h b/openmp/device/include/Interface.h
similarity index 100%
rename from offload/DeviceRTL/include/Interface.h
rename to openmp/device/include/Interface.h
diff --git a/offload/DeviceRTL/include/LibC.h b/openmp/device/include/LibC.h
similarity index 100%
rename from offload/DeviceRTL/include/LibC.h
rename to openmp/device/include/LibC.h
diff --git a/offload/DeviceRTL/include/Mapping.h b/openmp/device/include/Mapping.h
similarity index 100%
rename from offload/DeviceRTL/include/Mapping.h
rename to openmp/device/include/Mapping.h
diff --git a/offload/DeviceRTL/include/Profiling.h b/openmp/device/include/Profiling.h
similarity index 100%
rename from offload/DeviceRTL/include/Profiling.h
rename to openmp/device/include/Profiling.h
diff --git a/offload/DeviceRTL/include/State.h b/openmp/device/include/State.h
similarity index 100%
rename from offload/DeviceRTL/include/State.h
rename to openmp/device/include/State.h
diff --git a/offload/DeviceRTL/include/Synchronization.h b/openmp/device/include/Synchronization.h
similarity index 100%
rename from offload/DeviceRTL/include/Synchronization.h
rename to openmp/device/include/Synchronization.h
diff --git a/offload/DeviceRTL/include/Workshare.h b/openmp/device/include/Workshare.h
similarity index 100%
rename from offload/DeviceRTL/include/Workshare.h
rename to openmp/device/include/Workshare.h
diff --git a/offload/DeviceRTL/include/generated_microtask_cases.gen b/openmp/device/include/generated_microtask_cases.gen
similarity index 100%
rename from offload/DeviceRTL/include/generated_microtask_cases.gen
rename to openmp/device/include/generated_microtask_cases.gen
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Allocator.cpp
rename to openmp/device/src/Allocator.cpp
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/openmp/device/src/Configuration.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Configuration.cpp
rename to openmp/device/src/Configuration.cpp
diff --git a/offload/DeviceRTL/src/Debug.cpp b/openmp/device/src/Debug.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Debug.cpp
rename to openmp/device/src/Debug.cpp
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/openmp/device/src/DeviceUtils.cpp
similarity index 100%
rename from offload/DeviceRTL/src/DeviceUtils.cpp
rename to openmp/device/src/DeviceUtils.cpp
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Kernel.cpp
rename to openmp/device/src/Kernel.cpp
diff --git a/offload/DeviceRTL/src/LibC.cpp b/openmp/device/src/LibC.cpp
similarity index 100%
rename from offload/DeviceRTL/src/LibC.cpp
rename to openmp/device/src/LibC.cpp
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/openmp/device/src/Mapping.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Mapping.cpp
rename to openmp/device/src/Mapping.cpp
diff --git a/offload/DeviceRTL/src/Misc.cpp b/openmp/device/src/Misc.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Misc.cpp
rename to openmp/device/src/Misc.cpp
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Parallelism.cpp
rename to openmp/device/src/Parallelism.cpp
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/openmp/device/src/Profiling.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Profiling.cpp
rename to openmp/device/src/Profiling.cpp
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Reduction.cpp
rename to openmp/device/src/Reduction.cpp
diff --git a/offload/DeviceRTL/src/State.cpp b/openmp/device/src/State.cpp
similarity index 100%
rename from offload/DeviceRTL/src/State.cpp
rename to openmp/device/src/State.cpp
diff --git a/offload/DeviceRTL/src/Stub.cpp b/openmp/device/src/Stub.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Stub.cpp
rename to openmp/device/src/Stub.cpp
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Synchronization.cpp
rename to openmp/device/src/Synchronization.cpp
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/openmp/device/src/Tasking.cpp
similarity index 100%
rename from offload/DeviceRTL/src/Tasking.cpp
rename to openmp/device/src/Tasking.cpp
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/openmp/device/src/Workshare.cpp
similarity index 98%
rename from offload/DeviceRTL/src/Workshare.cpp
rename to openmp/device/src/Workshare.cpp
index a8759307b42bd..1c2c5b27f273f 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/openmp/device/src/Workshare.cpp
@@ -907,16 +907,15 @@ template <typename Ty> class StaticLoopChunker {
 
 #define OMP_LOOP_ENTRY(BW, TY)                                                 \
   [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_for_static_loop##BW(                                   \
-          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,       \
-          TY num_threads, TY block_chunk, TY thread_chunk) {                   \
+  __kmpc_distribute_for_static_loop##BW(                                       \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY block_chunk, TY thread_chunk) {                       \
     ompx::StaticLoopChunker<TY>::DistributeFor(                                \
         loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk);      \
   }                                                                            \
   [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),  \
-                                        void *arg, TY num_iters,               \
-                                        TY block_chunk) {                      \
+  __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),      \
+                                    void *arg, TY num_iters, TY block_chunk) { \
     ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters,           \
                                             block_chunk);                      \
   }                                                                            \
diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst
index abeb37a86da41..2684d20c5b0b6 100644
--- a/openmp/docs/SupportAndFAQ.rst
+++ b/openmp/docs/SupportAndFAQ.rst
@@ -78,6 +78,13 @@ Clang will be built with all backends enabled. When building with
 ``LLVM_ENABLE_RUNTIMES="openmp"`` OpenMP should not be enabled in 
 ``LLVM_ENABLE_PROJECTS`` because it is enabled by default.
 
+Support for the device library comes from a separate build of the OpenMP library
+that targets the GPU architecture. Building it requires enabling the runtime
+targets, or setting the target manually when doing a standalone build. This is
+done by adding the GPU triple to ``LLVM_RUNTIME_TARGETS`` and then enabling the
+OpenMP runtime for it through ``RUNTIMES_<triple>_LLVM_ENABLE_RUNTIMES``. Refer
+to ``offload/cmake/caches/Offload.cmake`` for the specific invocation.
+
 For Nvidia offload, please see :ref:`build_nvidia_offload_capable_compiler`.
 For AMDGPU offload, please see :ref:`build_amdgpu_offload_capable_compiler`.