rapidsai · nirandaperera · Jun 10, 2025 · Jun 10, 2025 · Jun 11, 2025 · Jun 11, 2025
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/detail/nvtx/ranges.hpp>
+
+#include <nvtx3/nvToolsExt.h>
+#include <nvtx3/nvToolsExtMem.h>
+
+namespace rmm {
+
+
+struct librmm_memory_domain {
+  static constexpr char const* name{"librmm_memory"};  ///< Name of the librmm domain
+};
+
+/**
+ * @brief Get the nvtx domain object
+ * 
+ * @return nvtx3::domain const& 
+ */
+inline nvtx3::domain const& nvtx_domain() { return nvtx3::domain::get<librmm_memory_domain>(); }
+
+/**
+ * @brief Create a new nvtx heap for the allocated memory
+ * 
+ * @param ptr Pointer to the allocated memory
+ * @param size Size of the allocated memory
+ * @return nvtxMemHeapHandle_t Handle to the nvtx heap
+ */
+inline nvtxMemHeapHandle_t create_nvtx_heap(void* ptr, std::size_t size)
+{
+  nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {};
+  nvtxRangeDesc.size                      = size;
+  nvtxRangeDesc.ptr                       = ptr;
+
+  nvtxMemHeapDesc_t nvtxHeapDesc    = {};
+  nvtxHeapDesc.extCompatID          = NVTX_EXT_COMPATID_MEM;
+  nvtxHeapDesc.structSize           = sizeof(nvtxMemHeapDesc_t);
+  nvtxHeapDesc.usage                = NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR;
+  nvtxHeapDesc.type                 = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
+  nvtxHeapDesc.typeSpecificDescSize = sizeof(nvtxMemVirtualRangeDesc_t);
+  nvtxHeapDesc.typeSpecificDesc     = &nvtxRangeDesc;
+
+  return nvtxMemHeapRegister(nvtx_domain(), &nvtxHeapDesc);
+}
+
+/**
+ * @brief Destroy the nvtx heap
+ * 
+ * @param handle Handle to the nvtx heap
+ */
+inline void destroy_nvtx_heap(nvtxMemHeapHandle_t handle) { nvtxMemHeapUnregister(nvtx_domain(), handle); }
+
+/**
+ * @brief Register the memory region with the nvtx heap
+ * 
+ * @param handle Handle to the nvtx heap
+ * @param ptr Pointer to the memory region
+ * @param size Size of the memory region
+ */
+inline void register_mem_region(nvtxMemHeapHandle_t handle, void const* ptr, std::size_t size)
+{
+  nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {};
+  nvtxRangeDesc.size                      = size;
+  nvtxRangeDesc.ptr                       = ptr;
+
+  nvtxMemRegionsRegisterBatch_t nvtxRegionsDesc = {};
+  nvtxRegionsDesc.extCompatID                   = NVTX_EXT_COMPATID_MEM;
+  nvtxRegionsDesc.structSize                    = sizeof(nvtxMemRegionsRegisterBatch_t);
+  nvtxRegionsDesc.regionType                    = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
+  nvtxRegionsDesc.heap                          = handle;
+  nvtxRegionsDesc.regionCount                   = 1;
+  nvtxRegionsDesc.regionDescElementSize         = sizeof(nvtxMemVirtualRangeDesc_t);
+  nvtxRegionsDesc.regionDescElements            = &nvtxRangeDesc;
+
+  nvtxMemRegionsRegister(nvtx_domain(), &nvtxRegionsDesc);
+}
+
+/**
+ * @brief Unregister the memory region from the nvtx heap
+ * 
+ * @param ptr Pointer to the memory region
+ */
+inline void unregister_mem_region(void const* ptr)
+{
+  nvtxMemRegionRef_t nvtxRegionRef;
+  nvtxRegionRef.pointer = ptr;
+
+  nvtxMemRegionsUnregisterBatch_t nvtxRegionsDesc = {};
+  nvtxRegionsDesc.extCompatID                     = NVTX_EXT_COMPATID_MEM;
+  nvtxRegionsDesc.structSize                      = sizeof(nvtxMemRegionsUnregisterBatch_t);
+  nvtxRegionsDesc.refType                         = NVTX_MEM_REGION_REF_TYPE_POINTER;
+  nvtxRegionsDesc.refCount                        = 1;
+  nvtxRegionsDesc.refElementSize                  = sizeof(nvtxMemRegionRef_t);
+  nvtxRegionsDesc.refElements                     = &nvtxRegionRef;
+
+  nvtxMemRegionsUnregister(nvtx_domain(), &nvtxRegionsDesc);
+}
+
+}  // namespace rmm
@@ -217,7 +217,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
                 std::string("Maximum allocation size exceeded (failed to allocate ") +
                   rmm::detail::format_bytes(size) + ")",
                 rmm::out_of_memory);
-    auto const block = this->underlying().get_block(size, stream_event);
+    auto const block = get_block(size, stream_event);
 
     RMM_LOG_TRACE("[A][stream %s][%zuB][%p]",
                   rmm::detail::format_stream(stream_event.stream),

@@ -21,6 +21,7 @@
 #include <rmm/detail/export.hpp>
 #include <rmm/detail/format.hpp>
 #include <rmm/detail/logging_assert.hpp>
+#include <rmm/detail/nvtx/memory.hpp>
 #include <rmm/detail/thrust_namespace.h>
 #include <rmm/logger.hpp>
 #include <rmm/mr/device/detail/coalescing_free_list.hpp>
@@ -363,6 +364,12 @@ class pool_memory_resource final
     if (size == 0) { return {}; }
 
     void* ptr = get_upstream_resource().allocate_async(size, stream);
+
+#ifdef RMM_NVTX
+    // Create a new nvtx heap for the allocated memory
+    nvtx_heaps_[ptr] = create_nvtx_heap(ptr, size);
+#endif
+
     return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
   }
 
@@ -383,6 +390,24 @@ class pool_memory_resource final
     allocated_blocks_.insert(alloc);
 #endif
 
+#ifdef RMM_NVTX
+    void* heap_key;
+    if (alloc.is_head()) {
+      // if alloc is head, then it's beginning of the heap (and its already in the upstream_blocks_)
+      heap_key = block.pointer();
+    } else {
+      // if alloc is not head, then it's in the middle of the heap (and its not in the
+      // upstream_blocks_). Find the upstream block that alloc belongs to
+      auto const it = upstream_blocks_.lower_bound(block.pointer());
+      if (it == upstream_blocks_.begin()) {
+        RMM_FAIL("Could not find a heap for block", rmm::logic_error);
+      }
+      heap_key = std::prev(it)->pointer();
+    }
+    // register alloc with the heap
+    register_mem_region(nvtx_heaps_.at(heap_key), alloc.pointer(), alloc.size());
+#endif
+
     auto rest = (block.size() > size)
                   // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                   ? block_type{block.pointer() + size, block.size() - size, false}
@@ -411,7 +436,14 @@ class pool_memory_resource final
 
     return block;
 #else
+    // unregister ptr from the domain.
+
     auto const iter = upstream_blocks_.find(static_cast<char*>(ptr));
+
+#ifdef RMM_NVTX
+    unregister_mem_region(ptr);
+#endif
+
     return block_type{static_cast<char*>(ptr), size, (iter != upstream_blocks_.end())};
 #endif
   }
@@ -432,6 +464,13 @@ class pool_memory_resource final
     allocated_blocks_.clear();
 #endif
 
+#ifdef RMM_NVTX
+    for (auto const& [ptr, heap] : nvtx_heaps_) {
+      destroy_nvtx_heap(heap);
+    }
+    nvtx_heaps_.clear();
+#endif
+
     current_pool_size_ = 0;
   }
 
@@ -497,9 +536,13 @@ class pool_memory_resource final
   std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> allocated_blocks_;
 #endif
 
+#ifdef RMM_NVTX
+  std::unordered_map<void*, nvtxMemHeapHandle_t> nvtx_heaps_;
+#endif
+
   // blocks allocated from upstream
   std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> upstream_blocks_;
-};  // namespace mr
+};
 
 /** @} */  // end of group
 }  // namespace mr

@@ -25,8 +25,10 @@
 
 #include <gtest/gtest.h>
 
+#include <chrono>
 #include <cstddef>
 #include <memory>
+#include <thread>
 #include <vector>
 
 // explicit instantiation for test coverage purposes
@@ -124,6 +126,27 @@ TEST(PoolTest, InitialAndMaxPoolSizeEqual)
   }());
 }
 
+TEST(PoolTest, InitialAndMaxPoolSizeEqual2)
+{
+  auto make_alloc_mb = [](pool_mr& mr, std::size_t size_mb) {
+    auto ptr = mr.allocate(size_mb << 20);
+    mr.deallocate(ptr, size_mb << 20);
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  };
+
+  EXPECT_NO_THROW([&]() {
+    pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1 << 20, 100 << 20);
+    make_alloc_mb(mr, 50);
+    make_alloc_mb(mr, 1);
+    make_alloc_mb(mr, 50);
+    make_alloc_mb(mr, 1);
+    make_alloc_mb(mr, 50);
+    make_alloc_mb(mr, 1);
+    make_alloc_mb(mr, 50);
+    make_alloc_mb(mr, 1);
+  }());
+}
+
 TEST(PoolTest, NonAlignedPoolSize)
 {
   EXPECT_THROW(