Skip to content

Adding nvtx memory regions to pool MR #1952

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions cpp/include/rmm/detail/nvtx/memory.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <rmm/detail/nvtx/ranges.hpp>

#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExtMem.h>

namespace rmm {


struct librmm_memory_domain {
static constexpr char const* name{"librmm_memory"}; ///< Name of the librmm domain
};

/**
* @brief Get the nvtx domain object
*
* @return nvtx3::domain const&
*/
inline nvtx3::domain const& nvtx_domain() { return nvtx3::domain::get<librmm_memory_domain>(); }

/**
* @brief Create a new nvtx heap for the allocated memory
*
* @param ptr Pointer to the allocated memory
* @param size Size of the allocated memory
* @return nvtxMemHeapHandle_t Handle to the nvtx heap
*/
inline nvtxMemHeapHandle_t create_nvtx_heap(void* ptr, std::size_t size)
{
nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {};
nvtxRangeDesc.size = size;
nvtxRangeDesc.ptr = ptr;

nvtxMemHeapDesc_t nvtxHeapDesc = {};
nvtxHeapDesc.extCompatID = NVTX_EXT_COMPATID_MEM;
nvtxHeapDesc.structSize = sizeof(nvtxMemHeapDesc_t);
nvtxHeapDesc.usage = NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR;
nvtxHeapDesc.type = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
nvtxHeapDesc.typeSpecificDescSize = sizeof(nvtxMemVirtualRangeDesc_t);
nvtxHeapDesc.typeSpecificDesc = &nvtxRangeDesc;

return nvtxMemHeapRegister(nvtx_domain(), &nvtxHeapDesc);
}

/**
* @brief Destroy the nvtx heap
*
* @param handle Handle to the nvtx heap
*/
inline void destroy_nvtx_heap(nvtxMemHeapHandle_t handle) { nvtxMemHeapUnregister(nvtx_domain(), handle); }

/**
* @brief Register the memory region with the nvtx heap
*
* @param handle Handle to the nvtx heap
* @param ptr Pointer to the memory region
* @param size Size of the memory region
*/
inline void register_mem_region(nvtxMemHeapHandle_t handle, void const* ptr, std::size_t size)
{
nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {};
nvtxRangeDesc.size = size;
nvtxRangeDesc.ptr = ptr;

nvtxMemRegionsRegisterBatch_t nvtxRegionsDesc = {};
nvtxRegionsDesc.extCompatID = NVTX_EXT_COMPATID_MEM;
nvtxRegionsDesc.structSize = sizeof(nvtxMemRegionsRegisterBatch_t);
nvtxRegionsDesc.regionType = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
nvtxRegionsDesc.heap = handle;
nvtxRegionsDesc.regionCount = 1;
nvtxRegionsDesc.regionDescElementSize = sizeof(nvtxMemVirtualRangeDesc_t);
nvtxRegionsDesc.regionDescElements = &nvtxRangeDesc;

nvtxMemRegionsRegister(nvtx_domain(), &nvtxRegionsDesc);
}

/**
* @brief Unregister the memory region from the nvtx heap
*
* @param ptr Pointer to the memory region
*/
inline void unregister_mem_region(void const* ptr)
{
nvtxMemRegionRef_t nvtxRegionRef;
nvtxRegionRef.pointer = ptr;

nvtxMemRegionsUnregisterBatch_t nvtxRegionsDesc = {};
nvtxRegionsDesc.extCompatID = NVTX_EXT_COMPATID_MEM;
nvtxRegionsDesc.structSize = sizeof(nvtxMemRegionsUnregisterBatch_t);
nvtxRegionsDesc.refType = NVTX_MEM_REGION_REF_TYPE_POINTER;
nvtxRegionsDesc.refCount = 1;
nvtxRegionsDesc.refElementSize = sizeof(nvtxMemRegionRef_t);
nvtxRegionsDesc.refElements = &nvtxRegionRef;

nvtxMemRegionsUnregister(nvtx_domain(), &nvtxRegionsDesc);
}

} // namespace rmm
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
std::string("Maximum allocation size exceeded (failed to allocate ") +
rmm::detail::format_bytes(size) + ")",
rmm::out_of_memory);
auto const block = this->underlying().get_block(size, stream_event);
auto const block = get_block(size, stream_event);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❓ question: ‏ Why drop the CRTP indirection here? This doesn't seem related to this PR.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@harrism that's right. But when I was reading the code, what I gathered was, get_block is not implemented by the derived class. It's not mentioned here as well. https://github.com/nirandaperera/rmm/blob/adding_nvtx_pool/cpp/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp#L70-L76
So, IINM, we can simply call the method, without the indirection.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I see. Good catch.


RMM_LOG_TRACE("[A][stream %s][%zuB][%p]",
rmm::detail::format_stream(stream_event.stream),
Expand Down
45 changes: 44 additions & 1 deletion cpp/include/rmm/mr/device/pool_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/detail/nvtx/memory.hpp>
#include <rmm/detail/thrust_namespace.h>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/coalescing_free_list.hpp>
Expand Down Expand Up @@ -363,6 +364,12 @@ class pool_memory_resource final
if (size == 0) { return {}; }

void* ptr = get_upstream_resource().allocate_async(size, stream);

#ifdef RMM_NVTX
// Create a new nvtx heap for the allocated memory
nvtx_heaps_[ptr] = create_nvtx_heap(ptr, size);
#endif

return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
}

Expand All @@ -383,6 +390,24 @@ class pool_memory_resource final
allocated_blocks_.insert(alloc);
#endif

#ifdef RMM_NVTX
void* heap_key;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this adds some overhead on every suballocation. And the insertion into the nvtx_heaps map is a small overhead on upstream allocations.

Can you please benchmark this cost with the random allocations benchmark with NVTX on and off and report it in the PR? Is NVTX enabled by default? Depending on these costs, we may want it off by default.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@harrism Yes, there is an overhead here. In particular

  1. Inserting and querying from the nvtx_heaps_ unordered map.
  2. calling lower_bound on unstream_blocks_ set (which is logarithmic)

I think we can alleviate 2, if we add a void* upstream_ member to the block class, rather than the bool head. Then IINM, is_head() will be upstream_ == ptr_. But then, we are adding additional 3-bytes to the block class.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think its a worthwhile change?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to see benchmarks, if you don't mind. :)

if (alloc.is_head()) {
// if alloc is head, then it's beginning of the heap (and its already in the upstream_blocks_)
heap_key = block.pointer();
} else {
// if alloc is not head, then it's in the middle of the heap (and its not in the
// upstream_blocks_). Find the upstream block that alloc belongs to
auto const it = upstream_blocks_.lower_bound(block.pointer());
if (it == upstream_blocks_.begin()) {
RMM_FAIL("Could not find a heap for block", rmm::logic_error);
}
heap_key = std::prev(it)->pointer();
}
// register alloc with the heap
register_mem_region(nvtx_heaps_.at(heap_key), alloc.pointer(), alloc.size());
#endif

auto rest = (block.size() > size)
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
? block_type{block.pointer() + size, block.size() - size, false}
Expand Down Expand Up @@ -411,7 +436,14 @@ class pool_memory_resource final

return block;
#else
// unregister ptr from the domain.

auto const iter = upstream_blocks_.find(static_cast<char*>(ptr));

#ifdef RMM_NVTX
unregister_mem_region(ptr);
#endif

return block_type{static_cast<char*>(ptr), size, (iter != upstream_blocks_.end())};
#endif
}
Expand All @@ -432,6 +464,13 @@ class pool_memory_resource final
allocated_blocks_.clear();
#endif

#ifdef RMM_NVTX
for (auto const& [ptr, heap] : nvtx_heaps_) {
destroy_nvtx_heap(heap);
}
nvtx_heaps_.clear();
#endif

current_pool_size_ = 0;
}

Expand Down Expand Up @@ -497,9 +536,13 @@ class pool_memory_resource final
std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> allocated_blocks_;
#endif

#ifdef RMM_NVTX
std::unordered_map<void*, nvtxMemHeapHandle_t> nvtx_heaps_;
#endif

// blocks allocated from upstream
std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> upstream_blocks_;
}; // namespace mr
};

/** @} */ // end of group
} // namespace mr
Expand Down
23 changes: 23 additions & 0 deletions cpp/tests/mr/device/pool_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@

#include <gtest/gtest.h>

#include <chrono>
#include <cstddef>
#include <memory>
#include <thread>
#include <vector>

// explicit instantiation for test coverage purposes
Expand Down Expand Up @@ -124,6 +126,27 @@ TEST(PoolTest, InitialAndMaxPoolSizeEqual)
}());
}

TEST(PoolTest, InitialAndMaxPoolSizeEqual2)
{
auto make_alloc_mb = [](pool_mr& mr, std::size_t size_mb) {
auto ptr = mr.allocate(size_mb << 20);
mr.deallocate(ptr, size_mb << 20);
std::this_thread::sleep_for(std::chrono::milliseconds(1));
};

EXPECT_NO_THROW([&]() {
pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1 << 20, 100 << 20);
make_alloc_mb(mr, 50);
make_alloc_mb(mr, 1);
make_alloc_mb(mr, 50);
make_alloc_mb(mr, 1);
make_alloc_mb(mr, 50);
make_alloc_mb(mr, 1);
make_alloc_mb(mr, 50);
make_alloc_mb(mr, 1);
}());
}

TEST(PoolTest, NonAlignedPoolSize)
{
EXPECT_THROW(
Expand Down
Loading