-
Notifications
You must be signed in to change notification settings - Fork 223
Adding nvtx memory regions to pool MR #1952
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
4f63732
bb11510
fa8227f
6ffbc2d
a6317d4
f1f453c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <rmm/detail/nvtx/ranges.hpp> | ||
|
||
#include <nvtx3/nvToolsExt.h> | ||
#include <nvtx3/nvToolsExtMem.h> | ||
|
||
namespace rmm { | ||
|
||
|
||
/**
 * @brief Tag type naming the NVTX domain used for librmm memory annotations.
 *
 * Intended to be supplied as the template argument of `nvtx3::domain::get`.
 */
struct librmm_memory_domain {
  /// Name of the librmm domain
  static constexpr char const* name = "librmm_memory";
};
|
||
/** | ||
* @brief Get the nvtx domain object | ||
* | ||
* @return nvtx3::domain const& | ||
*/ | ||
inline nvtx3::domain const& nvtx_domain() { return nvtx3::domain::get<librmm_memory_domain>(); } | ||
|
||
/** | ||
* @brief Create a new nvtx heap for the allocated memory | ||
* | ||
* @param ptr Pointer to the allocated memory | ||
* @param size Size of the allocated memory | ||
* @return nvtxMemHeapHandle_t Handle to the nvtx heap | ||
*/ | ||
inline nvtxMemHeapHandle_t create_nvtx_heap(void* ptr, std::size_t size) | ||
{ | ||
nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {}; | ||
nvtxRangeDesc.size = size; | ||
nvtxRangeDesc.ptr = ptr; | ||
|
||
nvtxMemHeapDesc_t nvtxHeapDesc = {}; | ||
nvtxHeapDesc.extCompatID = NVTX_EXT_COMPATID_MEM; | ||
nvtxHeapDesc.structSize = sizeof(nvtxMemHeapDesc_t); | ||
nvtxHeapDesc.usage = NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR; | ||
nvtxHeapDesc.type = NVTX_MEM_TYPE_VIRTUAL_ADDRESS; | ||
nvtxHeapDesc.typeSpecificDescSize = sizeof(nvtxMemVirtualRangeDesc_t); | ||
nvtxHeapDesc.typeSpecificDesc = &nvtxRangeDesc; | ||
|
||
return nvtxMemHeapRegister(nvtx_domain(), &nvtxHeapDesc); | ||
} | ||
|
||
/** | ||
* @brief Destroy the nvtx heap | ||
* | ||
* @param handle Handle to the nvtx heap | ||
*/ | ||
inline void destroy_nvtx_heap(nvtxMemHeapHandle_t handle) { nvtxMemHeapUnregister(nvtx_domain(), handle); } | ||
|
||
/** | ||
* @brief Register the memory region with the nvtx heap | ||
* | ||
* @param handle Handle to the nvtx heap | ||
* @param ptr Pointer to the memory region | ||
* @param size Size of the memory region | ||
*/ | ||
inline void register_mem_region(nvtxMemHeapHandle_t handle, void const* ptr, std::size_t size) | ||
{ | ||
nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {}; | ||
nvtxRangeDesc.size = size; | ||
nvtxRangeDesc.ptr = ptr; | ||
|
||
nvtxMemRegionsRegisterBatch_t nvtxRegionsDesc = {}; | ||
nvtxRegionsDesc.extCompatID = NVTX_EXT_COMPATID_MEM; | ||
nvtxRegionsDesc.structSize = sizeof(nvtxMemRegionsRegisterBatch_t); | ||
nvtxRegionsDesc.regionType = NVTX_MEM_TYPE_VIRTUAL_ADDRESS; | ||
nvtxRegionsDesc.heap = handle; | ||
nvtxRegionsDesc.regionCount = 1; | ||
nvtxRegionsDesc.regionDescElementSize = sizeof(nvtxMemVirtualRangeDesc_t); | ||
nvtxRegionsDesc.regionDescElements = &nvtxRangeDesc; | ||
|
||
nvtxMemRegionsRegister(nvtx_domain(), &nvtxRegionsDesc); | ||
} | ||
|
||
/** | ||
* @brief Unregister the memory region from the nvtx heap | ||
* | ||
* @param ptr Pointer to the memory region | ||
*/ | ||
inline void unregister_mem_region(void const* ptr) | ||
{ | ||
nvtxMemRegionRef_t nvtxRegionRef; | ||
nvtxRegionRef.pointer = ptr; | ||
|
||
nvtxMemRegionsUnregisterBatch_t nvtxRegionsDesc = {}; | ||
nvtxRegionsDesc.extCompatID = NVTX_EXT_COMPATID_MEM; | ||
nvtxRegionsDesc.structSize = sizeof(nvtxMemRegionsUnregisterBatch_t); | ||
nvtxRegionsDesc.refType = NVTX_MEM_REGION_REF_TYPE_POINTER; | ||
nvtxRegionsDesc.refCount = 1; | ||
nvtxRegionsDesc.refElementSize = sizeof(nvtxMemRegionRef_t); | ||
nvtxRegionsDesc.refElements = &nvtxRegionRef; | ||
|
||
nvtxMemRegionsUnregister(nvtx_domain(), &nvtxRegionsDesc); | ||
} | ||
|
||
} // namespace rmm |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
#include <rmm/detail/export.hpp> | ||
#include <rmm/detail/format.hpp> | ||
#include <rmm/detail/logging_assert.hpp> | ||
#include <rmm/detail/nvtx/memory.hpp> | ||
#include <rmm/detail/thrust_namespace.h> | ||
#include <rmm/logger.hpp> | ||
#include <rmm/mr/device/detail/coalescing_free_list.hpp> | ||
|
@@ -363,6 +364,12 @@ class pool_memory_resource final | |
if (size == 0) { return {}; } | ||
|
||
void* ptr = get_upstream_resource().allocate_async(size, stream); | ||
|
||
#ifdef RMM_NVTX | ||
// Create a new nvtx heap for the allocated memory | ||
nvtx_heaps_[ptr] = create_nvtx_heap(ptr, size); | ||
#endif | ||
|
||
return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first; | ||
} | ||
|
||
|
@@ -383,6 +390,24 @@ class pool_memory_resource final | |
allocated_blocks_.insert(alloc); | ||
#endif | ||
|
||
#ifdef RMM_NVTX | ||
void* heap_key; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So this adds some overhead on every suballocation. And the insertion into the nvtx_heaps map is a small overhead on upstream allocations. Can you please benchmark this cost with the random allocations benchmark with NVTX on and off and report it in the PR? Is NVTX enabled by default? Depending on these costs, we may want it off by default. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @harrism Yes, there is an overhead here. In particular:
I think we can alleviate 2, if we add a There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think it's a worthwhile change? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would like to see benchmarks, if you don't mind. :) |
||
if (alloc.is_head()) { | ||
// if alloc is head, then it's the beginning of the heap (and it's already in upstream_blocks_) | ||
heap_key = block.pointer(); | ||
} else { | ||
// if alloc is not head, then it's in the middle of the heap (and it's not in | ||
// upstream_blocks_). Find the upstream block that alloc belongs to | ||
auto const it = upstream_blocks_.lower_bound(block.pointer()); | ||
if (it == upstream_blocks_.begin()) { | ||
RMM_FAIL("Could not find a heap for block", rmm::logic_error); | ||
} | ||
heap_key = std::prev(it)->pointer(); | ||
} | ||
// register alloc with the heap | ||
register_mem_region(nvtx_heaps_.at(heap_key), alloc.pointer(), alloc.size()); | ||
#endif | ||
|
||
auto rest = (block.size() > size) | ||
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) | ||
? block_type{block.pointer() + size, block.size() - size, false} | ||
|
@@ -411,7 +436,14 @@ class pool_memory_resource final | |
|
||
return block; | ||
#else | ||
// unregister ptr from the domain. | ||
|
||
auto const iter = upstream_blocks_.find(static_cast<char*>(ptr)); | ||
|
||
#ifdef RMM_NVTX | ||
unregister_mem_region(ptr); | ||
#endif | ||
|
||
return block_type{static_cast<char*>(ptr), size, (iter != upstream_blocks_.end())}; | ||
#endif | ||
} | ||
|
@@ -432,6 +464,13 @@ class pool_memory_resource final | |
allocated_blocks_.clear(); | ||
#endif | ||
|
||
#ifdef RMM_NVTX | ||
for (auto const& [ptr, heap] : nvtx_heaps_) { | ||
destroy_nvtx_heap(heap); | ||
} | ||
nvtx_heaps_.clear(); | ||
#endif | ||
|
||
current_pool_size_ = 0; | ||
} | ||
|
||
|
@@ -497,9 +536,13 @@ class pool_memory_resource final | |
std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> allocated_blocks_; | ||
#endif | ||
|
||
#ifdef RMM_NVTX | ||
std::unordered_map<void*, nvtxMemHeapHandle_t> nvtx_heaps_; | ||
#endif | ||
|
||
// blocks allocated from upstream | ||
std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> upstream_blocks_; | ||
}; // namespace mr | ||
}; | ||
|
||
/** @} */ // end of group | ||
} // namespace mr | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
❓ question: Why drop the CRTP indirection here? This doesn't seem related to this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@harrism that's right. But when I was reading the code, what I gathered was that `get_block` is not implemented by the derived class. It's not mentioned here either: https://github.com/nirandaperera/rmm/blob/adding_nvtx_pool/cpp/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp#L70-L76
So, IINM, we can simply call the method, without the indirection.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I see. Good catch.