-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[Offload] Add an unloadBinary
interface to PluginInterface
#143873
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-offload @llvm/pr-subscribers-backend-amdgpu Author: Ross Brunton (RossBrunton) ChangesThis allows removal of a specific Image from a Device, rather than This is required for Full diff: https://github.com/llvm/llvm-project/pull/143873.diff 6 Files Affected:
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
index 8c88fe6e21e6a..c5ad433988adc 100644
--- a/offload/liboffload/API/Program.td
+++ b/offload/liboffload/API/Program.td
@@ -13,7 +13,9 @@
def : Function {
let name = "olCreateProgram";
let desc = "Create a program for the device from the binary image pointed to by `ProgData`.";
- let details = [];
+ let details = [
+ "`ProgData` must remain valid for the entire lifetime of the provided program handle",
+ ];
let params = [
Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
Param<"const void*", "ProgData", "pointer to the program binary data", PARAM_IN>,
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index d2b331905ab77..e7804891c494d 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -465,6 +465,10 @@ Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
}
Error olDestroyProgram_impl(ol_program_handle_t Program) {
+ if (auto Err = Program->Image->getDevice().unloadBinary(Program->Image)) {
+ return Err;
+ }
+
return olDestroy(Program);
}
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c15..4ec3d5113a1fb 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2023,6 +2023,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ Error unloadBinaryImpl(DeviceImageTy *Image) override {
+ AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(*Image);
+
+ // Unload the executable of the image.
+ if (auto Err = AMDImage.unloadExecutable())
+ return Err;
+
+ return Plugin::success();
+ }
+
/// Deinitialize the device and release its resources.
Error deinitImpl() override {
// Deinitialize the stream and event pools.
@@ -2035,19 +2045,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = AMDGPUSignalManager.deinit())
return Err;
- // Close modules if necessary.
- if (!LoadedImages.empty()) {
- // Each image has its own module.
- for (DeviceImageTy *Image : LoadedImages) {
- AMDGPUDeviceImageTy &AMDImage =
- static_cast<AMDGPUDeviceImageTy &>(*Image);
-
- // Unload the executable of the image.
- if (auto Err = AMDImage.unloadExecutable())
- return Err;
- }
- }
-
// Invalidate agent reference.
Agent = {0};
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6f..7af61074bb322 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -712,6 +712,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Expected<DeviceImageTy *>
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
+ /// Unload a previously loaded Image from the device
+ Error unloadBinary(DeviceImageTy *Image);
+ virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
+
/// Setup the device environment if needed. Notice this setup may not be run
/// on some plugins. By default, it will be executed, but plugins can change
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9a6b3c1f4324..bb503c1ff7a54 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -821,26 +821,52 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
return Plugin::success();
}
-Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
- for (DeviceImageTy *Image : LoadedImages)
- if (auto Err = callGlobalDestructors(Plugin, *Image))
- return Err;
+Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
+ if (auto Err = callGlobalDestructors(Plugin, *Image))
+ return Err;
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
- for (auto *Image : LoadedImages) {
- DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
- GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
- sizeof(DeviceMemoryPoolTrackingTy),
- &ImageDeviceMemoryPoolTracking);
- if (auto Err =
- GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
- consumeError(std::move(Err));
- continue;
- }
- DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
+ DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+ GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
+ sizeof(DeviceMemoryPoolTrackingTy),
+ &ImageDeviceMemoryPoolTracking);
+ if (auto Err =
+ GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
+ consumeError(std::move(Err));
}
+ DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
+ }
+
+ GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+ auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
+ if (!ProfOrErr)
+ return ProfOrErr.takeError();
+
+ if (!ProfOrErr->empty()) {
+ // Dump out profdata
+ if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
+ uint32_t(DeviceDebugKind::PGODump))
+ ProfOrErr->dump();
+
+ // Write data to profiling file
+ if (auto Err = ProfOrErr->write())
+ return Err;
+ }
+
+ LoadedImages.erase(
+ std::find(LoadedImages.begin(), LoadedImages.end(), Image));
+ return unloadBinaryImpl(Image);
+}
+
+Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
+ while (!LoadedImages.empty()) {
+ if (auto Err = unloadBinary(LoadedImages.back()))
+ return Err;
+ }
+
+ if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
// TODO: Write this by default into a file.
printf("\n\n|-----------------------\n"
"| Device memory tracker:\n"
@@ -856,25 +882,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
DeviceMemoryPoolTracking.AllocationMax);
}
- for (auto *Image : LoadedImages) {
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
- auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
- if (!ProfOrErr)
- return ProfOrErr.takeError();
-
- if (ProfOrErr->empty())
- continue;
-
- // Dump out profdata
- if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
- uint32_t(DeviceDebugKind::PGODump))
- ProfOrErr->dump();
-
- // Write data to profiling file
- if (auto Err = ProfOrErr->write())
- return Err;
- }
-
// Delete the memory manager before deinitializing the device. Otherwise,
// we may delete device allocations after the device is deinitialized.
if (MemoryManager)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..db3c990afc2d6 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -358,6 +358,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+ Error unloadBinaryImpl(DeviceImageTy *Image) override {
+ assert(Context && "Invalid CUDA context");
+
+ // Each image has its own module.
+ for (DeviceImageTy *Image : LoadedImages) {
+ CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
+
+ // Unload the module of the image.
+ if (auto Err = CUDAImage.unloadModule())
+ return Err;
+ }
+
+ return Plugin::success();
+ }
+
/// Deinitialize the device and release its resources.
Error deinitImpl() override {
if (Context) {
@@ -372,20 +387,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (auto Err = CUDAEventManager.deinit())
return Err;
- // Close modules if necessary.
- if (!LoadedImages.empty()) {
- assert(Context && "Invalid CUDA context");
-
- // Each image has its own module.
- for (DeviceImageTy *Image : LoadedImages) {
- CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
-
- // Unload the module of the image.
- if (auto Err = CUDAImage.unloadModule())
- return Err;
- }
- }
-
if (Context) {
CUresult Res = cuDevicePrimaryCtxRelease(Device);
if (auto Err =
|
b2f9c1e
to
4ea1fc8
Compare
This MR changes the order of operations when deiniting devices as follows: Before:
After:
I think this is sound, since my intuition is that destructors in one image can't call code in another image... But I'm not 100% on that. |
offload/liboffload/API/Program.td
Outdated
@@ -13,7 +13,9 @@ | |||
def : Function { | |||
let name = "olCreateProgram"; | |||
let desc = "Create a program for the device from the binary image pointed to by `ProgData`."; | |||
let details = []; | |||
let details = [ | |||
"`ProgData` must remain valid for the entire lifetime of the output `Program` handle", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is probably something we should change. We should keep an internal copy of whatever the user provides because it just gets messy if we allow the user to segfault things in our API. I've been planning on doing that since forever so I should probably just get around to it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't that result in an uneeded copy and duplicate versions of the image data in memory? Although we could just have olCreateProgram take ownership of the passed pointer to sidestep the problem.
It also may result in a memory leak if the user wants to have a long running process create and delete many programs over its lifetime.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the idea is that the returned program handle owns the associated memory. If the user doesn't want it anymore they unload it. That's how the other APIs do it because it's really bad to have your program crash because a user deallocated some random pointer ten days later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, okay. I read the original comment as the PluginInterface making a copy which lives for the lifetime of the device. Having the program handle owning a copy (which is discarded when the handle is) makes more sense.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It also looks like program handles already make a copy (which lives as long as the program handle itself), which I assumed they didn't.
LoadedImages.erase( | ||
std::find(LoadedImages.begin(), LoadedImages.end(), Image)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should not be modifying a data structure inside of a function meant to be called in a loop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not? LoadedImage
is only used internally by GenericDeviceTy
(and its subclasses) and those should be aware that calling this method invalidates the iterator.
This allows removal of a specific Image from a Device, rather than requiring all image data to outlive the device they were created for. This is required for `ol_program_handle_t`s, which now specify the lifetime of the buffer used to create the program.
This allows removal of a specific Image from a Device, rather than
requiring all image data to outlive the device they were created for.
This is required for
ol_program_handle_t
s, which now specify thelifetime of the buffer used to create the program.