Cuda graph implementation on trt backend #1071

Open
wants to merge 8 commits into base: master
8 changes: 7 additions & 1 deletion cpp/CMakeLists.txt
@@ -40,7 +40,9 @@ set(USE_AVX2 0 CACHE BOOL "Compile with AVX2")
set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 50. Compiling with this will use more memory and slow down KataGo, even when playing on boards of size 19.")

set(USE_CACHE_TENSORRT_PLAN 0 CACHE BOOL "Use TENSORRT plan cache. May use a lot of disk space. Only applies when USE_BACKEND is TENSORRT.")
set(USE_TENSORRT_CUDA_GRAPH 0 CACHE BOOL "Use cudaGraph when using TENSORRT backend.")
mark_as_advanced(USE_CACHE_TENSORRT_PLAN)
mark_as_advanced(USE_TENSORRT_CUDA_GRAPH)

#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------

@@ -89,10 +91,14 @@ elseif(USE_BACKEND STREQUAL "TENSORRT")
)
if(USE_CACHE_TENSORRT_PLAN AND (NOT BUILD_DISTRIBUTED))
message(STATUS "-DUSE_CACHE_TENSORRT_PLAN is set, using TENSORRT plan cache.")
add_compile_definitions(CACHE_TENSORRT_PLAN)
add_compile_definitions(CACHE_TENSORRT_PLAN)
elseif(USE_CACHE_TENSORRT_PLAN AND BUILD_DISTRIBUTED)
message(FATAL_ERROR "Combining USE_CACHE_TENSORRT_PLAN with BUILD_DISTRIBUTED is not supported - it would consume excessive disk space and might worsen performance every time models are updated. Use only one at a time in a given build of KataGo.")
endif()
if(USE_TENSORRT_CUDA_GRAPH)
message(STATUS "-DUSE_TENSORRT_CUDA_GRAPH is set, using cuda graph at inference")
add_compile_definitions(TENSORRT_CUDA_GRAPH)
endif()
elseif(USE_BACKEND STREQUAL "METAL")
message(STATUS "-DUSE_BACKEND=METAL, using Metal backend.")
if(NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")
245 changes: 145 additions & 100 deletions cpp/neuralnet/trtbackend.cpp
@@ -1089,7 +1089,11 @@ struct ComputeHandle {
unique_ptr<IRuntime> runtime;
unique_ptr<ICudaEngine> engine;
unique_ptr<IExecutionContext> exec;

#ifdef TENSORRT_CUDA_GRAPH
vector<cudaGraph_t> cudaGraphs;
vector<cudaGraphExec_t> cudaGraphExecs;
inline static unordered_map<int, mutex> mutexPerGpu;
#endif
ComputeHandle(
Logger* logger,
const cudaDeviceProp* prop,
@@ -1098,7 +1102,7 @@
int maxBatchSz,
bool requireExactNNLen) {
ctx = context;

maxBatchSize = maxBatchSz;
modelVersion = loadedModel->modelDesc.modelVersion;

Expand All @@ -1108,7 +1112,7 @@ struct ComputeHandle {
if(getInferLibVersion() / 100 != NV_TENSORRT_VERSION / 100) {
throw StringError("TensorRT backend: detected incompatible version of TensorRT library");
}

trtLogger.setLogger(logger);

auto builder = unique_ptr<IBuilder>(createInferBuilder(trtLogger));
@@ -1318,7 +1322,15 @@ struct ComputeHandle {
tuneMutex.unlock();
} else {
tuneMutex.unlock();
planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
{
int gpuId;
cudaGetDevice(&gpuId);
auto& mutex = mutexPerGpu[gpuId];
mutex.lock();
planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
mutex.unlock();
}

Comment on lines +1325 to +1333

Owner:

Shouldn't this code be gated behind the TENSORRT_CUDA_GRAPH define? Is there any other code that should be gated that isn't as well?

Owner:

Also, is auto& mutex = mutexPerGpu[gpuId]; safe?

This is a global map that starts uninitialized, so the map itself is subject to race conditions, since retrieving a value out of it involves a mutation, right?

Nitpick: for the mutex lock/unlock, normally I think we would use a std::lock_guard so that RAII guarantees unlock even if the build of the network raises (although admittedly, KataGo is generally written so that exceptions in this kind of code are fatal anyway). Is that easy to do?
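
A minimal sketch of what that could look like, combining the lock_guard suggestion with a race-safe lookup of the per-GPU mutex. The names mutexForGpu, mapMutex, and buildPlanForGpu are illustrative only, not code from this PR:

```cpp
#include <mutex>
#include <unordered_map>

// A second mutex guards the map so that the lazy insertion done by operator[]
// cannot race; std::lock_guard guarantees the per-GPU mutex is released even
// if the plan build throws.
static std::mutex mapMutex;
static std::unordered_map<int, std::mutex> mutexPerGpu;

std::mutex& mutexForGpu(int gpuId) {
  std::lock_guard<std::mutex> mapLock(mapMutex);
  return mutexPerGpu[gpuId];  // references to unordered_map elements stay valid after inserts
}

void buildPlanForGpu(int gpuId) {
  std::lock_guard<std::mutex> lock(mutexForGpu(gpuId));  // RAII unlock
  // planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
}
```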

if(!planBuffer) {
throw StringError("TensorRT backend: failed to create plan");
}
@@ -1546,15 +1558,15 @@ struct InputBuffers {
size_t scoreValueResultBufferBytes;
size_t ownershipResultBufferBytes;

unique_ptr<float[]> maskInputs; // Host pointer
unique_ptr<float[]> spatialInputs; // Host pointer
unique_ptr<float[]> globalInputs; // Host pointer
unique_ptr<float[]> metaInputs; // Host pointer
unique_ptr<float[]> policyPassResults; // Host pointer
unique_ptr<float[]> policyResults; // Host pointer
unique_ptr<float[]> valueResults; // Host pointer
unique_ptr<float[]> scoreValueResults; // Host pointer
unique_ptr<float[]> ownershipResults; // Host pointer
float* maskInputs; // Host pointer
float* spatialInputs; // Host pointer
float* globalInputs; // Host pointer
float* metaInputs; // Host pointer
float* policyPassResults; // Host pointer
float* policyResults; // Host pointer
float* valueResults; // Host pointer
float* scoreValueResults; // Host pointer
float* ownershipResults; // Host pointer

InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) {
const ModelDesc& m = loadedModel->modelDesc;
@@ -1602,15 +1614,15 @@ struct InputBuffers {
scoreValueResultBufferBytes = maxBatchSize * singleScoreValueResultBytes;
ownershipResultBufferBytes = maxBatchSize * singleOwnershipResultBytes;

maskInputs = make_unique<float[]>(maxBatchSize * singleMaskElts);
spatialInputs = make_unique<float[]>(maxBatchSize * singleInputElts);
globalInputs = make_unique<float[]>(maxBatchSize * singleInputGlobalElts);
metaInputs = make_unique<float[]>(maxBatchSize * singleInputMetaElts);
policyPassResults = make_unique<float[]>(maxBatchSize * singlePolicyPassResultElts);
policyResults = make_unique<float[]>(maxBatchSize * singlePolicyResultElts);
valueResults = make_unique<float[]>(maxBatchSize * singleValueResultElts);
scoreValueResults = make_unique<float[]>(maxBatchSize * singleScoreValueResultElts);
ownershipResults = make_unique<float[]>(maxBatchSize * singleOwnershipResultElts);
cudaMallocHost((void**)&maskInputs, maxBatchSize * singleMaskElts * sizeof(float));
cudaMallocHost((void**)&spatialInputs, maxBatchSize * singleInputElts * sizeof(float));
cudaMallocHost((void**)&globalInputs, maxBatchSize * singleInputGlobalElts * sizeof(float));
cudaMallocHost((void**)&metaInputs, maxBatchSize * singleInputMetaElts * sizeof(float));
cudaMallocHost((void**)&policyPassResults, maxBatchSize * singlePolicyPassResultElts * sizeof(float));
cudaMallocHost((void**)&policyResults, maxBatchSize * singlePolicyResultElts * sizeof(float));
cudaMallocHost((void**)&valueResults, maxBatchSize * singleValueResultElts * sizeof(float));
cudaMallocHost((void**)&scoreValueResults, maxBatchSize * singleScoreValueResultElts * sizeof(float));
cudaMallocHost((void**)&ownershipResults, maxBatchSize * singleOwnershipResultElts * sizeof(float));
Comment on lines +1617 to +1625

Owner:

Since we're changing all of these to raw pointers, does this require a corresponding free operation somewhere?
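
One possible shape of that cleanup, sketched as a small RAII wrapper. PinnedHostBuffer is a hypothetical name, not from this PR; equivalently, an InputBuffers destructor could simply call cudaFreeHost on each pointer:

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Hypothetical RAII wrapper showing the cleanup the raw cudaMallocHost pointers
// above would need: each allocation gets a matching cudaFreeHost, guaranteed by
// the destructor.
struct PinnedHostBuffer {
  float* data = nullptr;
  explicit PinnedHostBuffer(std::size_t numElts) {
    cudaMallocHost((void**)&data, numElts * sizeof(float));
  }
  ~PinnedHostBuffer() { cudaFreeHost(data); }  // cudaFreeHost(nullptr) is a no-op
  PinnedHostBuffer(const PinnedHostBuffer&) = delete;
  PinnedHostBuffer& operator=(const PinnedHostBuffer&) = delete;
};
```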

}

InputBuffers() = delete;
@@ -1656,7 +1668,6 @@ void NeuralNet::getOutput(
const float* rowSpatial = inputBufs[nIdx]->rowSpatialBuf.data();
const float* rowMeta = inputBufs[nIdx]->rowMetaBuf.data();
const bool hasRowMeta = inputBufs[nIdx]->hasRowMeta;
copy(rowGlobal, rowGlobal + numGlobalFeatures, rowGlobalInput);
std::copy(rowGlobal,rowGlobal+numGlobalFeatures,rowGlobalInput);
if(numMetaFeatures > 0) {
testAssert(rowMeta != NULL);
@@ -1696,89 +1707,123 @@
const int numPolicyChannels = inputBuffers->singlePolicyPassResultElts;
assert(inputBuffers->singlePolicyResultElts == numPolicyChannels * nnXLen * nnYLen);

// Transfers from host memory to device memory are asynchronous with respect to the host
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMask"),
inputBuffers->maskInputs.get(),
inputBuffers->singleMaskBytes * batchSize,
cudaMemcpyHostToDevice));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputSpatial"),
inputBuffers->spatialInputs.get(),
inputBuffers->singleInputBytes * batchSize,
cudaMemcpyHostToDevice));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputGlobal"),
inputBuffers->globalInputs.get(),
inputBuffers->singleInputGlobalBytes * batchSize,
cudaMemcpyHostToDevice));
if(numMetaFeatures > 0) {
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMeta"),
inputBuffers->metaInputs.get(),
inputBuffers->singleInputMetaBytes * batchSize,
cudaMemcpyHostToDevice));
#ifdef TENSORRT_CUDA_GRAPH
if(gpuHandle->cudaGraphExecs.empty()) {
gpuHandle->cudaGraphs.resize(inputBuffers->maxBatchSize + 1);
gpuHandle->cudaGraphExecs.resize(inputBuffers->maxBatchSize + 1);
}
auto& graph = gpuHandle->cudaGraphs[batchSize];
auto& instance = gpuHandle->cudaGraphExecs[batchSize];
if(instance == nullptr) { // First evaluation with current batchsize. Initialize cuda graph
int gpuId;
cudaGetDevice(&gpuId);
auto& mutex = gpuHandle->mutexPerGpu[gpuId];
mutex.lock();
Comment on lines +1718 to +1721

Owner:

If we're going to have repeated code that does cudaGetDevice and grabs a mutex, maybe that should be factored out as a helper, so that it can be written once safely (with the unsafe access to mutexPerGpu itself fixed).
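
A sketch of such a helper, assuming the race-safe mutexForGpu lookup from the earlier comment; lockCurrentGpu is an illustrative name:

```cpp
#include <mutex>
#include <cuda_runtime.h>

// Query the current device and take its lock in one call, so the pattern is
// written once. mutexForGpu is assumed to guard its map access internally.
std::mutex& mutexForGpu(int gpuId);  // see earlier sketch

std::unique_lock<std::mutex> lockCurrentGpu() {
  int gpuId = 0;
  cudaGetDevice(&gpuId);
  return std::unique_lock<std::mutex>(mutexForGpu(gpuId));
}

// Usage at either call site (plan serialization or graph capture):
//   auto gpuLock = lockCurrentGpu();
//   // ... buildSerializedNetwork(...) or cudaStreamBeginCapture/EndCapture ...
```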

#endif

auto maskInputDims = gpuHandle->getBufferDynamicShape("InputMask", batchSize);
auto spatialInputDims = gpuHandle->getBufferDynamicShape("InputSpatial", batchSize);
auto globalInputDims = gpuHandle->getBufferDynamicShape("InputGlobal", batchSize);
auto maskInputDims = gpuHandle->getBufferDynamicShape("InputMask", batchSize);
auto spatialInputDims = gpuHandle->getBufferDynamicShape("InputSpatial", batchSize);
auto globalInputDims = gpuHandle->getBufferDynamicShape("InputGlobal", batchSize);

gpuHandle->exec->setInputShape("InputMask", maskInputDims);
gpuHandle->exec->setInputShape("InputSpatial", spatialInputDims);
gpuHandle->exec->setInputShape("InputGlobal", globalInputDims);
gpuHandle->exec->setInputShape("InputMask", maskInputDims);
gpuHandle->exec->setInputShape("InputSpatial", spatialInputDims);
gpuHandle->exec->setInputShape("InputGlobal", globalInputDims);

if(numMetaFeatures > 0) {
auto metaInputDims = gpuHandle->getBufferDynamicShape("InputMeta", batchSize);
gpuHandle->exec->setInputShape("InputMeta", metaInputDims);
}
if(numMetaFeatures > 0) {
auto metaInputDims = gpuHandle->getBufferDynamicShape("InputMeta", batchSize);
gpuHandle->exec->setInputShape("InputMeta", metaInputDims);
}
#ifdef TENSORRT_CUDA_GRAPH
gpuHandle->exec->enqueueV3(cudaStreamPerThread); // Warm up
cudaStreamBeginCapture(
cudaStreamPerThread, cudaStreamCaptureModeThreadLocal); // In case other server thread is also capturing.
#endif
// Transfers from host memory to device memory are asynchronous with respect to the host
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMask"),
inputBuffers->maskInputs,
inputBuffers->singleMaskBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputSpatial"),
inputBuffers->spatialInputs,
inputBuffers->singleInputBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputGlobal"),
inputBuffers->globalInputs,
inputBuffers->singleInputGlobalBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
if(numMetaFeatures > 0) {
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMeta"),
inputBuffers->metaInputs,
inputBuffers->singleInputMetaBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
}

gpuHandle->exec->enqueueV3(cudaStreamPerThread);

CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->policyPassResults.get(),
gpuHandle->getBuffer("OutputPolicyPass"),
inputBuffers->singlePolicyPassResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->policyResults.get(),
gpuHandle->getBuffer("OutputPolicy"),
inputBuffers->singlePolicyResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->valueResults.get(),
gpuHandle->getBuffer("OutputValue"),
inputBuffers->singleValueResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->scoreValueResults.get(),
gpuHandle->getBuffer("OutputScoreValue"),
inputBuffers->singleScoreValueResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->ownershipResults.get(),
gpuHandle->getBuffer("OutputOwnership"),
inputBuffers->singleOwnershipResultBytes * batchSize,
cudaMemcpyDeviceToHost));
gpuHandle->exec->enqueueV3(cudaStreamPerThread);

CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->policyPassResults,
gpuHandle->getBuffer("OutputPolicyPass"),
inputBuffers->singlePolicyPassResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->policyResults,
gpuHandle->getBuffer("OutputPolicy"),
inputBuffers->singlePolicyResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->valueResults,
gpuHandle->getBuffer("OutputValue"),
inputBuffers->singleValueResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->scoreValueResults,
gpuHandle->getBuffer("OutputScoreValue"),
inputBuffers->singleScoreValueResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->ownershipResults,
gpuHandle->getBuffer("OutputOwnership"),
inputBuffers->singleOwnershipResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
#ifdef TENSORRT_CUDA_GRAPH
cudaStreamEndCapture(cudaStreamPerThread, &graph);
cudaGraphInstantiate(&instance, graph, 0);
mutex.unlock();
} // if (instance == nullptr)
cudaGraphLaunch(instance, cudaStreamPerThread);
#endif
cudaStreamSynchronize(cudaStreamPerThread);
gpuHandle->printDebugOutput(batchSize);
gpuHandle->trtErrorRecorder.clear();

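For reference, the control flow added to getOutput above follows the usual CUDA graph capture-and-replay pattern: the first evaluation at a given batch size does a warm-up enqueue, captures the async copies plus enqueueV3 into a graph, and instantiates it; later evaluations only relaunch the instantiated graph. A distilled sketch, not the PR's code (GraphCache and recordWork are illustrative, error checking omitted):

```cpp
#include <cuda_runtime.h>

// Cache of one captured graph and its executable instance, e.g. per batch size.
struct GraphCache {
  cudaGraph_t graph = nullptr;
  cudaGraphExec_t exec = nullptr;
};

void runWithGraph(GraphCache& cache, cudaStream_t stream, void (*recordWork)(cudaStream_t)) {
  if(cache.exec == nullptr) {
    recordWork(stream);  // warm-up run, executed eagerly
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
    recordWork(stream);  // recorded into the graph, not executed
    cudaStreamEndCapture(stream, &cache.graph);
    cudaGraphInstantiate(&cache.exec, cache.graph, 0);  // flags-style call, as in the diff
  }
  cudaGraphLaunch(cache.exec, stream);  // replay: one launch for the whole recorded sequence
  cudaStreamSynchronize(stream);
}
```

Because the captured cudaMemcpyAsync nodes record their pointer arguments, the host buffers must keep fixed addresses across replays, which appears to be why InputBuffers switches from unique_ptr arrays to pinned cudaMallocHost allocations.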