Cuda graph implementation on trt backend #1071
base: master
Changes from all commits: d04849c, 5336a81, 8242d03, 476d17f, a2d6411, 1758eb2, 84f5510, 51c9b02
@@ -1089,7 +1089,11 @@ struct ComputeHandle {
   unique_ptr<IRuntime> runtime;
   unique_ptr<ICudaEngine> engine;
   unique_ptr<IExecutionContext> exec;
+#ifdef TENSORRT_CUDA_GRAPH
+  vector<cudaGraph_t> cudaGraphs;
+  vector<cudaGraphExec_t> cudaGraphExecs;
+  inline static unordered_map<int, mutex> mutexPerGpu;
+#endif
   ComputeHandle(
     Logger* logger,
     const cudaDeviceProp* prop,
@@ -1098,7 +1102,7 @@ struct ComputeHandle {
     int maxBatchSz,
     bool requireExactNNLen) {
     ctx = context;

     maxBatchSize = maxBatchSz;
     modelVersion = loadedModel->modelDesc.modelVersion;
@@ -1108,7 +1112,7 @@ struct ComputeHandle {
     if(getInferLibVersion() / 100 != NV_TENSORRT_VERSION / 100) {
       throw StringError("TensorRT backend: detected incompatible version of TensorRT library");
     }

     trtLogger.setLogger(logger);

     auto builder = unique_ptr<IBuilder>(createInferBuilder(trtLogger));
@@ -1318,7 +1322,15 @@ struct ComputeHandle {
       tuneMutex.unlock();
     } else {
       tuneMutex.unlock();
-      planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
+      {
+        int gpuId;
+        cudaGetDevice(&gpuId);
+        auto& mutex = mutexPerGpu[gpuId];
+        mutex.lock();
+        planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
+        mutex.unlock();
+      }

     if(!planBuffer) {
       throw StringError("TensorRT backend: failed to create plan");
     }
@@ -1546,15 +1558,15 @@ struct InputBuffers {
   size_t scoreValueResultBufferBytes;
   size_t ownershipResultBufferBytes;

-  unique_ptr<float[]> maskInputs;        // Host pointer
-  unique_ptr<float[]> spatialInputs;     // Host pointer
-  unique_ptr<float[]> globalInputs;      // Host pointer
-  unique_ptr<float[]> metaInputs;        // Host pointer
-  unique_ptr<float[]> policyPassResults; // Host pointer
-  unique_ptr<float[]> policyResults;     // Host pointer
-  unique_ptr<float[]> valueResults;      // Host pointer
-  unique_ptr<float[]> scoreValueResults; // Host pointer
-  unique_ptr<float[]> ownershipResults;  // Host pointer
+  float* maskInputs;        // Host pointer
+  float* spatialInputs;     // Host pointer
+  float* globalInputs;      // Host pointer
+  float* metaInputs;        // Host pointer
+  float* policyPassResults; // Host pointer
+  float* policyResults;     // Host pointer
+  float* valueResults;      // Host pointer
+  float* scoreValueResults; // Host pointer
+  float* ownershipResults;  // Host pointer

   InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) {
     const ModelDesc& m = loadedModel->modelDesc;

@@ -1602,15 +1614,15 @@ struct InputBuffers {
     scoreValueResultBufferBytes = maxBatchSize * singleScoreValueResultBytes;
     ownershipResultBufferBytes = maxBatchSize * singleOwnershipResultBytes;

-    maskInputs = make_unique<float[]>(maxBatchSize * singleMaskElts);
-    spatialInputs = make_unique<float[]>(maxBatchSize * singleInputElts);
-    globalInputs = make_unique<float[]>(maxBatchSize * singleInputGlobalElts);
-    metaInputs = make_unique<float[]>(maxBatchSize * singleInputMetaElts);
-    policyPassResults = make_unique<float[]>(maxBatchSize * singlePolicyPassResultElts);
-    policyResults = make_unique<float[]>(maxBatchSize * singlePolicyResultElts);
-    valueResults = make_unique<float[]>(maxBatchSize * singleValueResultElts);
-    scoreValueResults = make_unique<float[]>(maxBatchSize * singleScoreValueResultElts);
-    ownershipResults = make_unique<float[]>(maxBatchSize * singleOwnershipResultElts);
+    cudaMallocHost((void**)&maskInputs, maxBatchSize * singleMaskElts * sizeof(float));
+    cudaMallocHost((void**)&spatialInputs, maxBatchSize * singleInputElts * sizeof(float));
+    cudaMallocHost((void**)&globalInputs, maxBatchSize * singleInputGlobalElts * sizeof(float));
+    cudaMallocHost((void**)&metaInputs, maxBatchSize * singleInputMetaElts * sizeof(float));
+    cudaMallocHost((void**)&policyPassResults, maxBatchSize * singlePolicyPassResultElts * sizeof(float));
+    cudaMallocHost((void**)&policyResults, maxBatchSize * singlePolicyResultElts * sizeof(float));
+    cudaMallocHost((void**)&valueResults, maxBatchSize * singleValueResultElts * sizeof(float));
+    cudaMallocHost((void**)&scoreValueResults, maxBatchSize * singleScoreValueResultElts * sizeof(float));
+    cudaMallocHost((void**)&ownershipResults, maxBatchSize * singleOwnershipResultElts * sizeof(float));
Comment on lines +1617 to +1625:
Since we're changing all of these to raw pointers, does this require a corresponding free operation somewhere?
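The switch from make_unique to cudaMallocHost allocates pinned (page-locked) host memory, which cudaMemcpyAsync needs to be truly asynchronous and which the copies captured into the CUDA graph rely on. On the question above: yes, pointers from cudaMallocHost are not freed automatically, so they would need matching cudaFreeHost calls. A minimal sketch of what that could look like, assuming a destructor is added to InputBuffers (hypothetical, not part of this diff):

  // Hypothetical sketch, not in this PR: pinned host buffers allocated with
  // cudaMallocHost must be released with cudaFreeHost.
  ~InputBuffers() {
    cudaFreeHost(maskInputs);
    cudaFreeHost(spatialInputs);
    cudaFreeHost(globalInputs);
    cudaFreeHost(metaInputs);
    cudaFreeHost(policyPassResults);
    cudaFreeHost(policyResults);
    cudaFreeHost(valueResults);
    cudaFreeHost(scoreValueResults);
    cudaFreeHost(ownershipResults);
  }

Checking the cudaError_t returned by each cudaMallocHost (e.g. via CUDA_ERR) would also catch allocation failures, which make_unique previously reported by throwing.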
   }

   InputBuffers() = delete;
@@ -1656,7 +1668,6 @@ void NeuralNet::getOutput(
   const float* rowSpatial = inputBufs[nIdx]->rowSpatialBuf.data();
   const float* rowMeta = inputBufs[nIdx]->rowMetaBuf.data();
   const bool hasRowMeta = inputBufs[nIdx]->hasRowMeta;
-  copy(rowGlobal, rowGlobal + numGlobalFeatures, rowGlobalInput);
   std::copy(rowGlobal,rowGlobal+numGlobalFeatures,rowGlobalInput);
   if(numMetaFeatures > 0) {
     testAssert(rowMeta != NULL);
@@ -1696,89 +1707,123 @@ void NeuralNet::getOutput(
   const int numPolicyChannels = inputBuffers->singlePolicyPassResultElts;
   assert(inputBuffers->singlePolicyResultElts == numPolicyChannels * nnXLen * nnYLen);

-  // Transfers from host memory to device memory are asynchronous with respect to the host
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpyAsync(
-      gpuHandle->getBuffer("InputMask"),
-      inputBuffers->maskInputs.get(),
-      inputBuffers->singleMaskBytes * batchSize,
-      cudaMemcpyHostToDevice));
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpyAsync(
-      gpuHandle->getBuffer("InputSpatial"),
-      inputBuffers->spatialInputs.get(),
-      inputBuffers->singleInputBytes * batchSize,
-      cudaMemcpyHostToDevice));
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpyAsync(
-      gpuHandle->getBuffer("InputGlobal"),
-      inputBuffers->globalInputs.get(),
-      inputBuffers->singleInputGlobalBytes * batchSize,
-      cudaMemcpyHostToDevice));
-  if(numMetaFeatures > 0) {
-    CUDA_ERR(
-      "getOutput",
-      cudaMemcpyAsync(
-        gpuHandle->getBuffer("InputMeta"),
-        inputBuffers->metaInputs.get(),
-        inputBuffers->singleInputMetaBytes * batchSize,
-        cudaMemcpyHostToDevice));
-  }
+#ifdef TENSORRT_CUDA_GRAPH
+  if(gpuHandle->cudaGraphExecs.empty()) {
+    gpuHandle->cudaGraphs.resize(inputBuffers->maxBatchSize + 1);
+    gpuHandle->cudaGraphExecs.resize(inputBuffers->maxBatchSize + 1);
+  }
+  auto& graph = gpuHandle->cudaGraphs[batchSize];
+  auto& instance = gpuHandle->cudaGraphExecs[batchSize];
+  if(instance == nullptr) { // First evaluation with current batchsize. Initialize cuda graph
+    int gpuId;
+    cudaGetDevice(&gpuId);
+    auto& mutex = gpuHandle->mutexPerGpu[gpuId];
+    mutex.lock();
Comment on lines +1718 to +1721:
If we're going to have repeat code that does cudaGetDevice and grabs a mutex, maybe that should be factored out as a helper, so that it can be written once safely (with unsafe access to mutexPerGpu itself fixed).
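One way such a helper could look, sketched as a suggestion (the names and placement are hypothetical, not from this PR): protect the map lookup with its own mutex so a first-time insertion into mutexPerGpu cannot race, and hand back an RAII lock on the per-GPU mutex.

  #include <mutex>
  #include <unordered_map>
  #include <cuda_runtime.h>

  // Hypothetical helper: lock the mutex belonging to the current device.
  // The map itself is guarded by mapMutex because operator[] may insert.
  static std::unique_lock<std::mutex> lockCurrentGpu() {
    static std::mutex mapMutex;
    static std::unordered_map<int, std::mutex> mutexPerGpu;
    int gpuId;
    cudaGetDevice(&gpuId);
    std::mutex* gpuMutex;
    {
      std::lock_guard<std::mutex> guard(mapMutex);
      gpuMutex = &mutexPerGpu[gpuId];
    }
    // unordered_map is node-based, so the reference stays valid after rehashing.
    return std::unique_lock<std::mutex>(*gpuMutex);
  }

Call sites would then shrink to auto lock = lockCurrentGpu();, and the lock releases itself at scope exit, which also addresses the std::lock_guard comment further down.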
+#endif

-  auto maskInputDims = gpuHandle->getBufferDynamicShape("InputMask", batchSize);
-  auto spatialInputDims = gpuHandle->getBufferDynamicShape("InputSpatial", batchSize);
-  auto globalInputDims = gpuHandle->getBufferDynamicShape("InputGlobal", batchSize);
+    auto maskInputDims = gpuHandle->getBufferDynamicShape("InputMask", batchSize);
+    auto spatialInputDims = gpuHandle->getBufferDynamicShape("InputSpatial", batchSize);
+    auto globalInputDims = gpuHandle->getBufferDynamicShape("InputGlobal", batchSize);

-  gpuHandle->exec->setInputShape("InputMask", maskInputDims);
-  gpuHandle->exec->setInputShape("InputSpatial", spatialInputDims);
-  gpuHandle->exec->setInputShape("InputGlobal", globalInputDims);
+    gpuHandle->exec->setInputShape("InputMask", maskInputDims);
+    gpuHandle->exec->setInputShape("InputSpatial", spatialInputDims);
+    gpuHandle->exec->setInputShape("InputGlobal", globalInputDims);

-  if(numMetaFeatures > 0) {
-    auto metaInputDims = gpuHandle->getBufferDynamicShape("InputMeta", batchSize);
-    gpuHandle->exec->setInputShape("InputMeta", metaInputDims);
-  }
+    if(numMetaFeatures > 0) {
+      auto metaInputDims = gpuHandle->getBufferDynamicShape("InputMeta", batchSize);
+      gpuHandle->exec->setInputShape("InputMeta", metaInputDims);
+    }
+#ifdef TENSORRT_CUDA_GRAPH
+    gpuHandle->exec->enqueueV3(cudaStreamPerThread); // Warm up
+    cudaStreamBeginCapture(
+      cudaStreamPerThread, cudaStreamCaptureModeThreadLocal); // In case other server thread is also capturing.
+#endif
+    // Transfers from host memory to device memory are asynchronous with respect to the host
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        gpuHandle->getBuffer("InputMask"),
+        inputBuffers->maskInputs,
+        inputBuffers->singleMaskBytes * batchSize,
+        cudaMemcpyHostToDevice,
+        cudaStreamPerThread));
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        gpuHandle->getBuffer("InputSpatial"),
+        inputBuffers->spatialInputs,
+        inputBuffers->singleInputBytes * batchSize,
+        cudaMemcpyHostToDevice,
+        cudaStreamPerThread));
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        gpuHandle->getBuffer("InputGlobal"),
+        inputBuffers->globalInputs,
+        inputBuffers->singleInputGlobalBytes * batchSize,
+        cudaMemcpyHostToDevice,
+        cudaStreamPerThread));
+    if(numMetaFeatures > 0) {
+      CUDA_ERR(
+        "getOutput",
+        cudaMemcpyAsync(
+          gpuHandle->getBuffer("InputMeta"),
+          inputBuffers->metaInputs,
+          inputBuffers->singleInputMetaBytes * batchSize,
+          cudaMemcpyHostToDevice,
+          cudaStreamPerThread));
+    }
-  gpuHandle->exec->enqueueV3(cudaStreamPerThread);
-
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpy(
-      inputBuffers->policyPassResults.get(),
-      gpuHandle->getBuffer("OutputPolicyPass"),
-      inputBuffers->singlePolicyPassResultBytes * batchSize,
-      cudaMemcpyDeviceToHost));
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpy(
-      inputBuffers->policyResults.get(),
-      gpuHandle->getBuffer("OutputPolicy"),
-      inputBuffers->singlePolicyResultBytes * batchSize,
-      cudaMemcpyDeviceToHost));
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpy(
-      inputBuffers->valueResults.get(),
-      gpuHandle->getBuffer("OutputValue"),
-      inputBuffers->singleValueResultBytes * batchSize,
-      cudaMemcpyDeviceToHost));
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpy(
-      inputBuffers->scoreValueResults.get(),
-      gpuHandle->getBuffer("OutputScoreValue"),
-      inputBuffers->singleScoreValueResultBytes * batchSize,
-      cudaMemcpyDeviceToHost));
-  CUDA_ERR(
-    "getOutput",
-    cudaMemcpy(
-      inputBuffers->ownershipResults.get(),
-      gpuHandle->getBuffer("OutputOwnership"),
-      inputBuffers->singleOwnershipResultBytes * batchSize,
-      cudaMemcpyDeviceToHost));
+    gpuHandle->exec->enqueueV3(cudaStreamPerThread);
+
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        inputBuffers->policyPassResults,
+        gpuHandle->getBuffer("OutputPolicyPass"),
+        inputBuffers->singlePolicyPassResultBytes * batchSize,
+        cudaMemcpyDeviceToHost,
+        cudaStreamPerThread));
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        inputBuffers->policyResults,
+        gpuHandle->getBuffer("OutputPolicy"),
+        inputBuffers->singlePolicyResultBytes * batchSize,
+        cudaMemcpyDeviceToHost,
+        cudaStreamPerThread));
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        inputBuffers->valueResults,
+        gpuHandle->getBuffer("OutputValue"),
+        inputBuffers->singleValueResultBytes * batchSize,
+        cudaMemcpyDeviceToHost,
+        cudaStreamPerThread));
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        inputBuffers->scoreValueResults,
+        gpuHandle->getBuffer("OutputScoreValue"),
+        inputBuffers->singleScoreValueResultBytes * batchSize,
+        cudaMemcpyDeviceToHost,
+        cudaStreamPerThread));
+    CUDA_ERR(
+      "getOutput",
+      cudaMemcpyAsync(
+        inputBuffers->ownershipResults,
+        gpuHandle->getBuffer("OutputOwnership"),
+        inputBuffers->singleOwnershipResultBytes * batchSize,
+        cudaMemcpyDeviceToHost,
+        cudaStreamPerThread));
+#ifdef TENSORRT_CUDA_GRAPH
+    cudaStreamEndCapture(cudaStreamPerThread, &graph);
+    cudaGraphInstantiate(&instance, graph, 0);
+    mutex.unlock();
+  } // if (instance == nullptr)
+  cudaGraphLaunch(instance, cudaStreamPerThread);
+#endif
   cudaStreamSynchronize(cudaStreamPerThread);
   gpuHandle->printDebugOutput(batchSize);
   gpuHandle->trtErrorRecorder.clear();
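Each batch size gets its own graph and executable instance because the captured copy sizes, tensor shapes, and buffer addresses are frozen at capture time; this is also why the host buffers must be stable, long-lived allocations. The flow above is the standard CUDA graph capture/replay pattern: run the work once as a warm-up outside capture (so lazy initialization is not recorded), record the same work with stream capture, instantiate it, and replay the instance on later calls. A condensed, self-contained sketch of that pattern, with a dummy kernel standing in for the TensorRT enqueue and the memcpys (illustrative only, not code from this PR):

  #include <cuda_runtime.h>

  __global__ void dummyKernel(float* data) { data[threadIdx.x] *= 2.0f; }

  // devBuf must be device memory (e.g. from cudaMalloc); instance starts as nullptr.
  void runWithGraph(float* devBuf, cudaGraphExec_t& instance) {
    cudaStream_t stream = cudaStreamPerThread;
    if(instance == nullptr) {
      // Warm-up launch outside capture, as in the PR above.
      dummyKernel<<<1, 32, 0, stream>>>(devBuf);
      cudaGraph_t graph;
      // Thread-local capture mode: other threads may capture their own streams concurrently.
      cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
      dummyKernel<<<1, 32, 0, stream>>>(devBuf); // recorded into the graph, not executed
      cudaStreamEndCapture(stream, &graph);
      cudaGraphInstantiate(&instance, graph, 0); // CUDA 12 signature, as used in this PR
    }
    cudaGraphLaunch(instance, stream); // replay with minimal per-call launch overhead
    cudaStreamSynchronize(stream);
  }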
Comment:
Shouldn't this code be gated behind the TENSORRT_CUDA_GRAPH define? Is there any other code that should be gated that isn't as well?

Comment:
Also, is auto& mutex = mutexPerGpu[gpuId]; safe? This is a global map that starts uninitialized, so the map itself is subject to race conditions, since retrieving a value out of it involves a mutation, right?

Nitpick: for the mutex lock/unlock, normally I think we would use a std::lock_guard so that RAII guarantees the unlock even if building the network throws (although admittedly KataGo is generally written so that exceptions in this kind of code are fatal anyway). Is that easy to do?
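For the nitpick, a std::lock_guard version of the plan-building critical section from this PR could look like the following sketch (behavior unchanged; the unlock becomes exception-safe, though access to mutexPerGpu itself would still need the protection discussed above):

  {
    int gpuId;
    cudaGetDevice(&gpuId);
    auto& mutex = mutexPerGpu[gpuId]; // still assumes map access is made thread-safe
    std::lock_guard<std::mutex> guard(mutex);
    planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
  } // guard releases the mutex here, even if buildSerializedNetwork throws

The graph-capture path in getOutput is harder to convert directly, because its lock and unlock span the #ifdef boundary; that is another argument for a helper returning a std::unique_lock, as suggested earlier.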