Cuda graph implementation on trt backend #1071

Open
wants to merge 8 commits into base: master
8 changes: 7 additions & 1 deletion cpp/CMakeLists.txt
@@ -40,7 +40,9 @@ set(USE_AVX2 0 CACHE BOOL "Compile with AVX2")
set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 50. Compiling with this will use more memory and slow down KataGo, even when playing on boards of size 19.")

set(USE_CACHE_TENSORRT_PLAN 0 CACHE BOOL "Use TENSORRT plan cache. May use a lot of disk space. Only applies when USE_BACKEND is TENSORRT.")
set(USE_TENSORRT_CUDA_GRAPH 0 CACHE BOOL "Use cudaGraph when using TENSORRT backend.")
mark_as_advanced(USE_CACHE_TENSORRT_PLAN)
mark_as_advanced(USE_TENSORRT_CUDA_GRAPH)

#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------

@@ -89,10 +91,14 @@ elseif(USE_BACKEND STREQUAL "TENSORRT")
)
if(USE_CACHE_TENSORRT_PLAN AND (NOT BUILD_DISTRIBUTED))
message(STATUS "-DUSE_CACHE_TENSORRT_PLAN is set, using TENSORRT plan cache.")
add_compile_definitions(CACHE_TENSORRT_PLAN)
add_compile_definitions(CACHE_TENSORRT_PLAN)
elseif(USE_CACHE_TENSORRT_PLAN AND BUILD_DISTRIBUTED)
message(FATAL_ERROR "Combining USE_CACHE_TENSORRT_PLAN with BUILD_DISTRIBUTED is not supported - it would consume excessive disk space and might worsen performance every time models are updated. Use only one at a time in a given build of KataGo.")
endif()
if(USE_TENSORRT_CUDA_GRAPH)
message(STATUS "-DUSE_TENSORRT_CUDA_GRAPH is set, using cuda graph at inference")
add_compile_definitions(TENSORRT_CUDA_GRAPH)
endif()
elseif(USE_BACKEND STREQUAL "METAL")
message(STATUS "-DUSE_BACKEND=METAL, using Metal backend.")
if(NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")
245 changes: 145 additions & 100 deletions cpp/neuralnet/trtbackend.cpp
@@ -1089,7 +1089,11 @@ struct ComputeHandle {
unique_ptr<IRuntime> runtime;
unique_ptr<ICudaEngine> engine;
unique_ptr<IExecutionContext> exec;

#ifdef TENSORRT_CUDA_GRAPH
vector<cudaGraph_t> cudaGraphs;
vector<cudaGraphExec_t> cudaGraphExecs;
inline static unordered_map<int, mutex> mutexPerGpu;
#endif
ComputeHandle(
Logger* logger,
const cudaDeviceProp* prop,
@@ -1098,7 +1102,7 @@
int maxBatchSz,
bool requireExactNNLen) {
ctx = context;

maxBatchSize = maxBatchSz;
modelVersion = loadedModel->modelDesc.modelVersion;

Expand All @@ -1108,7 +1112,7 @@ struct ComputeHandle {
if(getInferLibVersion() / 100 != NV_TENSORRT_VERSION / 100) {
throw StringError("TensorRT backend: detected incompatible version of TensorRT library");
}

trtLogger.setLogger(logger);

auto builder = unique_ptr<IBuilder>(createInferBuilder(trtLogger));
@@ -1318,7 +1322,15 @@ struct ComputeHandle {
tuneMutex.unlock();
} else {
tuneMutex.unlock();
planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
{
int gpuId;
cudaGetDevice(&gpuId);
auto& mutex = mutexPerGpu[gpuId];
mutex.lock();
planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
mutex.unlock();
}

Comment on lines +1325 to +1333

Owner:

Shouldn't this code be gated behind the TENSORRT_CUDA_GRAPH define? Is there any other code that should be gated that isn't as well?

Owner:

Also, is auto& mutex = mutexPerGpu[gpuId]; safe?

This is a global map that starts uninitialized, so the map itself is subject to race conditions, since retrieving a value out of it involves a mutation, right?

Nitpick: for the mutex lock/unlock, normally I think we would use a std::lock_guard so that RAII guarantees unlock even if the build of the network raises (although admittedly, KataGo is generally written so that exceptions in this kind of code are fatal anyway). Is that easy to do?
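
A minimal sketch of what that could look like, combining the lock_guard suggestion with a race-safe lookup of the per-GPU mutex. The names mutexForGpu, mapMutex, and buildPlanForGpu are illustrative only, not code from this PR:

```cpp
#include <mutex>
#include <unordered_map>

// A second mutex guards the map so that the lazy insertion done by operator[]
// cannot race; std::lock_guard guarantees the per-GPU mutex is released even
// if the plan build throws.
static std::mutex mapMutex;
static std::unordered_map<int, std::mutex> mutexPerGpu;

std::mutex& mutexForGpu(int gpuId) {
  std::lock_guard<std::mutex> mapLock(mapMutex);
  return mutexPerGpu[gpuId];  // references to unordered_map elements stay valid after inserts
}

void buildPlanForGpu(int gpuId) {
  std::lock_guard<std::mutex> lock(mutexForGpu(gpuId));  // RAII unlock
  // planBuffer.reset(builder->buildSerializedNetwork(*model->network, *config));
}
```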

if(!planBuffer) {
throw StringError("TensorRT backend: failed to create plan");
}
@@ -1546,15 +1558,15 @@ struct InputBuffers {
size_t scoreValueResultBufferBytes;
size_t ownershipResultBufferBytes;

unique_ptr<float[]> maskInputs; // Host pointer
unique_ptr<float[]> spatialInputs; // Host pointer
unique_ptr<float[]> globalInputs; // Host pointer
unique_ptr<float[]> metaInputs; // Host pointer
unique_ptr<float[]> policyPassResults; // Host pointer
unique_ptr<float[]> policyResults; // Host pointer
unique_ptr<float[]> valueResults; // Host pointer
unique_ptr<float[]> scoreValueResults; // Host pointer
unique_ptr<float[]> ownershipResults; // Host pointer
float* maskInputs; // Host pointer
float* spatialInputs; // Host pointer
float* globalInputs; // Host pointer
float* metaInputs; // Host pointer
float* policyPassResults; // Host pointer
float* policyResults; // Host pointer
float* valueResults; // Host pointer
float* scoreValueResults; // Host pointer
float* ownershipResults; // Host pointer

InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) {
const ModelDesc& m = loadedModel->modelDesc;
@@ -1602,15 +1614,15 @@ struct InputBuffers {
scoreValueResultBufferBytes = maxBatchSize * singleScoreValueResultBytes;
ownershipResultBufferBytes = maxBatchSize * singleOwnershipResultBytes;

maskInputs = make_unique<float[]>(maxBatchSize * singleMaskElts);
spatialInputs = make_unique<float[]>(maxBatchSize * singleInputElts);
globalInputs = make_unique<float[]>(maxBatchSize * singleInputGlobalElts);
metaInputs = make_unique<float[]>(maxBatchSize * singleInputMetaElts);
policyPassResults = make_unique<float[]>(maxBatchSize * singlePolicyPassResultElts);
policyResults = make_unique<float[]>(maxBatchSize * singlePolicyResultElts);
valueResults = make_unique<float[]>(maxBatchSize * singleValueResultElts);
scoreValueResults = make_unique<float[]>(maxBatchSize * singleScoreValueResultElts);
ownershipResults = make_unique<float[]>(maxBatchSize * singleOwnershipResultElts);
cudaMallocHost((void**)&maskInputs, maxBatchSize * singleMaskElts * sizeof(float));
cudaMallocHost((void**)&spatialInputs, maxBatchSize * singleInputElts * sizeof(float));
cudaMallocHost((void**)&globalInputs, maxBatchSize * singleInputGlobalElts * sizeof(float));
cudaMallocHost((void**)&metaInputs, maxBatchSize * singleInputMetaElts * sizeof(float));
cudaMallocHost((void**)&policyPassResults, maxBatchSize * singlePolicyPassResultElts * sizeof(float));
cudaMallocHost((void**)&policyResults, maxBatchSize * singlePolicyResultElts * sizeof(float));
cudaMallocHost((void**)&valueResults, maxBatchSize * singleValueResultElts * sizeof(float));
cudaMallocHost((void**)&scoreValueResults, maxBatchSize * singleScoreValueResultElts * sizeof(float));
cudaMallocHost((void**)&ownershipResults, maxBatchSize * singleOwnershipResultElts * sizeof(float));
Comment on lines +1617 to +1625

Owner:

Since we're changing all of these to raw pointers, does this require a corresponding free operation somewhere?
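
One possible shape of that cleanup, sketched as a small RAII wrapper. PinnedHostBuffer is a hypothetical name, not from this PR; equivalently, an InputBuffers destructor could simply call cudaFreeHost on each pointer:

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Hypothetical RAII wrapper showing the cleanup the raw cudaMallocHost pointers
// above would need: each allocation gets a matching cudaFreeHost, guaranteed by
// the destructor.
struct PinnedHostBuffer {
  float* data = nullptr;
  explicit PinnedHostBuffer(std::size_t numElts) {
    cudaMallocHost((void**)&data, numElts * sizeof(float));
  }
  ~PinnedHostBuffer() { cudaFreeHost(data); }  // cudaFreeHost(nullptr) is a no-op
  PinnedHostBuffer(const PinnedHostBuffer&) = delete;
  PinnedHostBuffer& operator=(const PinnedHostBuffer&) = delete;
};
```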

}

InputBuffers() = delete;
@@ -1656,7 +1668,6 @@ void NeuralNet::getOutput(
const float* rowSpatial = inputBufs[nIdx]->rowSpatialBuf.data();
const float* rowMeta = inputBufs[nIdx]->rowMetaBuf.data();
const bool hasRowMeta = inputBufs[nIdx]->hasRowMeta;
copy(rowGlobal, rowGlobal + numGlobalFeatures, rowGlobalInput);
std::copy(rowGlobal,rowGlobal+numGlobalFeatures,rowGlobalInput);
if(numMetaFeatures > 0) {
testAssert(rowMeta != NULL);
@@ -1696,89 +1707,123 @@
const int numPolicyChannels = inputBuffers->singlePolicyPassResultElts;
assert(inputBuffers->singlePolicyResultElts == numPolicyChannels * nnXLen * nnYLen);

// Transfers from host memory to device memory are asynchronous with respect to the host
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMask"),
inputBuffers->maskInputs.get(),
inputBuffers->singleMaskBytes * batchSize,
cudaMemcpyHostToDevice));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputSpatial"),
inputBuffers->spatialInputs.get(),
inputBuffers->singleInputBytes * batchSize,
cudaMemcpyHostToDevice));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputGlobal"),
inputBuffers->globalInputs.get(),
inputBuffers->singleInputGlobalBytes * batchSize,
cudaMemcpyHostToDevice));
if(numMetaFeatures > 0) {
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMeta"),
inputBuffers->metaInputs.get(),
inputBuffers->singleInputMetaBytes * batchSize,
cudaMemcpyHostToDevice));
#ifdef TENSORRT_CUDA_GRAPH
if(gpuHandle->cudaGraphExecs.empty()) {
gpuHandle->cudaGraphs.resize(inputBuffers->maxBatchSize + 1);
gpuHandle->cudaGraphExecs.resize(inputBuffers->maxBatchSize + 1);
}
auto& graph = gpuHandle->cudaGraphs[batchSize];
auto& instance = gpuHandle->cudaGraphExecs[batchSize];
if(instance == nullptr) { // First evaluation with current batchsize. Initialize cuda graph
int gpuId;
cudaGetDevice(&gpuId);
auto& mutex = gpuHandle->mutexPerGpu[gpuId];
mutex.lock();
Comment on lines +1718 to +1721

Owner:

If we're going to have repeated code that does cudaGetDevice and grabs a mutex, maybe that should be factored out as a helper, so that it can be written once safely (with the unsafe access to mutexPerGpu itself fixed).
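
A sketch of such a helper, assuming the race-safe mutexForGpu lookup from the earlier comment; lockCurrentGpu is an illustrative name:

```cpp
#include <mutex>
#include <cuda_runtime.h>

// Query the current device and take its lock in one call, so the pattern is
// written once. mutexForGpu is assumed to guard its map access internally.
std::mutex& mutexForGpu(int gpuId);  // see earlier sketch

std::unique_lock<std::mutex> lockCurrentGpu() {
  int gpuId = 0;
  cudaGetDevice(&gpuId);
  return std::unique_lock<std::mutex>(mutexForGpu(gpuId));
}

// Usage at either call site (plan serialization or graph capture):
//   auto gpuLock = lockCurrentGpu();
//   // ... buildSerializedNetwork(...) or cudaStreamBeginCapture/EndCapture ...
```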

#endif

auto maskInputDims = gpuHandle->getBufferDynamicShape("InputMask", batchSize);
auto spatialInputDims = gpuHandle->getBufferDynamicShape("InputSpatial", batchSize);
auto globalInputDims = gpuHandle->getBufferDynamicShape("InputGlobal", batchSize);
auto maskInputDims = gpuHandle->getBufferDynamicShape("InputMask", batchSize);
auto spatialInputDims = gpuHandle->getBufferDynamicShape("InputSpatial", batchSize);
auto globalInputDims = gpuHandle->getBufferDynamicShape("InputGlobal", batchSize);

gpuHandle->exec->setInputShape("InputMask", maskInputDims);
gpuHandle->exec->setInputShape("InputSpatial", spatialInputDims);
gpuHandle->exec->setInputShape("InputGlobal", globalInputDims);
gpuHandle->exec->setInputShape("InputMask", maskInputDims);
gpuHandle->exec->setInputShape("InputSpatial", spatialInputDims);
gpuHandle->exec->setInputShape("InputGlobal", globalInputDims);

if(numMetaFeatures > 0) {
auto metaInputDims = gpuHandle->getBufferDynamicShape("InputMeta", batchSize);
gpuHandle->exec->setInputShape("InputMeta", metaInputDims);
}
if(numMetaFeatures > 0) {
auto metaInputDims = gpuHandle->getBufferDynamicShape("InputMeta", batchSize);
gpuHandle->exec->setInputShape("InputMeta", metaInputDims);
}
#ifdef TENSORRT_CUDA_GRAPH
gpuHandle->exec->enqueueV3(cudaStreamPerThread); // Warm up
cudaStreamBeginCapture(
cudaStreamPerThread, cudaStreamCaptureModeThreadLocal); // In case other server thread is also capturing.
#endif
// Transfers from host memory to device memory are asynchronous with respect to the host
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMask"),
inputBuffers->maskInputs,
inputBuffers->singleMaskBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputSpatial"),
inputBuffers->spatialInputs,
inputBuffers->singleInputBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputGlobal"),
inputBuffers->globalInputs,
inputBuffers->singleInputGlobalBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
if(numMetaFeatures > 0) {
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
gpuHandle->getBuffer("InputMeta"),
inputBuffers->metaInputs,
inputBuffers->singleInputMetaBytes * batchSize,
cudaMemcpyHostToDevice,
cudaStreamPerThread));
}

gpuHandle->exec->enqueueV3(cudaStreamPerThread);

CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->policyPassResults.get(),
gpuHandle->getBuffer("OutputPolicyPass"),
inputBuffers->singlePolicyPassResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->policyResults.get(),
gpuHandle->getBuffer("OutputPolicy"),
inputBuffers->singlePolicyResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->valueResults.get(),
gpuHandle->getBuffer("OutputValue"),
inputBuffers->singleValueResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->scoreValueResults.get(),
gpuHandle->getBuffer("OutputScoreValue"),
inputBuffers->singleScoreValueResultBytes * batchSize,
cudaMemcpyDeviceToHost));
CUDA_ERR(
"getOutput",
cudaMemcpy(
inputBuffers->ownershipResults.get(),
gpuHandle->getBuffer("OutputOwnership"),
inputBuffers->singleOwnershipResultBytes * batchSize,
cudaMemcpyDeviceToHost));
gpuHandle->exec->enqueueV3(cudaStreamPerThread);

CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->policyPassResults,
gpuHandle->getBuffer("OutputPolicyPass"),
inputBuffers->singlePolicyPassResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->policyResults,
gpuHandle->getBuffer("OutputPolicy"),
inputBuffers->singlePolicyResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->valueResults,
gpuHandle->getBuffer("OutputValue"),
inputBuffers->singleValueResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->scoreValueResults,
gpuHandle->getBuffer("OutputScoreValue"),
inputBuffers->singleScoreValueResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
CUDA_ERR(
"getOutput",
cudaMemcpyAsync(
inputBuffers->ownershipResults,
gpuHandle->getBuffer("OutputOwnership"),
inputBuffers->singleOwnershipResultBytes * batchSize,
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
#ifdef TENSORRT_CUDA_GRAPH
cudaStreamEndCapture(cudaStreamPerThread, &graph);
cudaGraphInstantiate(&instance, graph, 0);
mutex.unlock();
} // if (instance == nullptr)
cudaGraphLaunch(instance, cudaStreamPerThread);
#endif
cudaStreamSynchronize(cudaStreamPerThread);
gpuHandle->printDebugOutput(batchSize);
gpuHandle->trtErrorRecorder.clear();

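For reference, the control flow added to getOutput above follows the usual CUDA graph capture-and-replay pattern: the first evaluation at a given batch size does a warm-up enqueue, captures the async copies plus enqueueV3 into a graph, and instantiates it; later evaluations only relaunch the instantiated graph. A distilled sketch, not the PR's code (GraphCache and recordWork are illustrative, error checking omitted):

```cpp
#include <cuda_runtime.h>

// Cache of one captured graph and its executable instance, e.g. per batch size.
struct GraphCache {
  cudaGraph_t graph = nullptr;
  cudaGraphExec_t exec = nullptr;
};

void runWithGraph(GraphCache& cache, cudaStream_t stream, void (*recordWork)(cudaStream_t)) {
  if(cache.exec == nullptr) {
    recordWork(stream);  // warm-up run, executed eagerly
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
    recordWork(stream);  // recorded into the graph, not executed
    cudaStreamEndCapture(stream, &cache.graph);
    cudaGraphInstantiate(&cache.exec, cache.graph, 0);  // flags-style call, as in the diff
  }
  cudaGraphLaunch(cache.exec, stream);  // replay: one launch for the whole recorded sequence
  cudaStreamSynchronize(stream);
}
```

Because the captured cudaMemcpyAsync nodes record their pointer arguments, the host buffers must keep fixed addresses across replays, which appears to be why InputBuffers switches from unique_ptr arrays to pinned cudaMallocHost allocations.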