@@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #else
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+        } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params.n_gpu_layers_draft = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     printf("  -ngl N, --n-gpu-layers N\n");
     printf("                        number of layers to store in VRAM\n");
+    printf("  -ngld N, --n-gpu-layers-draft N\n");
+    printf("                        number of layers to store in VRAM for the draft model\n");
     printf("  -ts SPLIT --tensor-split SPLIT\n");
     printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
0 commit comments