Skip to content

Commit 84e7236

Browse files
authored
speculative: add --n-gpu-layers-draft option (#3063)
1 parent b52b29a commit 84e7236

File tree

3 files changed

+15
-0
lines changed

3 files changed

+15
-0
lines changed

common/common.cpp

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
374374
#else
375375
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
376376
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
377+
#endif
378+
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
379+
if (++i >= argc) {
380+
invalid_param = true;
381+
break;
382+
}
383+
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
384+
params.n_gpu_layers_draft = std::stoi(argv[i]);
385+
#else
386+
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
387+
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
377388
#endif
378389
} else if (arg == "--main-gpu" || arg == "-mg") {
379390
if (++i >= argc) {
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
664675
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
665676
printf(" -ngl N, --n-gpu-layers N\n");
666677
printf(" number of layers to store in VRAM\n");
678+
printf(" -ngld N, --n-gpu-layers-draft N\n");
679+
printf(" number of layers to store in VRAM for the draft model\n");
667680
printf(" -ts SPLIT --tensor-split SPLIT\n");
668681
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
669682
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ struct gpt_params {
3838
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
3939
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
4040
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
41+
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
4142
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
4243
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
4344
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.

examples/speculative/speculative.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ int main(int argc, char ** argv) {
4242

4343
// load the draft model
4444
params.model = params.model_draft;
45+
params.n_gpu_layers = params.n_gpu_layers_draft;
4546
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
4647

4748
// tokenize the prompt

0 commit comments

Comments
 (0)