@@ -6076,8 +6076,22 @@ static bool llm_load_tensors(
 #endif
 
     // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(true);
-    //model.buft_input = llama_default_buffer_type_offload(main_gpu);
+    //model.buft_input = llama_default_buffer_type_cpu(true);
+    //
+    // Well, this is not really true when the model uses the same tensor for token embeddings and for output
+    // (e.g., Bitnet, Gemma). If we use the above, then the matrix multiplication with the output tensor runs
+    // on the CPU, which can have quite a significant impact on performance. For instance, for 3B-Bitnet, I get
+    // TG-128 = ~240 t/s on an RTX-4080 with the above, and TG-128 = 320 t/s with the version below.
+    // The issue with just generically putting token embeddings on the GPU is that CUDA supports the GET_ROWS
+    // operation only for F16 and legacy quants, and this leads to a massive drop in performance when token embeddings
+    // are quantized with a k- or i-quant (which is almost always true). The back-end related stuff and offloading
+    // to the GPU has become quite opaque and hard to understand, so for now we fix this just for Bitnet
+    // (where token_embeddings is quantized with Q8_0).
+    if (model.arch == LLM_ARCH_BITNET) {
+        model.buft_input = llama_default_buffer_type_offload(model, main_gpu);
+    } else {
+        model.buft_input = llama_default_buffer_type_cpu(true);
+    }
 
     model.buft_layer.resize(n_layer);
 
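For context, a minimal sketch of how the Bitnet-only special case might be generalized: gate the offload on whether the token-embedding tensor type is one the CUDA backend handles in GET_ROWS (the arch check above sidesteps this because Bitnet's token embeddings are Q8_0). The helper and the tensor-name lookup below are assumptions for illustration, not existing llama.cpp/ggml APIs.

    // Hypothetical generalization of the Bitnet-only check above: offload the
    // input layer only when the token-embedding type is one the CUDA backend
    // supports in GET_ROWS (F32/F16 and the legacy quants), otherwise keep it
    // on the CPU as before. This helper is an assumption, not an existing API.
    static bool cuda_get_rows_supported(ggml_type type) {
        switch (type) {
            case GGML_TYPE_F32:
            case GGML_TYPE_F16:
            case GGML_TYPE_Q4_0:
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q5_0:
            case GGML_TYPE_Q5_1:
            case GGML_TYPE_Q8_0:
                return true;
            default:
                return false; // k- and i-quants would hit the slow path
        }
    }

    // ... and inside llm_load_tensors(), replacing the arch check (sketch only;
    // assumes the token-embedding metadata is reachable through the loader):
    //
    //     const ggml_type tok_embd_type = ml.get_tensor_meta("token_embd.weight")->type;
    //     model.buft_input = cuda_get_rows_supported(tok_embd_type)
    //         ? llama_default_buffer_type_offload(model, main_gpu)
    //         : llama_default_buffer_type_cpu(true);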