llama : support qwen3 rerank and embeddings #14029

Open · wants to merge 10 commits into master

Changes from 3 commits
58 changes: 58 additions & 0 deletions convert_hf_to_gguf.py
@@ -3061,6 +3061,64 @@ def prepare_tensors(self):
class Qwen3Model(Qwen2Model):
    model_arch = gguf.MODEL_ARCH.QWEN3

    # extra logic for rerank models
    token_false_id: int | None = None
    token_true_id: int | None = None
    sep_token_id: int = 0
    is_tied_embeddings: bool = False

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # a bit hacky, but currently the only way to detect if this is a rerank model
        readme_path = self.dir_model / "README.md"
        readme_text = ""
        if readme_path.exists():
            with readme_path.open("r", encoding="utf-8") as f:
                readme_text = f.read()
        if "# Qwen3-Reranker" in readme_text:
            self._find_rerank_config()

    def _find_rerank_config(self):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
        self.sep_token_id = tokenizer.convert_tokens_to_ids("\\n")  # unused, but needed for rerank check
        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
        logger.info(f"gguf: token_false_id = {self.token_false_id}, token_true_id = {self.token_true_id}")
        logger.info(f"gguf: sep_token_id = {self.sep_token_id}")
        logger.info(f"gguf: is_tied_embeddings = {self.is_tied_embeddings}")

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        is_rerank = self.token_false_id is not None and self.token_true_id is not None
        if is_rerank:
            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
            self.gguf_writer.add_sep_token_id(self.sep_token_id)
            self.gguf_writer.add_classifier_output_labels(["yes", "no"])

    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
        # extract "yes" and "no" tokens from the output lm_head tensor
        assert self.token_false_id is not None and self.token_true_id is not None
        false_row = data_torch[self.token_false_id]
        true_row = data_torch[self.token_true_id]
        return torch.stack([true_row, false_row], dim=0)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        is_rerank = self.token_false_id is not None and self.token_true_id is not None

        if is_rerank:
            if self.is_tied_embeddings and "embed_tokens" in name:
                return [
                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", self._get_cls_out_tensor(data_torch)),
                    (self.map_tensor_name(name), data_torch),
                ]
            if not self.is_tied_embeddings and "lm_head" in name:
                # this is the lm_head tensor, we need to extract the cls_out tensor
                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", self._get_cls_out_tensor(data_torch))]

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Qwen3MoeForCausalLM")
class Qwen3MoeModel(Qwen2MoeModel):
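A note on how the exported `cls.output` tensor is laid out and meant to be consumed: `_get_cls_out_tensor()` stacks the lm_head rows as `[true_row, false_row]`, which matches the label order written by `add_classifier_output_labels(["yes", "no"])`, so row 0 of the resulting `[2, n_embd]` matrix corresponds to "yes". The sketch below is illustrative only (not code from this PR); the vocabulary size, hidden size, and token ids are made-up placeholders.

```python
# Minimal numpy sketch of the conversion above and of how a rerank score
# could be recovered from the exported [2, n_embd] cls.output tensor.
# All sizes and token ids are toy placeholders, not values from the model.
import numpy as np

n_vocab, n_embd = 32, 8
lm_head = np.random.randn(n_vocab, n_embd)   # stand-in for the output projection
token_true_id, token_false_id = 5, 3         # placeholder ids for "yes" / "no"

# what _get_cls_out_tensor() produces: rows stacked as [true, false]
cls_out = np.stack([lm_head[token_true_id], lm_head[token_false_id]], axis=0)

# at inference: pooled last-token hidden state -> two logits -> softmax -> P("yes")
h_last = np.random.randn(n_embd)             # stand-in for the pooled hidden state
logits = cls_out @ h_last                    # [2] = (yes_logit, no_logit)
probs = np.exp(logits - logits.max())
probs /= probs.sum()
score = float(probs[0])                      # relevance score = P("yes")
```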
1 change: 1 addition & 0 deletions src/llama-arch.cpp
@@ -629,6 +629,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
        { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
        { LLM_TENSOR_OUTPUT,      "output" },
        { LLM_TENSOR_CLS_OUT,     "cls.output" }, // rerank
        { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
        { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
        { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
13 changes: 9 additions & 4 deletions src/llama-graph.cpp
@@ -1577,10 +1577,15 @@ void llm_graph_context::build_pooling(
                 cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
             }
         } else if (cls_out) {
-            // Single layer classification head (direct projection)
-            // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-            GGML_ASSERT(cls_out_b != nullptr);
-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+            if (arch == LLM_ARCH_QWEN3) {
+                cur = ggml_mul_mat(ctx0, cls_out, inp);
+                cur = ggml_soft_max(ctx0, cur); // qwen3 uses softmax on the output

Collaborator Author (ngxson):
@ggerganov I think there is a bug with build_inp_cls(). It is supposed to contain only the indices of the output tokens (the last token), but in this case it actually contains all tokens. This makes the output score incorrect at the moment, since it returns the score for the first token. WDYT?

Member (ggerganov):
I think you can make a quick fix for now like this, similar to build_bert():

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index afef84870..8b11197df 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7043,7 +7043,7 @@ struct llm_build_qwen3 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
+            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
                 // skip computing output for unused tokens
                 ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);

Collaborator Author (ngxson):
No, that still doesn't work as I expected.

For example, if my sequence has only one output token, then I expect the inp tensor here to have shape [n_embd, 1], but in reality it has shape [n_embd, n_tokens].

Maybe I misunderstood something here?
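To make the shape complaint concrete, here is a toy illustration (sizes are made up): with last-token pooling the classification head should see one column for the sequence, but if all token positions are passed through, a head that reads the first column ends up scoring the first token.

```python
# Toy illustration of the shape mismatch described above (made-up sizes).
import numpy as np

n_embd, n_tokens = 8, 5
hidden = np.random.randn(n_embd, n_tokens)   # per-token embeddings, one sequence

inp_expected = hidden[:, [n_tokens - 1]]     # last token only -> shape (8, 1)
inp_observed = hidden                        # all tokens      -> shape (8, 5)

print(inp_expected.shape, inp_observed.shape)  # (8, 1) (8, 5)
```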

Collaborator Author (ngxson):
Hmm, OK, I think I got it. The main problem is that Qwen's rerank model uses causal attention; it's simply a normal next-token-prediction model that outputs either a "yes" or a "no" token.

I think the assumption in llama.cpp is that CLS and RANK are non-causal, hence only the first token is marked as output.

Not sure what's the best way to support this, though.
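For reference, the scoring scheme described here (a causal LM whose last-position logits for "yes"/"no" are turned into a probability) looks roughly like the sketch below. This is a paraphrase of the commonly shown Qwen3-Reranker usage pattern, not code from this PR, so treat the details as approximate.

```python
# Approximate sketch of causal-LM reranking: take the logits at the last
# position and softmax over the ("no", "yes") pair to get a relevance score.
import torch

@torch.no_grad()
def rerank_score(model, input_ids, attention_mask, token_true_id, token_false_id):
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    last = logits[:, -1, :]                       # [batch, n_vocab] at the last position
    pair = torch.stack([last[:, token_false_id],  # "no"
                        last[:, token_true_id]],  # "yes"
                       dim=1)                     # [batch, 2]
    return torch.softmax(pair, dim=1)[:, 1]       # P("yes") per query/document pair
```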

Collaborator Author (@ngxson, Jun 6, 2025):
OK, I found a hack around this: for Qwen3, I force the position to last (only the position, not the pooling) in 030dc3b.

Probably we should separate the notions of "pooling" and "output position" in the future.

Member (ggerganov):
> I think the assumption in llama.cpp is that CLS and RANK are non-causal, hence only the first token is marked as output

The idea is that the llm_build_ functions will compute the embeddings for all tokens in the batch. The notion of "output ids" is purely an optimization trick to avoid unnecessary computation in the last layer, and when doing any kind of pooling it should generally be disabled.

For Qwen3 rerank, what you seem to need is to pool using last and apply the classification head on the result - the latter is missing, so it has to be added. We just haven't encountered models with pooling last and a classification head at the same time.

And it seems we should remove LLAMA_POOLING_TYPE_RANK - it's a bit redundant. Instead CLS and LAST should do the same thing - i.e. apply a classification head if there is one.
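A rough sketch of the flow being described, i.e. pool with last and then apply the classification head, which is what the LLM_ARCH_QWEN3 branch added to build_pooling() would compute once the pooled input has one column per sequence (toy shapes, purely illustrative, not the ggml implementation):

```python
# Toy sketch: last-token pooling followed by the 2-row classification head.
import numpy as np

def rank_pool_last(hidden, last_idx, cls_out):
    # hidden:   [n_embd, n_tokens] embeddings for all tokens in the batch
    # last_idx: index of the last token of each sequence
    # cls_out:  [2, n_embd] rows for ("yes", "no")
    pooled = hidden[:, last_idx]                 # [n_embd, n_seqs]  <- pooling "last"
    logits = cls_out @ pooled                    # [2, n_seqs]       <- classification head
    e = np.exp(logits - logits.max(axis=0, keepdims=True))
    probs = e / e.sum(axis=0, keepdims=True)     # softmax, as in the qwen3 branch
    return probs[0]                              # P("yes") per sequence

hidden = np.random.randn(8, 10)                  # 10 tokens from two sequences
scores = rank_pool_last(hidden, last_idx=[4, 9], cls_out=np.random.randn(2, 8))
```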

Collaborator Author (ngxson):
Hmm, OK, I got it. The problem is that I don't have much time for the rest of the day. Do you think we can clean this up in a follow-up PR?

> And it seems we should remove LLAMA_POOLING_TYPE_RANK - it's a bit redundant. Instead CLS and LAST should do the same thing - i.e. apply a classification head if there is one.

I think having the notion of LLAMA_TASK_* would be useful. For example, pooling CLS can be used for both the CLS and RANK task types. This could also be useful for blocking certain endpoints; for example, a rerank model should only support /rerank and not /embeddings or /completion.
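Purely to illustrate the proposal (this enum does not exist in llama.cpp; all names below are hypothetical):

```python
# Hypothetical sketch of a task notion gating server endpoints.
from enum import Enum, auto

class LlamaTask(Enum):      # hypothetical, not an existing llama.cpp type
    COMPLETION = auto()
    EMBEDDING = auto()
    RERANK = auto()

# pooling CLS could serve either the embedding (CLS) or RANK task,
# while a rerank model would advertise only /rerank
ALLOWED_ENDPOINTS = {
    LlamaTask.COMPLETION: {"/completion"},
    LlamaTask.EMBEDDING: {"/embeddings"},
    LlamaTask.RERANK: {"/rerank"},
}
```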

Member (ggerganov):
Think it's better to take the time and make it right, no need to merge it now.

+            } else {
+                // Single layer classification head (direct projection)
+                // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
+                GGML_ASSERT(cls_out_b != nullptr);
+                cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+            }
         } else {
             GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
         }
Expand Down
9 changes: 9 additions & 0 deletions src/llama-model.cpp
@@ -819,7 +819,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_QWEN3:
            {
                // default for embeddings, will be overwritten if model is rerank
                hparams.pooling_type = LLAMA_POOLING_TYPE_LAST;

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
                ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);

                switch (hparams.n_layer) {
                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
@@ -2463,6 +2469,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                // output rerank
                cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);

                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);