
Commit c02f53d

add rerank prompt
1 parent 8edd2cf commit c02f53d

6 files changed, +52 additions and −20 deletions


common/common.cpp

Lines changed: 5 additions & 1 deletion
@@ -907,8 +907,12 @@ struct common_init_result common_init_from_params(common_params & params) {
 
         bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
         bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank_prefix") != NULL ||
+                                 llama_model_chat_template(model, "rerank_suffix") != NULL;
 
-        if (!has_eos && !has_sep) {
+        if (has_rerank_prompt) {
+            // OK, do nothing
+        } else if (!has_eos && !has_sep) {
             LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
             ok = false;
         } else if (!has_eos) {
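
For illustration, here is a minimal sketch (a hypothetical helper, not part of this commit) of how the new detection behaves at load time: a model that ships rerank_prefix/rerank_suffix templates passes the reranking check even if its vocab has neither an EOS nor a SEP token.

// Sketch only: mirrors the check added above; assumes a loaded `model`.
static bool reranking_supported(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool has_rerank_prompt =
        llama_model_chat_template(model, "rerank_prefix") != NULL ||
        llama_model_chat_template(model, "rerank_suffix") != NULL;

    // Prompt-based rerankers (e.g. Qwen3-Reranker) need no EOS/SEP token.
    if (has_rerank_prompt) {
        return true;
    }
    // Token-based rerankers still need at least one of EOS / SEP.
    return llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL ||
           llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
}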

convert_hf_to_gguf.py

Lines changed: 8 additions & 2 deletions
@@ -3073,6 +3073,7 @@ class Qwen3Model(Qwen2Model):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # a bit hacky, but currently the only way to detect if this is a rerank model
+        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
         readme_path = self.dir_model / "README.md"
         readme_text = ""
         if readme_path.exists():
@@ -3086,7 +3087,6 @@ def _find_rerank_config(self):
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         self.token_false_id = tokenizer.convert_tokens_to_ids("no")
         self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
-        self.sep_token_id = tokenizer.convert_tokens_to_ids("\\n")  # unused, but needed for rerank check
         self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
         logger.info(f"gguf: token_false_id = {self.token_false_id}, token_true_id = {self.token_true_id}")
         logger.info(f"gguf: sep_token_id = {self.sep_token_id}")
@@ -3097,8 +3097,14 @@ def set_gguf_parameters(self):
         is_rerank = self.token_false_id is not None and self.token_true_id is not None
         if is_rerank:
             self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
-            self.gguf_writer.add_sep_token_id(self.sep_token_id)
             self.gguf_writer.add_classifier_output_labels(["yes", "no"])
+            self.gguf_writer.add_chat_template([{
+                "name": "rerank_prefix",
+                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n",
+            }, {
+                "name": "rerank_suffix",
+                "template": "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
+            }])
 
     def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
         # extract "yes" and "no" tokens from the output lm_head tensor
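
Assuming the gguf-py writer stores each named entry under its own metadata key (an assumption, though it matches the lookup added in src/llama-arch.cpp and src/llama-model.cpp below), the converted Qwen3-Reranker GGUF would carry keys along these lines (prefix value truncated here for readability):

tokenizer.chat_template.rerank_prefix = "<|im_start|>system\nJudge whether the Document meets the requirements ... <|im_end|>\n<|im_start|>user\n"
tokenizer.chat_template.rerank_suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"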

src/llama-arch.cpp

Lines changed: 1 addition & 1 deletion
@@ -200,7 +200,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,          "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV,             "tokenizer.rwkv.world" },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,    "tokenizer.chat_template" },
-    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,  "tokenizer.chat_template.%s" },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,  "tokenizer.chat_template." }, // FIXME: cannot add %s because it will be replaced by arch name
     { LLM_KV_TOKENIZER_FIM_PRE_ID,       "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,       "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID,       "tokenizer.ggml.fim_mid_token_id" },

src/llama-model.cpp

Lines changed: 2 additions & 1 deletion
@@ -13793,7 +13793,8 @@ uint64_t llama_model_size(const llama_model * model) {
 }
 
 const char * llama_model_chat_template(const llama_model * model, const char * name) {
-    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
+    const auto key = name
+        ? LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N) + std::string(name)
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
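
With this change, the lookup key is built by appending the requested name to the plain "tokenizer.chat_template." prefix; the FIXME in src/llama-arch.cpp exists because LLM_KV substitutes the arch name into any %s in the key string. A usage sketch under that assumption, given a model converted with the templates above:

// Sketch only: retrieving the named rerank templates from a loaded `model`.
const char * prefix = llama_model_chat_template(model, "rerank_prefix"); // key: tokenizer.chat_template.rerank_prefix
const char * suffix = llama_model_chat_template(model, "rerank_suffix"); // key: tokenizer.chat_template.rerank_suffix
const char * chat   = llama_model_chat_template(model, /* name */ NULL); // key: tokenizer.chat_template (default)
// Each call returns NULL when the corresponding key is absent from the GGUF metadata.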

tools/server/server.cpp

Lines changed: 1 addition & 1 deletion
@@ -4715,7 +4715,7 @@ int main(int argc, char ** argv) {
         auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
         tasks.reserve(tokenized_docs.size());
         for (size_t i = 0; i < tokenized_docs.size(); i++) {
-            auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+            auto tmp = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]);
             server_task task = server_task(SERVER_TASK_TYPE_RERANK);
             task.id = ctx_server.queue_tasks.get_new_id();
             task.index = i;

tools/server/utils.hpp

Lines changed: 35 additions & 14 deletions
@@ -260,23 +260,44 @@ static size_t validate_utf8(const std::string& text) {
 // template utils
 //
 
-// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
-static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
+// format rerank task:
+// - using SEP token: [BOS]query[EOS][SEP]doc[EOS]
+// - using prompt: <rerank_prefix>query<rerank_suffix>doc
+static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
     llama_tokens result;
 
-    // Get EOS token - use SEP token as fallback if EOS is not available
-    llama_token eos_token = llama_vocab_eos(vocab);
-    if (eos_token == LLAMA_TOKEN_NULL) {
-        eos_token = llama_vocab_sep(vocab);
-    }
+    if (llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL) {
+        // Get EOS token - use SEP token as fallback if EOS is not available
+        llama_token eos_token = llama_vocab_eos(vocab);
+        if (eos_token == LLAMA_TOKEN_NULL) {
+            eos_token = llama_vocab_sep(vocab);
+        }
+
+        result.reserve(doc.size() + query.size() + 4);
+        result.push_back(llama_vocab_bos(vocab));
+        result.insert(result.end(), query.begin(), query.end());
+        result.push_back(eos_token);
+        result.push_back(llama_vocab_sep(vocab));
+        result.insert(result.end(), doc.begin(), doc.end());
+        result.push_back(eos_token);
+    } else {
+        // using prompt template
+        const char * prefix = llama_model_chat_template(model, "rerank_prefix");
+        const char * suffix = llama_model_chat_template(model, "rerank_suffix");
+
+        if (prefix == NULL && suffix == NULL) {
+            throw std::runtime_error("Rerank prompt template not found in the model\n");
+        }
 
-    result.reserve(doc.size() + query.size() + 4);
-    result.push_back(llama_vocab_bos(vocab));
-    result.insert(result.end(), query.begin(), query.end());
-    result.push_back(eos_token);
-    result.push_back(llama_vocab_sep(vocab));
-    result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(eos_token);
+        const llama_tokens prefix_tokens = prefix ? common_tokenize(vocab, prefix, true, false) : llama_tokens();
+        const llama_tokens suffix_tokens = suffix ? common_tokenize(vocab, suffix, false, false) : llama_tokens();
+        result.reserve(prefix_tokens.size() + query.size() + suffix_tokens.size() + doc.size());
+        result.insert(result.end(), prefix_tokens.begin(), prefix_tokens.end());
+        result.insert(result.end(), query.begin(), query.end());
+        result.insert(result.end(), suffix_tokens.begin(), suffix_tokens.end());
+        result.insert(result.end(), doc.begin(), doc.end());
+    }
 
     return result;
 }
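
To make the prompt-template path concrete, here is a rough sketch of the text that format_rerank assembles for a Qwen3-Reranker model. It does plain string concatenation rather than the token-level code above, and the query/document strings are made-up examples:

// Sketch only: shows the <rerank_prefix>query<rerank_suffix>doc layout produced above.
#include <iostream>
#include <string>

int main() {
    // Prefix/suffix as written by convert_hf_to_gguf.py for Qwen3-Reranker.
    const std::string prefix =
        "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query "
        "and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
        "<|im_start|>user\n";
    const std::string suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";

    // Hypothetical query/document pair.
    const std::string query = "what is a panda?";
    const std::string doc   = "The giant panda is a bear species endemic to China.";

    // format_rerank (prompt path) concatenates: prefix, query, suffix, doc.
    std::cout << prefix << query << suffix << doc << std::endl;
    return 0;
}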
