correct output token position

ngxson · ngxson · commit 030dc3b09ca1 · 2025-06-06T10:48:19.000+02:00
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -167,9 +167,15 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
 }
 
 void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
-    if (cparams.embeddings && (
-                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
-                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
+    if (!cparams.embeddings) {
+        return;
+    }
+
+    const bool is_last_tok = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+                             arch == LLM_ARCH_QWEN3; // qwen3 reranking & embedding models use last token
+
+    if (is_last_tok) {
+        // set output to the last token of each sequence
         const int64_t n_tokens     = ubatch->n_tokens;
         const int64_t n_seq_tokens = ubatch->n_seq_tokens;
         const int64_t n_seqs       = ubatch->n_seqs;
@@ -180,23 +186,33 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
         uint32_t * data = (uint32_t *) cls->data;
         memset(cls->data, 0, n_tokens * ggml_element_size(cls));
 
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
         for (int s = 0; s < n_seqs; ++s) {
             const llama_seq_id seq_id = ubatch->seq_id[s][0];
 
             // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
 
             for (int i = 0; i < n_seq_tokens; ++i) {
                 const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
 
-                if (pos == 0) {
-                    data[seq_id] = s*n_seq_tokens + i;
+                if (pos >= last_pos[seq_id]) {
+                    last_pos[seq_id] = pos;
+                    last_row[seq_id] = s*n_seq_tokens + i;
                 }
             }
         }
-    }
 
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+
+    } else {
+        // set output to first token of each sequence
         const int64_t n_tokens     = ubatch->n_tokens;
         const int64_t n_seq_tokens = ubatch->n_seq_tokens;
         const int64_t n_seqs       = ubatch->n_seqs;
@@ -207,30 +223,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
         uint32_t * data = (uint32_t *) cls->data;
         memset(cls->data, 0, n_tokens * ggml_element_size(cls));
 
-        std::vector<int> last_pos(n_tokens, -1);
-        std::vector<int> last_row(n_tokens, -1);
-
         for (int s = 0; s < n_seqs; ++s) {
             const llama_seq_id seq_id = ubatch->seq_id[s][0];
 
             // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
 
             for (int i = 0; i < n_seq_tokens; ++i) {
                 const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
 
-                if (pos >= last_pos[seq_id]) {
-                    last_pos[seq_id] = pos;
-                    last_row[seq_id] = s*n_seq_tokens + i;
+                if (pos == 0) {
+                    data[seq_id] = s*n_seq_tokens + i;
                 }
             }
         }
-
-        for (int i = 0; i < n_tokens; ++i) {
-            if (last_row[i] >= 0) {
-                data[i] = last_row[i];
-            }
-        }
     }
 }
 
@@ -943,7 +949,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_cls() const {
-    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
+    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
 
     auto & cur = inp->cls;
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
@@ -177,13 +177,14 @@ class llm_graph_input_mean : public llm_graph_input_i {
 
 class llm_graph_input_cls : public llm_graph_input_i {
 public:
-    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : arch(arch), cparams(cparams) {}
     virtual ~llm_graph_input_cls() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * cls; // I32 [n_batch]
 
+    const llm_arch arch;
     const llama_cparams & cparams;
 };
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -7052,7 +7052,7 @@ struct llm_build_qwen3 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
+            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
                 // skip computing output for unused tokens
                 ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);

Original file line number	Diff line number	Diff line change
`@@ -7052,7 +7052,7 @@ struct llm_build_qwen3 : public llm_graph_context {`
`7052`	`7052`	`Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);`
`7053`	`7053`	`}`
`7054`	`7054`
`7055`		`- if (il == n_layer - 1) {`
	`7055`	`+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {`
`7056`	`7056`	`// skip computing output for unused tokens`
`7057`	`7057`	`ggml_tensor * inp_out_ids = build_inp_out_ids();`
`7058`	`7058`	`cur = ggml_get_rows(ctx0, cur, inp_out_ids);`