Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6559,7 +6559,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
name = name[:-5] + ".bias"

# we are only using BERT for embeddings so we don't need the pooling layer
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
if name == "embeddings.position_ids":
return # we don't need these

if name.startswith("cls.predictions"):
Expand All @@ -6576,6 +6576,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if name == "classifier.bias":
name = "classifier.out_proj.bias"

if name == "pooler.dense.weight":
name = "classifier.weight"

if name == "pooler.dense.bias":
name = "classifier.bias"

yield from super().modify_tensors(data_torch, name, bid)

def _xlmroberta_tokenizer_init(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions examples/embedding/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
64 changes: 43 additions & 21 deletions examples/embedding/embedding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
#include "common.h"
#include "log.h"
#include "llama.h"

#include <src/llama-arch.h>
#include <src/llama-model.h>
#include <clocale>
#include <ctime>
#include <algorithm>

#if defined(_MSC_VER)
Expand Down Expand Up @@ -169,13 +169,12 @@ int main(int argc, char ** argv) {

// split the prompt into lines
std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
int32_t token_type_offset = llama_vocab_n_tokens(vocab);

// max batch size
const uint64_t n_batch = params.n_batch;

// get added sep and eos token, if any
const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
const char * rerank_prompt = llama_model_chat_template(model, "rerank");

// tokenize the prompts and trim
Expand All @@ -186,27 +185,50 @@ int main(int argc, char ** argv) {
// split classification pairs and insert expected separator tokens
if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
const std::string query = pairs[0];
const std::string doc = pairs[1];
if (rerank_prompt != nullptr) {
const std::string query = pairs[0];
const std::string doc = pairs[1];
std::string final_prompt = rerank_prompt;
string_replace_all(final_prompt, "{query}" , query);
string_replace_all(final_prompt, "{document}", doc );
inp = common_tokenize(vocab, final_prompt, true, true);
size_t pos = final_prompt.find("{document}");
std::string query_prompt = final_prompt.substr(0, pos);
std::string doc_prompt = final_prompt.substr(pos);
string_replace_all(query_prompt, "{query}" , query);
string_replace_all(doc_prompt, "{document}", doc );

auto inp_q= common_tokenize(vocab, query_prompt, false, true);
auto inp_d= common_tokenize(vocab, doc_prompt, false, true);

for(auto token: inp_q){
inp.emplace_back(token);
}
for(auto token: inp_d){
inp.emplace_back(model->arch == LLM_ARCH_BERT ? token + token_type_offset : token );
}
} else {
std::string final_prompt;
for (size_t i = 0; i < pairs.size(); i++) {
final_prompt += pairs[i];
if (i != pairs.size() - 1) {
if (!added_eos_token.empty()) {
final_prompt += added_eos_token;
}
if (!added_sep_token.empty()) {
final_prompt += added_sep_token;
}
}
llama_token eos_token = llama_vocab_eos(vocab);
if (eos_token == LLAMA_TOKEN_NULL) {
eos_token = llama_vocab_sep(vocab);
}

auto inp_q= common_tokenize(vocab, query, false, false);
auto inp_d= common_tokenize(vocab, doc, false, false);
if (llama_vocab_get_add_bos(vocab)) {
inp.emplace_back(llama_vocab_bos(vocab)); //add bos
}
inp.insert(inp.end(), inp_q.begin(), inp_q.end());//add seq A
if (llama_vocab_get_add_eos(vocab)) {
inp.emplace_back(eos_token); //add eos
}
if (llama_vocab_get_add_sep(vocab)) {
inp.emplace_back(llama_vocab_sep(vocab)); //add sep
}
for(auto token: inp_d){ //add seq B
inp.emplace_back(model->arch == LLM_ARCH_BERT ? token + token_type_offset : token);

}
if (llama_vocab_get_add_eos(vocab)) {
inp.emplace_back(model->arch == LLM_ARCH_BERT ? eos_token + token_type_offset : eos_token); //add eos
}
inp = common_tokenize(ctx, final_prompt, true, true);
}
} else {
inp = common_tokenize(ctx, prompt, true, true);
Expand Down
25 changes: 20 additions & 5 deletions src/llama-batch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,12 @@ bool llama_batch_allocr::init(
return false;
}

int32_t vocab_size = vocab.n_tokens();
if (batch.token) {
for (int32_t i = 0; i < batch.n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return false;
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab_size) {
LLAMA_LOG_WARN("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
break;
}
}
}
Expand All @@ -69,6 +70,12 @@ bool llama_batch_allocr::init(
//
// auto-generate missing fields
//
token_type_ids.resize(batch.n_tokens);
for (int32_t i = 0; i < batch.n_tokens; ++i) {
int32_t token_type = batch.token[i] / vocab_size;
batch.token[i] = batch.token[i] - token_type * vocab_size;
token_type_ids[i] = token_type;
}

if (!batch.n_seq_id) {
n_seq_id.resize(batch.n_tokens);
Expand Down Expand Up @@ -219,6 +226,7 @@ bool llama_batch_allocr::init(
/*.token =*/ batch.token,
/*.embd =*/ batch.embd,
/*.pos =*/ batch.pos,
/*.token_type =*/ token_type_ids.data(),
/*.n_seq_id =*/ batch.n_seq_id,
/*.seq_id =*/ batch.seq_id,
/*.seq_id_unq =*/ this->seq_id_unq.data(),
Expand Down Expand Up @@ -401,6 +409,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
udata->token .resize(n_tokens);
udata->embd .clear();
udata->pos .resize(n_pos_all);
udata->token_type.resize(n_tokens);
udata->n_seq_id .resize(n_tokens);
udata->seq_id .resize(n_tokens);
udata->seq_id_unq.resize(0);
Expand All @@ -423,6 +432,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
/*.token =*/ udata->token.data(),
/*.embd =*/ nullptr,
/*.pos =*/ udata->pos.data(),
/*.token_type =*/ udata->token_type.data(),
/*.n_seq_id =*/ udata->n_seq_id.data(),
/*.seq_id =*/ udata->seq_id.data(),
/*.seq_id_unq =*/ udata->seq_id_unq.data(),
Expand Down Expand Up @@ -658,6 +668,7 @@ void llama_batch_allocr::clear() {
batch = {};

pos .clear();
token_type_ids.clear();
n_seq_id .clear();
seq_id .clear();
seq_id_unq.clear();
Expand Down Expand Up @@ -691,6 +702,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->token .resize(n_tokens);
udata->embd .resize(n_embd_all);
udata->pos .resize(n_pos_all);
udata->token_type.resize(n_tokens);
udata->n_seq_id .resize(n_tokens);
udata->seq_id .resize(n_tokens);
udata->seq_id_unq.resize(0);
Expand Down Expand Up @@ -719,6 +731,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
}

udata->token_type[i] = token_type_ids[idxs[i]];
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
udata->output[i] = batch.logits[idxs[i]];

Expand Down Expand Up @@ -758,6 +771,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
/*.token =*/ batch.token ? udata->token.data() : nullptr,
/*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
/*.pos =*/ udata->pos.data(),
/*token_type =*/ udata->token_type.data(),
/*.n_seq_id =*/ udata->n_seq_id.data(),
/*.seq_id =*/ udata->seq_id.data(),
/*.seq_id_unq =*/ udata->seq_id_unq.data(),
Expand Down Expand Up @@ -807,6 +821,7 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) ubatch.token);
LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) ubatch.embd);
LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) ubatch.pos);
LLAMA_LOG_DEBUG("%s: token_type = %p\n", __func__, (void *) ubatch.token_type);
LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) ubatch.n_seq_id);
LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) ubatch.seq_id);
LLAMA_LOG_DEBUG("%s: seq_id_unq = %s\n", __func__, ss_seq_id_unq.str().c_str());
Expand Down Expand Up @@ -843,9 +858,9 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
}

if (ubatch.token) {
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, token_type = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
__func__, i, ubatch.token[i], vocab->token_to_piece(ubatch.token[i]).c_str(),
ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
ubatch.pos[i], ubatch.token_type[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
} else {
LLAMA_LOG_DEBUG("%s: %4d: [embd], pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
__func__, i, ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
Expand Down
3 changes: 3 additions & 0 deletions src/llama-batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct llama_ubatch {
llama_token * token; // [n_tokens] | i | id, token
float * embd; // [n_embd, n_tokens] | i | embd
llama_pos * pos; // [n_tokens*n_pos] | i | pos
int32_t * token_type; // [n_tokens] | i | token_type
int32_t * n_seq_id; // [n_tokens] | i | -
llama_seq_id ** seq_id; // [n_tokens] | s | s0, s1, seq_id
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
Expand All @@ -55,6 +56,7 @@ struct llama_ubatch {
std::vector<llama_token> token;
std::vector<float> embd;
std::vector<llama_pos> pos;
std::vector<int32_t> token_type;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
std::vector<llama_seq_id> seq_id_unq;
Expand Down Expand Up @@ -139,6 +141,7 @@ class llama_batch_allocr {
std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id

std::vector<llama_pos> pos;
std::vector<int32_t> token_type_ids;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id;
std::vector<llama_seq_id> seq_id_unq;
Expand Down
20 changes: 20 additions & 0 deletions src/llama-graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,17 @@ bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
return res;
}

void llm_graph_input_token_type::set_input(const llama_ubatch * ubatch) {
    // upload the per-token type ids (e.g. BERT segment ids) into the input tensor
    if (!ubatch->token_type || !type) {
        return; // no type ids in this ubatch, or the graph has no token-type input
    }

    const int64_t n_tokens = ubatch->n_tokens;

    ggml_backend_tensor_set(type, ubatch->token_type, 0, n_tokens*ggml_element_size(type));
}

bool llm_graph_input_token_type::can_reuse(const llm_graph_params & params) {
    // the input tensor can be reused only if the batch size is unchanged
    const bool res = type->ne[0] == params.ubatch.n_tokens;

    return res;
}

void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;
Expand Down Expand Up @@ -1719,6 +1730,15 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
return cur;
}

ggml_tensor * llm_graph_context::build_inp_token_type() const {
    // create the I32 [n_tokens] input tensor holding per-token type ids
    auto inp = std::make_unique<llm_graph_input_token_type>();

    ggml_tensor * cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    ggml_set_input(cur);

    inp->type = cur;

    res->add_input(std::move(inp));

    return cur;
}

ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);

Expand Down
13 changes: 13 additions & 0 deletions src/llama-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,18 @@ class llm_graph_input_pos : public llm_graph_input_i {
const uint32_t n_pos_per_embd = 1;
};

// per-token type ids (e.g. BERT segment/sentence A-B ids), used to select rows
// from the model's type embedding table
class llm_graph_input_token_type : public llm_graph_input_i {
public:
    llm_graph_input_token_type() = default;
    virtual ~llm_graph_input_token_type() = default;

    // copy ubatch->token_type into the input tensor
    void set_input(const llama_ubatch * ubatch) override;

    // reusable only while the batch size matches the tensor's first dimension
    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * type = nullptr; // I32 [n_batch]
};

// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
Expand Down Expand Up @@ -861,6 +873,7 @@ struct llm_graph_context {

ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
ggml_tensor * build_inp_pos() const;
ggml_tensor * build_inp_token_type() const;
ggml_tensor * build_inp_attn_scale() const;
ggml_tensor * build_inp_out_ids() const;
ggml_tensor * build_inp_mean() const;
Expand Down
6 changes: 3 additions & 3 deletions src/models/bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = nullptr;
ggml_tensor * inp_token_type;

if (model.arch != LLM_ARCH_JINA_BERT_V2) {
inp_pos = build_inp_pos();
Expand All @@ -17,10 +18,9 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
// construct input embeddings (token, type, position)
inpL = build_inp_embd(model.tok_embd);

// token types are hardcoded to zero ("Sentence A")
if (model.type_embd) {
ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
inp_token_type = build_inp_token_type();
inpL = ggml_add(ctx0, inpL, ggml_get_rows(ctx0, model.type_embd, inp_token_type));
}
if (model.arch == LLM_ARCH_BERT) {
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
Expand Down
Loading