diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f6441b8d266..5b0060cfb0d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6559,7 +6559,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name[:-5] + ".bias" # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + if name == "embeddings.position_ids": return # we don't need these if name.startswith("cls.predictions"): @@ -6576,6 +6576,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "classifier.bias": name = "classifier.out_proj.bias" + if name == "pooler.dense.weight": + name = "classifier.weight" + + if name == "pooler.dense.bias": + name = "classifier.bias" + yield from super().modify_tensors(data_torch, name, bid) def _xlmroberta_tokenizer_init(self) -> None: diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 809040307d2..523d201a4b1 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,4 +2,5 @@ set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index f6a20ef9d07..2a2a52f771c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -2,9 +2,9 @@ #include "common.h" #include "log.h" #include "llama.h" - +#include +#include #include -#include #include #if defined(_MSC_VER) @@ -169,13 +169,12 @@ int main(int argc, char ** argv) { // split the prompt into lines std::vector prompts = split_lines(params.prompt, params.embd_sep); + int32_t token_type_offset = 
llama_vocab_n_tokens(vocab); // max batch size const uint64_t n_batch = params.n_batch; // get added sep and eos token, if any - const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : ""; - const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : ""; const char * rerank_prompt = llama_model_chat_template(model, "rerank"); // tokenize the prompts and trim @@ -186,27 +185,50 @@ int main(int argc, char ** argv) { // split classification pairs and insert expected separator tokens if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) { std::vector pairs = split_lines(prompt, params.cls_sep); + const std::string query = pairs[0]; + const std::string doc = pairs[1]; if (rerank_prompt != nullptr) { - const std::string query = pairs[0]; - const std::string doc = pairs[1]; std::string final_prompt = rerank_prompt; - string_replace_all(final_prompt, "{query}" , query); - string_replace_all(final_prompt, "{document}", doc ); - inp = common_tokenize(vocab, final_prompt, true, true); + size_t pos = final_prompt.find("{document}"); + std::string query_prompt = final_prompt.substr(0, pos); + std::string doc_prompt = pos == std::string::npos ? std::string() : final_prompt.substr(pos); /* a template without {document} must not reach substr(npos), which throws std::out_of_range */ + string_replace_all(query_prompt, "{query}" , query); + string_replace_all(doc_prompt, "{document}", doc ); + + auto inp_q= common_tokenize(vocab, query_prompt, false, true); + auto inp_d= common_tokenize(vocab, doc_prompt, false, true); + + for(auto token: inp_q){ + inp.emplace_back(token); + } + for(auto token: inp_d){ + inp.emplace_back(model->arch == LLM_ARCH_BERT ? 
token + token_type_offset : token ); + } } else { - std::string final_prompt; - for (size_t i = 0; i < pairs.size(); i++) { - final_prompt += pairs[i]; - if (i != pairs.size() - 1) { - if (!added_eos_token.empty()) { - final_prompt += added_eos_token; - } - if (!added_sep_token.empty()) { - final_prompt += added_sep_token; - } - } + llama_token eos_token = llama_vocab_eos(vocab); + if (eos_token == LLAMA_TOKEN_NULL) { + eos_token = llama_vocab_sep(vocab); + } + + auto inp_q= common_tokenize(vocab, query, false, false); + auto inp_d= common_tokenize(vocab, doc, false, false); + if (llama_vocab_get_add_bos(vocab)) { + inp.emplace_back(llama_vocab_bos(vocab)); //add bos + } + inp.insert(inp.end(), inp_q.begin(), inp_q.end());//add seq A + if (llama_vocab_get_add_eos(vocab)) { + inp.emplace_back(eos_token); //add eos + } + if (llama_vocab_get_add_sep(vocab)) { + inp.emplace_back(llama_vocab_sep(vocab)); //add sep + } + for(auto token: inp_d){ //add seq B + inp.emplace_back(model->arch == LLM_ARCH_BERT ? token + token_type_offset : token); + + } + if (llama_vocab_get_add_eos(vocab)) { + inp.emplace_back(model->arch == LLM_ARCH_BERT ? 
eos_token + token_type_offset : eos_token); //add eos } - inp = common_tokenize(ctx, final_prompt, true, true); } } else { inp = common_tokenize(ctx, prompt, true, true); diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 6bf76939cdd..0695630d1a4 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -46,11 +46,12 @@ bool llama_batch_allocr::init( return false; } + const int32_t vocab_size = vocab.n_tokens(); if (batch.token) { for (int32_t i = 0; i < batch.n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return false; + if (batch.token[i] < 0) { /* negative ids are never valid; ids >= vocab_size encode token + token_type*vocab_size and are decoded below */ + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return false; } } } @@ -69,6 +70,12 @@ bool llama_batch_allocr::init( // // auto-generate missing fields // + token_type_ids.resize(batch.n_tokens); + for (int32_t i = 0; batch.token && i < batch.n_tokens; ++i) { /* batch.token is null for embedding-only batches; their token types stay 0 */ + int32_t token_type = batch.token[i] / vocab_size; + batch.token[i] = batch.token[i] - token_type * vocab_size; + token_type_ids[i] = token_type; + } if (!batch.n_seq_id) { n_seq_id.resize(batch.n_tokens); @@ -219,6 +226,7 @@ bool llama_batch_allocr::init( /*.token =*/ batch.token, /*.embd =*/ batch.embd, /*.pos =*/ batch.pos, + /*.token_type =*/ token_type_ids.data(), /*.n_seq_id =*/ batch.n_seq_id, /*.seq_id =*/ batch.seq_id, /*.seq_id_unq =*/ this->seq_id_unq.data(), @@ -401,6 +409,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t udata->token .resize(n_tokens); udata->embd .clear(); udata->pos .resize(n_pos_all); + udata->token_type.resize(n_tokens); udata->n_seq_id .resize(n_tokens); udata->seq_id .resize(n_tokens); udata->seq_id_unq.resize(0); @@ -423,6 +432,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t /*.token =*/ udata->token.data(), /*.embd =*/ nullptr, /*.pos =*/ udata->pos.data(), + 
/*.token_type =*/ udata->token_type.data(), /*.n_seq_id =*/ udata->n_seq_id.data(), /*.seq_id =*/ udata->seq_id.data(), /*.seq_id_unq =*/ udata->seq_id_unq.data(), @@ -658,6 +668,7 @@ void llama_batch_allocr::clear() { batch = {}; pos .clear(); + token_type_ids.clear(); n_seq_id .clear(); seq_id .clear(); seq_id_unq.clear(); @@ -691,6 +702,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u udata->token .resize(n_tokens); udata->embd .resize(n_embd_all); udata->pos .resize(n_pos_all); + udata->token_type.resize(n_tokens); udata->n_seq_id .resize(n_tokens); udata->seq_id .resize(n_tokens); udata->seq_id_unq.resize(0); @@ -719,6 +731,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]]; } + udata->token_type[i] = token_type_ids[idxs[i]]; udata->n_seq_id[i] = batch.n_seq_id[idxs[i]]; udata->output[i] = batch.logits[idxs[i]]; @@ -758,6 +771,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u /*.token =*/ batch.token ? udata->token.data() : nullptr, /*.embd =*/ batch.embd ? 
udata->embd.data() : nullptr, /*.pos =*/ udata->pos.data(), + /*.token_type =*/ udata->token_type.data(), /*.n_seq_id =*/ udata->n_seq_id.data(), /*.seq_id =*/ udata->seq_id.data(), /*.seq_id_unq =*/ udata->seq_id_unq.data(), @@ -807,6 +821,7 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) { LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) ubatch.token); LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) ubatch.embd); LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) ubatch.pos); + LLAMA_LOG_DEBUG("%s: token_type = %p\n", __func__, (void *) ubatch.token_type); LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) ubatch.n_seq_id); LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) ubatch.seq_id); LLAMA_LOG_DEBUG("%s: seq_id_unq = %s\n", __func__, ss_seq_id_unq.str().c_str()); @@ -843,9 +858,9 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) { } if (ubatch.token) { - LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n", + LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, token_type = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n", __func__, i, ubatch.token[i], vocab->token_to_piece(ubatch.token[i]).c_str(), - ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]); + ubatch.pos[i], ubatch.token_type[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]); } else { LLAMA_LOG_DEBUG("%s: %4d: [embd], pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n", __func__, i, ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]); diff --git a/src/llama-batch.h b/src/llama-batch.h index f77520e86c3..6746c3648b5 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -45,6 +45,7 @@ struct llama_ubatch { llama_token * token; // [n_tokens] | i | id, token float * embd; // [n_embd, n_tokens] | i | embd llama_pos * pos; // [n_tokens*n_pos] | i | pos + int32_t * token_type; // [n_tokens] | i | token_type 
int32_t * n_seq_id; // [n_tokens] | i | - llama_seq_id ** seq_id; // [n_tokens] | s | s0, s1, seq_id llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id @@ -55,6 +56,7 @@ struct llama_ubatch { std::vector<llama_token> token; std::vector<float> embd; std::vector<llama_pos> pos; + std::vector<int32_t> token_type; std::vector<int32_t> n_seq_id; std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below std::vector<llama_seq_id> seq_id_unq; @@ -139,6 +141,7 @@ class llama_batch_allocr { std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id std::vector<llama_pos> pos; + std::vector<int32_t> token_type_ids; std::vector<int32_t> n_seq_id; std::vector<llama_seq_id *> seq_id; std::vector<llama_seq_id> seq_id_unq; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 8e2b6ab8e7e..71fa85232ae 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -126,6 +126,17 @@ bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_token_type::set_input(const llama_ubatch * ubatch) { + if (ubatch->token_type && type) { + const int64_t n_tokens = ubatch->n_tokens; + ggml_backend_tensor_set(type, ubatch->token_type, 0, n_tokens*ggml_element_size(type)); + } +} + +bool llm_graph_input_token_type::can_reuse(const llm_graph_params & params) { + return type->ne[0] == params.ubatch.n_tokens; +} + void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && attn_scale) { const int64_t n_tokens = ubatch->n_tokens; @@ -1719,6 +1730,15 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { return cur; } +ggml_tensor * llm_graph_context::build_inp_token_type() const { + auto inp = std::make_unique<llm_graph_input_token_type>(); + auto & cur = inp->type; + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(cur); + res->add_input(std::move(inp)); + return cur; +} + ggml_tensor * llm_graph_context::build_inp_attn_scale() const { auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset); diff --git a/src/llama-graph.h b/src/llama-graph.h index 
29e78451fbb..4753e8375bd 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -133,6 +133,18 @@ class llm_graph_input_pos : public llm_graph_input_i { const uint32_t n_pos_per_embd = 1; }; +class llm_graph_input_token_type: public llm_graph_input_i{ + public: + llm_graph_input_token_type() =default; + virtual ~llm_graph_input_token_type() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * type = nullptr; // I32 [n_batch] +}; + // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: @@ -861,6 +873,7 @@ struct llm_graph_context { ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const; ggml_tensor * build_inp_pos() const; + ggml_tensor * build_inp_token_type() const; ggml_tensor * build_inp_attn_scale() const; ggml_tensor * build_inp_out_ids() const; ggml_tensor * build_inp_mean() const; diff --git a/src/models/bert.cpp b/src/models/bert.cpp index 6ab8c136858..260447d6803 100644 --- a/src/models/bert.cpp +++ b/src/models/bert.cpp @@ -9,6 +9,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params ggml_tensor * cur; ggml_tensor * inpL; ggml_tensor * inp_pos = nullptr; + ggml_tensor * inp_token_type; if (model.arch != LLM_ARCH_JINA_BERT_V2) { inp_pos = build_inp_pos(); @@ -17,10 +18,9 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params // construct input embeddings (token, type, position) inpL = build_inp_embd(model.tok_embd); - // token types are hardcoded to zero ("Sentence A") if (model.type_embd) { - ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); - inpL = ggml_add(ctx0, inpL, type_row0); + inp_token_type = build_inp_token_type(); + inpL = ggml_add(ctx0, inpL, ggml_get_rows(ctx0, model.type_embd, inp_token_type)); } if (model.arch == LLM_ARCH_BERT) { inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); 
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index ed5e306fc5b..856a5042a7e 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -7,6 +7,8 @@ #include "chat.h" #include "base64.hpp" +#include +#include #include "server-common.h" #include @@ -503,7 +505,8 @@ bool server_tokens::validate(const struct llama_context * ctx) const { return false; } } else if (t < 0 || t >= n_vocab) { - return false; + /* for BERT an id >= n_vocab encodes t + token_type_id*n_vocab: accept it and keep validating the remaining tokens */ + if (t < 0 || model->arch != LLM_ARCH_BERT) { return false; } } } return true; @@ -2037,17 +2040,33 @@ server_tokens format_prompt_rerank( server_tokens result = {}; const char * rerank_prompt = llama_model_chat_template(model, "rerank"); - + auto vocab_size = llama_vocab_n_tokens(vocab); if (rerank_prompt != nullptr) { std::string prompt = rerank_prompt; - string_replace_all(prompt, "{query}" , query); - string_replace_all(prompt, "{document}", doc ); - server_tokens tokens = tokenize_input_subprompt(vocab, mctx, prompt, false, true); - result.push_back(tokens); + size_t pos = prompt.find("{document}"); + std::string query_prompt = prompt.substr(0, pos); + std::string doc_prompt = pos == std::string::npos ? std::string() : prompt.substr(pos); /* a template without {document} must not reach substr(npos), which throws std::out_of_range */ + string_replace_all(query_prompt, "{query}" , query); + string_replace_all(doc_prompt, "{document}", doc ); + auto query_tokens= tokenize_input_subprompt(vocab, mctx,query_prompt, false, true); + auto doc_tokens= tokenize_input_subprompt(vocab, mctx,doc_prompt, false, true); + if (model->arch == LLM_ARCH_BERT){ + // token_id = token_id + token_type_ids*vocab_size + for (int32_t i = 0; i < doc_tokens.size(); i++) { + doc_tokens.set_token(i,doc_tokens[i] + vocab_size); + } + } + result.push_back(query_tokens); + result.push_back(doc_tokens); } else { // Get EOS token - use SEP token as fallback if EOS is not available server_tokens query_tokens = tokenize_input_subprompt(vocab, mctx, query, false, false); server_tokens doc_tokens = tokenize_input_subprompt(vocab, mctx, doc, false, false); + 
if (model->arch == LLM_ARCH_BERT) { + for (int32_t i = 0; i < doc_tokens.size(); i++) { + doc_tokens.set_token(i,doc_tokens[i] + vocab_size); + } + } llama_token eos_token = llama_vocab_eos(vocab); if (eos_token == LLAMA_TOKEN_NULL) { eos_token = llama_vocab_sep(vocab); @@ -2065,7 +2084,7 @@ server_tokens format_prompt_rerank( } result.push_back(doc_tokens); if (llama_vocab_get_add_eos(vocab)) { - result.push_back(eos_token); + result.push_back(model->arch == LLM_ARCH_BERT ? eos_token+ vocab_size : eos_token); } }