Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6559,7 +6559,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
name = name[:-5] + ".bias"

# we are only using BERT for embeddings so we don't need the pooling layer
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
if name == "embeddings.position_ids":
return # we don't need these

if name.startswith("cls.predictions"):
Expand All @@ -6576,6 +6576,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if name == "classifier.bias":
name = "classifier.out_proj.bias"

if name == "pooler.dense.weight":
name = "classifier.weight"

if name == "pooler.dense.bias":
name = "classifier.bias"

yield from super().modify_tensors(data_torch, name, bid)

def _xlmroberta_tokenizer_init(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions examples/embedding/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
64 changes: 43 additions & 21 deletions examples/embedding/embedding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
#include "common.h"
#include "log.h"
#include "llama.h"

#include <src/llama-arch.h>
#include <src/llama-model.h>
#include <clocale>
#include <ctime>
#include <algorithm>

#if defined(_MSC_VER)
Expand Down Expand Up @@ -169,13 +169,12 @@ int main(int argc, char ** argv) {

// split the prompt into lines
std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
int32_t token_type_offset = llama_vocab_n_tokens(vocab);

// max batch size
const uint64_t n_batch = params.n_batch;

// get added sep and eos token, if any
const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
const char * rerank_prompt = llama_model_chat_template(model, "rerank");

// tokenize the prompts and trim
Expand All @@ -186,27 +185,50 @@ int main(int argc, char ** argv) {
// split classification pairs and insert expected separator tokens
if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
const std::string query = pairs[0];
const std::string doc = pairs[1];
if (rerank_prompt != nullptr) {
const std::string query = pairs[0];
const std::string doc = pairs[1];
std::string final_prompt = rerank_prompt;
string_replace_all(final_prompt, "{query}" , query);
string_replace_all(final_prompt, "{document}", doc );
inp = common_tokenize(vocab, final_prompt, true, true);
size_t pos = final_prompt.find("{document}");
std::string query_prompt = final_prompt.substr(0, pos);
std::string doc_prompt = final_prompt.substr(pos);
string_replace_all(query_prompt, "{query}" , query);
string_replace_all(doc_prompt, "{document}", doc );

auto inp_q= common_tokenize(vocab, query_prompt, false, true);
auto inp_d= common_tokenize(vocab, doc_prompt, false, true);

for(auto token: inp_q){
inp.emplace_back(token);
}
for(auto token: inp_d){
inp.emplace_back(model->arch == LLM_ARCH_BERT ? token + token_type_offset : token );
}
} else {
std::string final_prompt;
for (size_t i = 0; i < pairs.size(); i++) {
final_prompt += pairs[i];
if (i != pairs.size() - 1) {
if (!added_eos_token.empty()) {
final_prompt += added_eos_token;
}
if (!added_sep_token.empty()) {
final_prompt += added_sep_token;
}
}
llama_token eos_token = llama_vocab_eos(vocab);
if (eos_token == LLAMA_TOKEN_NULL) {
eos_token = llama_vocab_sep(vocab);
}

auto inp_q= common_tokenize(vocab, query, false, false);
auto inp_d= common_tokenize(vocab, doc, false, false);
if (llama_vocab_get_add_bos(vocab)) {
inp.emplace_back(llama_vocab_bos(vocab)); //add bos
}
inp.insert(inp.end(), inp_q.begin(), inp_q.end());//add seq A
if (llama_vocab_get_add_eos(vocab)) {
inp.emplace_back(eos_token); //add eos
}
if (llama_vocab_get_add_sep(vocab)) {
inp.emplace_back(llama_vocab_sep(vocab)); //add sep
}
for(auto token: inp_d){ //add seq B
inp.emplace_back(model->arch == LLM_ARCH_BERT ? token + token_type_offset : token);

}
if (llama_vocab_get_add_eos(vocab)) {
inp.emplace_back(model->arch == LLM_ARCH_BERT ? eos_token + token_type_offset : eos_token); //add eos
}
inp = common_tokenize(ctx, final_prompt, true, true);
}
} else {
inp = common_tokenize(ctx, prompt, true, true);
Expand Down
25 changes: 20 additions & 5 deletions src/llama-batch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,12 @@ bool llama_batch_allocr::init(
return false;
}

int32_t vocab_size = vocab.n_tokens();
if (batch.token) {
for (int32_t i = 0; i < batch.n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return false;
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab_size) {
LLAMA_LOG_WARN("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
break;
}
}
}
Expand All @@ -69,6 +70,12 @@ bool llama_batch_allocr::init(
//
// auto-generate missing fields
//
token_type_ids.resize(batch.n_tokens);
for (int32_t i = 0; i < batch.n_tokens; ++i) {
int32_t token_type = batch.token[i] / vocab_size;
batch.token[i] = batch.token[i] - token_type * vocab_size;
token_type_ids[i] = token_type;
}

if (!batch.n_seq_id) {
n_seq_id.resize(batch.n_tokens);
Expand Down Expand Up @@ -219,6 +226,7 @@ bool llama_batch_allocr::init(
/*.token =*/ batch.token,
/*.embd =*/ batch.embd,
/*.pos =*/ batch.pos,
/*.token_type =*/ token_type_ids.data(),
/*.n_seq_id =*/ batch.n_seq_id,
/*.seq_id =*/ batch.seq_id,
/*.seq_id_unq =*/ this->seq_id_unq.data(),
Expand Down Expand Up @@ -401,6 +409,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
udata->token .resize(n_tokens);
udata->embd .clear();
udata->pos .resize(n_pos_all);
udata->token_type.resize(n_tokens);
udata->n_seq_id .resize(n_tokens);
udata->seq_id .resize(n_tokens);
udata->seq_id_unq.resize(0);
Expand All @@ -423,6 +432,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
/*.token =*/ udata->token.data(),
/*.embd =*/ nullptr,
/*.pos =*/ udata->pos.data(),
/*.token_type =*/ udata->token_type.data(),
/*.n_seq_id =*/ udata->n_seq_id.data(),
/*.seq_id =*/ udata->seq_id.data(),
/*.seq_id_unq =*/ udata->seq_id_unq.data(),
Expand Down Expand Up @@ -658,6 +668,7 @@ void llama_batch_allocr::clear() {
batch = {};

pos .clear();
token_type_ids.clear();
n_seq_id .clear();
seq_id .clear();
seq_id_unq.clear();
Expand Down Expand Up @@ -691,6 +702,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->token .resize(n_tokens);
udata->embd .resize(n_embd_all);
udata->pos .resize(n_pos_all);
udata->token_type.resize(n_tokens);
udata->n_seq_id .resize(n_tokens);
udata->seq_id .resize(n_tokens);
udata->seq_id_unq.resize(0);
Expand Down Expand Up @@ -719,6 +731,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
}

udata->token_type[i] = token_type_ids[idxs[i]];
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
udata->output[i] = batch.logits[idxs[i]];

Expand Down Expand Up @@ -758,6 +771,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
/*.token =*/ batch.token ? udata->token.data() : nullptr,
/*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
/*.pos =*/ udata->pos.data(),
/*token_type =*/ udata->token_type.data(),
/*.n_seq_id =*/ udata->n_seq_id.data(),
/*.seq_id =*/ udata->seq_id.data(),
/*.seq_id_unq =*/ udata->seq_id_unq.data(),
Expand Down Expand Up @@ -807,6 +821,7 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) ubatch.token);
LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) ubatch.embd);
LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) ubatch.pos);
LLAMA_LOG_DEBUG("%s: token_type = %p\n", __func__, (void *) ubatch.token_type);
LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) ubatch.n_seq_id);
LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) ubatch.seq_id);
LLAMA_LOG_DEBUG("%s: seq_id_unq = %s\n", __func__, ss_seq_id_unq.str().c_str());
Expand Down Expand Up @@ -843,9 +858,9 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
}

if (ubatch.token) {
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, token_type = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
__func__, i, ubatch.token[i], vocab->token_to_piece(ubatch.token[i]).c_str(),
ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
ubatch.pos[i], ubatch.token_type[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
} else {
LLAMA_LOG_DEBUG("%s: %4d: [embd], pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
__func__, i, ubatch.pos[i], ubatch.n_seq_id[i], ss.str().c_str(), ubatch.output[i]);
Expand Down
3 changes: 3 additions & 0 deletions src/llama-batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct llama_ubatch {
llama_token * token; // [n_tokens] | i | id, token
float * embd; // [n_embd, n_tokens] | i | embd
llama_pos * pos; // [n_tokens*n_pos] | i | pos
int32_t * token_type; // [n_tokens] | i | token_type
int32_t * n_seq_id; // [n_tokens] | i | -
llama_seq_id ** seq_id; // [n_tokens] | s | s0, s1, seq_id
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
Expand All @@ -55,6 +56,7 @@ struct llama_ubatch {
std::vector<llama_token> token;
std::vector<float> embd;
std::vector<llama_pos> pos;
std::vector<int32_t> token_type;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
std::vector<llama_seq_id> seq_id_unq;
Expand Down Expand Up @@ -139,6 +141,7 @@ class llama_batch_allocr {
std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id

std::vector<llama_pos> pos;
std::vector<int32_t> token_type_ids;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id;
std::vector<llama_seq_id> seq_id_unq;
Expand Down
20 changes: 20 additions & 0 deletions src/llama-graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,17 @@ bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
return res;
}

void llm_graph_input_token_type::set_input(const llama_ubatch * ubatch) {
    // upload the per-token type ids (e.g. BERT segment ids) into the input tensor
    if (!ubatch->token_type || !type) {
        return; // no type ids in this ubatch, or the graph has no token-type input
    }

    const int64_t n_tokens = ubatch->n_tokens;

    ggml_backend_tensor_set(type, ubatch->token_type, 0, n_tokens*ggml_element_size(type));
}

bool llm_graph_input_token_type::can_reuse(const llm_graph_params & params) {
    // the input tensor can be reused only if the batch size is unchanged
    const bool res = type->ne[0] == params.ubatch.n_tokens;

    return res;
}

void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;
Expand Down Expand Up @@ -1719,6 +1730,15 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
return cur;
}

ggml_tensor * llm_graph_context::build_inp_token_type() const {
    // create the I32 [n_tokens] input tensor holding per-token type ids
    auto inp = std::make_unique<llm_graph_input_token_type>();

    ggml_tensor * cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    ggml_set_input(cur);

    inp->type = cur;

    res->add_input(std::move(inp));

    return cur;
}

ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);

Expand Down
13 changes: 13 additions & 0 deletions src/llama-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,18 @@ class llm_graph_input_pos : public llm_graph_input_i {
const uint32_t n_pos_per_embd = 1;
};

// per-token type ids (e.g. BERT segment/sentence A-B ids), used to select rows
// from the model's type embedding table
class llm_graph_input_token_type : public llm_graph_input_i {
public:
    llm_graph_input_token_type() = default;
    virtual ~llm_graph_input_token_type() = default;

    // copy ubatch->token_type into the input tensor
    void set_input(const llama_ubatch * ubatch) override;

    // reusable only while the batch size matches the tensor's first dimension
    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * type = nullptr; // I32 [n_batch]
};

// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
Expand Down Expand Up @@ -861,6 +873,7 @@ struct llm_graph_context {

ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
ggml_tensor * build_inp_pos() const;
ggml_tensor * build_inp_token_type() const;
ggml_tensor * build_inp_attn_scale() const;
ggml_tensor * build_inp_out_ids() const;
ggml_tensor * build_inp_mean() const;
Expand Down
6 changes: 3 additions & 3 deletions src/models/bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = nullptr;
ggml_tensor * inp_token_type;

if (model.arch != LLM_ARCH_JINA_BERT_V2) {
inp_pos = build_inp_pos();
Expand All @@ -17,10 +18,9 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
// construct input embeddings (token, type, position)
inpL = build_inp_embd(model.tok_embd);

// token types are hardcoded to zero ("Sentence A")
if (model.type_embd) {
ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
inp_token_type = build_inp_token_type();
inpL = ggml_add(ctx0, inpL, ggml_get_rows(ctx0, model.type_embd, inp_token_type));
}
if (model.arch == LLM_ARCH_BERT) {
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
Expand Down
Loading