1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -52,4 +52,5 @@ else()
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(internvl)
endif()
4 changes: 2 additions & 2 deletions examples/internvl/README.md
@@ -45,7 +45,7 @@ python convert_hf_to_gguf.py path/to/adjusted-internlm-chat/
6. Use `vision_model_to_gguf.py` to convert the image encoder to GGUF:

```sh
python vision_model_to_gguf.py path/to/Mini-InternVL-Chat-2B-V1-5/model.safetensors
python vision_model_to_gguf.py -m path/to/Mini-InternVL-Chat-2B-V1-5/model.safetensors
```

7. Collect and rename the models:
@@ -169,4 +169,4 @@ llama_print_timings: sample time = 0.21 ms / 6 runs ( 0.03 m
llama_print_timings: prompt eval time = 70.31 ms / 820 tokens ( 0.09 ms per token, 11661.97 tokens per second)
llama_print_timings: eval time = 15.84 ms / 5 runs ( 3.17 ms per token, 315.68 tokens per second)
llama_print_timings: total time = 446.85 ms / 825 tokens
```
8 changes: 8 additions & 0 deletions examples/internvl/clip.cpp
@@ -26,6 +26,14 @@
#include "ggml-metal.h"
#endif

#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif

#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

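// Reference sketch, not part of this PR: how a backend could be selected from
// the same compile-time flags guarded above. Assumes the corresponding ggml
// backend headers are included, as at the top of this file.
static ggml_backend_t clip_backend_init_sketch(void) {
#if defined(GGML_USE_METAL)
    return ggml_backend_metal_init();
#elif defined(GGML_USE_CANN)
    return ggml_backend_cann_init(0); // device 0
#elif defined(GGML_USE_VULKAN)
    return ggml_backend_vk_init(0);   // device 0
#else
    return ggml_backend_cpu_init();   // CPU fallback
#endif
}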
125 changes: 78 additions & 47 deletions examples/internvl/internvl-cli.cpp
@@ -1,23 +1,26 @@
#include "ggml.h"
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "internvl.h"
#include "llama.h"
#include "ggml.h"

#include "base64.hpp"

#include <cstdio>
#include <cstdlib>
#include <vector>

static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
static bool eval_tokens(struct llama_context * ctx, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
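// feed the tokens to llama_decode in chunks of at most n_batch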
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
@@ -26,38 +29,42 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
return true;
}

static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
static bool eval_id(struct llama_context * ctx, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
return eval_tokens(ctx, tokens, 1, n_past);
}

static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
static bool eval_string(struct llama_context * ctx, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, add_bos, true);

// printf("prompt token ids: ");
// for (int i = 0; i < (int) embd_inp.size(); i++) {
// printf("%d ", embd_inp[i]);
// }
// printf("\n");

eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return eval_tokens(ctx, embd_inp, n_batch, n_past); // propagate decode failures instead of always returning true
}

static const char * sample(struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_llama,
// static const char * sample(struct llama_sampler * smpl, // low-level llama_sampler signature, abandoned (see below)
static const char * sample(struct gpt_sampler * smpl,
struct llama_context * ctx,
int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
// const llama_token id = llama_sampler_sample(smpl, ctx, -1); // low-level variant; raised an exception when tried (2025-01-10)
gpt_sampler_accept(smpl, id, true);
// llama_sampler_accept(smpl, id); // low-level accept variants, abandoned along with llama_sampler_sample
static std::string ret;
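// ret is static so the pointer returned by c_str() stays valid after this function returns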
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
if (llama_token_is_eog(llama_get_model(ctx), id)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
ret = llama_token_to_piece(ctx, id);
}
eval_id(ctx_llama, id, n_past);
eval_id(ctx, id, n_past);
return ret.c_str();
}
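// Reference sketch (not called anywhere): the gpt_sampler lifecycle used by
// sample() above, with signatures as they appear in this diff; the bool passed
// to gpt_sampler_accept is assumed to gate grammar-state updates.
static void sample_api_sketch(llama_model * model, llama_context * ctx, gpt_params & params) {
    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); // build sampler chain from CLI params
    const llama_token id = gpt_sampler_sample(smpl, ctx, -1);            // sample from the last decoded logits
    gpt_sampler_accept(smpl, id, true);                                  // update penalty/grammar state
    gpt_sampler_free(smpl);
}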

@@ -114,17 +121,24 @@ static std::string remove_image_from_prompt(const std::string& prompt, const cha

struct internvl_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_context * ctx = NULL;
struct llama_model * model = NULL;
};

static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);

LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
static void print_usage(int, char ** argv) {
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n  %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG_TEE("\n  note: a lower temperature value like 0.1 is recommended for better quality.\n");
    LOG_TEE("\n");
}

static struct internvl_image_embed * load_image(internvl_context * ctx_internvl, gpt_params * params, const std::string & fname) {

@@ -135,14 +149,16 @@ static struct internvl_image_embed * load_image(internvl_context * ctx_internvl, gpt_params * params, const std::string & fname) {
if (!params->image.empty()) {
fprintf(stderr, "using base64 encoded image instead of command line image path\n");
}
embed = internvl_image_embed_make_with_prompt_base64(ctx_internvl->ctx_clip, params->n_threads, prompt);
embed = internvl_image_embed_make_with_prompt_base64(ctx_internvl->ctx_clip, 12, prompt); // NOTE: thread count hardcoded to 12 (was params->n_threads)
// embed = internvl_image_embed_make_with_prompt_base64(ctx_internvl->ctx_clip, params->n_threads, prompt);
if (!embed) {
fprintf(stderr, "%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = internvl_image_embed_make_with_filename(ctx_internvl->ctx_clip, params->n_threads, fname.c_str());
embed = internvl_image_embed_make_with_filename(ctx_internvl->ctx_clip, 12, fname.c_str()); // NOTE: thread count hardcoded to 12 (was params->n_threads)
// embed = internvl_image_embed_make_with_filename(ctx_internvl->ctx_clip, params->n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
@@ -158,7 +174,8 @@ static void process_prompt(struct internvl_context * ctx_internvl, struct intern
int n_past = 0;

const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_internvl->ctx_llama));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx_internvl->ctx));
// const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_internvl->ctx));

// InternVL chat format (ChatML-style), e.g. "<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n<image>\n请描述图片.<|im_end|><|im_start|>assistant\n" (请描述图片 = "please describe the image")
std::size_t img_tok_pos = prompt.find("<image>");
@@ -174,35 +191,45 @@ static void process_prompt(struct internvl_context * ctx_internvl, struct intern
prompt2 = "\n" + prompt;
}

eval_string(ctx_internvl->ctx_llama, ("<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
// eval_string(ctx_internvl->ctx_llama, ("<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
internvl_eval_image_embed(ctx_internvl->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_internvl->ctx_llama, ("</img>" + prompt2 + "<|im_end|><|im_start|>assistant\n").c_str(), params->n_batch, &n_past, false);
eval_string(ctx_internvl->ctx, ("<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
// eval_string(ctx_internvl->ctx, ("<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
internvl_eval_image_embed(ctx_internvl->ctx, image_embed, params->n_batch, &n_past); // raised an exception here during debugging (2025-01-09)
eval_string(ctx_internvl->ctx, ("</img>" + prompt2 + "<|im_end|><|im_start|>assistant\n").c_str(), params->n_batch, &n_past, false);
// generate the response

fprintf(stderr, "\n");

struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
struct gpt_sampler * smpl = gpt_sampler_init(ctx_internvl->model, params->sparams);
// (low-level alternatives such as llama_sampler_chain_init and llama_sampler_init
// were tried first and abandoned; none matched the signatures available here)
if (params->n_predict == -1) {
while (true) {
const char *tmp = sample(ctx_sampling, ctx_internvl->ctx_llama, &n_past);
const char *tmp = sample(smpl, ctx_internvl->ctx, &n_past); // raised an exception here during debugging (2025-01-10)
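// sample() returns "</s>" for any end-of-generation token; also stop at the ChatML end-of-turn marker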
if (strcmp(tmp, "</s>") == 0 || strcmp(tmp, "<|im_end|>") == 0)
break;
printf("%s", tmp);
fflush(stdout);
}
} else {
for (int i = 0; i < max_tgt_len; i++) {
const char *tmp = sample(ctx_sampling, ctx_internvl->ctx_llama, &n_past);
const char *tmp = sample(smpl, ctx_internvl->ctx, &n_past);
if (strcmp(tmp, "</s>") == 0 || strcmp(tmp, "<|im_end|>") == 0)
break;
printf("%s", tmp);
fflush(stdout);
}
}

llama_sampling_free(ctx_sampling);
// llama_sampler_free(smpl);
gpt_sampler_free(smpl);
printf("\n");
}

Expand All @@ -225,14 +252,14 @@ static struct llama_context * llama_init_context(gpt_params * params, llama_mode
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings

llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);

if (ctx_llama == NULL) {
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return NULL;
}

return ctx_llama;
return ctx;
}

static struct internvl_context * internvl_init_context(gpt_params * params, llama_model * model) {
@@ -248,7 +275,7 @@ static struct internvl_context * internvl_init_context(gpt_params * params, llama_model * model) {

auto ctx_internvl = (struct internvl_context *)malloc(sizeof(internvl_context));

ctx_internvl->ctx_llama = NULL;
ctx_internvl->ctx = NULL;
ctx_internvl->ctx_clip = ctx_clip;
ctx_internvl->model = model;
return ctx_internvl;
@@ -260,7 +287,7 @@ static void internvl_free(struct internvl_context * ctx_internvl) {
ctx_internvl->ctx_clip = NULL;
}

llama_free(ctx_internvl->ctx_llama);
llama_free(ctx_internvl->ctx);
llama_free_model(ctx_internvl->model);
llama_backend_free();
}
@@ -276,8 +303,10 @@ int main(int argc, char ** argv) {

gpt_params params;

if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
// if (!gpt_params_parse(argc, argv, params)) {
// print_usage(argc, argv, params);
LOG_TEE("Parameter parsing failed.\n");
return 1;
}

@@ -289,7 +318,8 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS

if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv, params);
print_usage(argc, argv);
// print_usage(argc, argv, params);
return 1;
}
// printf("[debug by cxt] use prompt: %s\n", params.prompt.c_str());
@@ -316,19 +346,19 @@ int main(int argc, char ** argv) {
// }
// printf("\n");
// }
// auto ctx_llama = llama_init_context(&params, model);
// auto ctx = llama_init_context(&params, model);

auto ctx_internvl = internvl_init_context(&params, model);
ctx_internvl->ctx_llama = llama_init_context(&params, model);
ctx_internvl->ctx = llama_init_context(&params, model);
for (auto & image : params.image) {
for (int i=0; i<15; i++) {

ctx_internvl->ctx_llama = llama_init_context(&params, model);
ctx_internvl->ctx = llama_init_context(&params, model); // NOTE: leaks the context from the previous iteration; clearing the KV cache would be cheaper:
// llama_kv_cache_clear(ctx_internvl->ctx);

const int64_t t_e2e_start_us = ggml_time_us();
auto image_embed = load_image(ctx_internvl, &params, image);
auto image_embed = load_image(ctx_internvl, &params, image); // loading the image succeeds; the observed failure happens later
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
@@ -341,7 +371,8 @@ int main(int argc, char ** argv) {
float t_e2e_cost_us = (t_e2e_end_us - t_e2e_start_us) / 1000.0;
LOG_TEE("\n%s: %d e2e in %8.2f ms\n", __func__, i, t_e2e_cost_us);

llama_print_timings(ctx_internvl->ctx_llama);
// llama_print_timings(ctx_internvl->ctx); // disabled: not useful for now

// internvl_adaptor_embed_free(prompt_embed);

2 changes: 1 addition & 1 deletion examples/internvl/internvl.cpp
@@ -265,7 +265,7 @@ bool internvl_eval_image_embed(llama_context * ctx_llama, const struct internvl_
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
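// Field order in this positional initializer (per the llama_batch struct of this
// llama.cpp revision; worth double-checking against llama.h):
//   n_tokens, token, embd, pos, n_seq_id, seq_id, logits, all_pos_0, all_pos_1, all_seq_id
// i.e. n_eval embedding vectors starting at position *n_past, position step 1, sequence 0.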
if (llama_decode(ctx_llama, batch)) {
if (llama_decode(ctx_llama, batch)) { // raised an exception here during debugging (2025-01-09)
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}