diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 67b3d277478..e324da6e71a 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -52,4 +52,5 @@ else()
     add_subdirectory(simple)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
+    add_subdirectory(internvl)
 endif()
diff --git a/examples/internvl/README.md b/examples/internvl/README.md
index e6c11e31bc2..41df666f2a2 100644
--- a/examples/internvl/README.md
+++ b/examples/internvl/README.md
@@ -45,7 +45,7 @@ python convert_hf_to_gguf.py path/to/adjusted-internlm-chat/
 6. Use `vision_model_to_gguf.py` to convert the image encoder to GGUF:
 
 ```sh
-python vision_model_to_gguf.py path/to/Mini-InternVL-Chat-2B-V1-5/model.safetensors
+python vision_model_to_gguf.py -m path/to/Mini-InternVL-Chat-2B-V1-5/model.safetensors
 ```
 
 7. Collect and rename the models:
@@ -169,4 +169,4 @@ llama_print_timings: sample time = 0.21 ms / 6 runs ( 0.03 m
 llama_print_timings: prompt eval time =      70.31 ms /   820 tokens (    0.09 ms per token, 11661.97 tokens per second)
 llama_print_timings:        eval time =      15.84 ms /     5 runs   (    3.17 ms per token,   315.68 tokens per second)
 llama_print_timings:       total time =     446.85 ms /   825 tokens
-```
\ No newline at end of file
+```
diff --git a/examples/internvl/clip.cpp b/examples/internvl/clip.cpp
index 9569198aa27..43cfc94a9ea 100644
--- a/examples/internvl/clip.cpp
+++ b/examples/internvl/clip.cpp
@@ -26,6 +26,14 @@
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
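For context, backend headers like `ggml-cann.h` and `ggml-vulkan.h` are normally consumed further down in `clip.cpp`-style loaders through the same compile-time ladder as the existing CUDA/Metal branches. A minimal sketch, under that assumption; the function name and device index `0` are illustrative, not part of this patch:

```cpp
#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif

// Sketch: pick a device backend at compile time, falling back to CPU.
static ggml_backend_t clip_backend_init_sketch(void) {
#if defined(GGML_USE_CUDA)
    return ggml_backend_cuda_init(0);  // first CUDA device
#elif defined(GGML_USE_METAL)
    return ggml_backend_metal_init();
#elif defined(GGML_USE_CANN)
    return ggml_backend_cann_init(0);  // first Ascend NPU device
#elif defined(GGML_USE_VULKAN)
    return ggml_backend_vk_init(0);    // first Vulkan device
#else
    return ggml_backend_cpu_init();    // CPU fallback
#endif
}
```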
diff --git a/examples/internvl/internvl-cli.cpp b/examples/internvl/internvl-cli.cpp
index ed33a4cc2e1..fe21b977adf 100644
--- a/examples/internvl/internvl-cli.cpp
+++ b/examples/internvl/internvl-cli.cpp
@@ -1,23 +1,26 @@
-#include "ggml.h"
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "internvl.h"
 #include "llama.h"
+#include "ggml.h"
 
-#include "base64.hpp"
 
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
 
-static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+static bool eval_tokens(struct llama_context * ctx, std::vector<llama_token> tokens, int n_batch, int * n_past) {
     int N = (int) tokens.size();
     for (int i = 0; i < N; i += n_batch) {
         int n_eval = (int) tokens.size() - i;
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
             fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
@@ -26,15 +29,15 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token>
     return true;
 }
 
-static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+static bool eval_id(struct llama_context * ctx, int id, int * n_past) {
     std::vector<llama_token> tokens;
     tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past);
+    return eval_tokens(ctx, tokens, 1, n_past);
 }
 
-static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+static bool eval_string(struct llama_context * ctx, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, add_bos, true);
 
     // printf("prompt token ids: ");
     // for (int i = 0; i < (int) embd_inp.size(); i++) {
@@ -42,22 +45,26 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
     // }
     // printf("\n");
 
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+    eval_tokens(ctx, embd_inp, n_batch, n_past);
     return true;
 }
 
-static const char * sample(struct llama_sampling_context * ctx_sampling,
-                           struct llama_context * ctx_llama,
+static const char * sample(struct gpt_sampler * smpl,
+                           struct llama_context * ctx,
                            int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+    gpt_sampler_accept(smpl, id, true);
     static std::string ret;
-    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+    if (llama_token_is_eog(llama_get_model(ctx), id)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_piece(ctx_llama, id);
+        ret = llama_token_to_piece(ctx, id);
     }
-    eval_id(ctx_llama, id, n_past);
+    eval_id(ctx, id, n_past);
     return ret.c_str();
 }
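The `sample()` rewrite above tracks the migration from the removed `llama_sampling_*` API to the `gpt_sampler` wrapper in `common/sampling.h`. A minimal sketch of the new lifecycle, assuming the same llama.cpp revision as this patch; `generate_sketch` is a hypothetical helper, not code from the PR:

```cpp
#include "common.h"
#include "sampling.h"

#include <cstdio>

// Sketch: sample/accept/print until an end-of-generation token appears.
static void generate_sketch(llama_model * model, llama_context * ctx,
                            const gpt_sampler_params & sparams, int n_predict) {
    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);
    for (int i = 0; i < n_predict; i++) {
        const llama_token id = gpt_sampler_sample(smpl, ctx, -1); // -1: last logits row
        gpt_sampler_accept(smpl, id, /*accept_grammar=*/true);    // update sampler state
        if (llama_token_is_eog(model, id)) {
            break;
        }
        printf("%s", llama_token_to_piece(ctx, id).c_str());
        // a real loop feeds `id` back through llama_decode() before sampling
        // again, which is what eval_id() does inside sample() above
    }
    gpt_sampler_free(smpl);
}
```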
@@ -114,17 +121,24 @@ static std::string remove_image_from_prompt(const std::string& prompt, const cha
 struct internvl_context {
     struct clip_ctx * ctx_clip = NULL;
-    struct llama_context * ctx_llama = NULL;
+    struct llama_context * ctx = NULL;
     struct llama_model * model = NULL;
 };
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-    LOG_TEE("\n example usage:\n");
-    LOG_TEE("\n %s -m <model> --mmproj <mmproj> --image <image> [--image <image>] [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int, char ** argv) {
+    LOG_TEE("\n example usage:\n");
+    LOG_TEE("\n %s -m <model> --mmproj <mmproj> --image <image> [--image <image>] [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
 static struct internvl_image_embed * load_image(internvl_context * ctx_internvl, gpt_params * params, const std::string & fname) {
@@ -135,14 +149,16 @@ static struct internvl_image_embed * load_image(internvl_context * ctx_internvl,
         if (!params->image.empty()) {
             fprintf(stderr, "using base64 encoded image instead of command line image path\n");
         }
-        embed = internvl_image_embed_make_with_prompt_base64(ctx_internvl->ctx_clip, params->n_threads, prompt);
+        embed = internvl_image_embed_make_with_prompt_base64(ctx_internvl->ctx_clip, 12, prompt); // FIXME: hard-coded thread count; was params->n_threads
         if (!embed) {
             fprintf(stderr, "%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = internvl_image_embed_make_with_filename(ctx_internvl->ctx_clip, params->n_threads, fname.c_str());
+        embed = internvl_image_embed_make_with_filename(ctx_internvl->ctx_clip, 12, fname.c_str()); // FIXME: hard-coded thread count; was params->n_threads
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
@@ -158,7 +174,8 @@ static void process_prompt(struct internvl_context * ctx_internvl, struct intern
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_internvl->ctx_llama));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx_internvl->ctx));
 
     // llava chat format is "'<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n<image>\n请描述图片.<|im_end|><|im_start|>assistant\n'"
     std::size_t img_tok_pos = prompt.find("<image>");
@@ -174,19 +191,28 @@ static void process_prompt(struct internvl_context * ctx_internvl, struct intern
         prompt2 = "\n" + prompt;
     }
 
-    eval_string(ctx_internvl->ctx_llama, ("<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
-    // eval_string(ctx_internvl->ctx_llama, ("<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
-    internvl_eval_image_embed(ctx_internvl->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_internvl->ctx_llama, ("</img>" + prompt2 + "<|im_end|><|im_start|>assistant\n").c_str(), params->n_batch, &n_past, false);
+    eval_string(ctx_internvl->ctx, ("<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
+    // eval_string(ctx_internvl->ctx, ("<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
+    internvl_eval_image_embed(ctx_internvl->ctx, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_internvl->ctx, ("</img>" + prompt2 + "<|im_end|><|im_start|>assistant\n").c_str(), params->n_batch, &n_past, false);
 
     // generate the response
 
     fprintf(stderr, "\n");
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    struct gpt_sampler * smpl = gpt_sampler_init(ctx_internvl->model, params->sparams);
 
     if (params->n_predict == -1) {
         while (true) {
-            const char *tmp = sample(ctx_sampling, ctx_internvl->ctx_llama, &n_past);
+            const char *tmp = sample(smpl, ctx_internvl->ctx, &n_past);
             if (strcmp(tmp, "</s>") == 0 || strcmp(tmp, "<|im_end|>") == 0)
                 break;
             printf("%s", tmp);
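The three calls above are the core of the multimodal prompt handling: text up to the image marker, then the precomputed image embedding, then the trailing text, all sharing one `n_past` counter so the segments occupy consecutive KV-cache positions. An illustrative wrapper over the patch's own helpers (`eval_interleaved` is a hypothetical name, not part of the PR):

```cpp
#include <string>

// Sketch: one shared n_past counter threads through text -> image -> text.
static bool eval_interleaved(llama_context * ctx, internvl_image_embed * image_embed,
                             const std::string & pre, const std::string & post, int n_batch) {
    int n_past = 0; // shared KV-cache position counter
    return eval_string(ctx, pre.c_str(),  n_batch, &n_past, /*add_bos=*/true)   // "...user\n" + prompt1 + "<img>"
        && internvl_eval_image_embed(ctx, image_embed, n_batch, &n_past)        // image enters as raw embeddings
        && eval_string(ctx, post.c_str(), n_batch, &n_past, /*add_bos=*/false); // "</img>" + prompt2 + "...assistant\n"
}
```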
@@ -194,7 +220,7 @@ static void process_prompt(struct internvl_context * ctx_internvl, struct intern
         }
     } else {
         for (int i = 0; i < max_tgt_len; i++) {
-            const char *tmp = sample(ctx_sampling, ctx_internvl->ctx_llama, &n_past);
+            const char *tmp = sample(smpl, ctx_internvl->ctx, &n_past);
             if (strcmp(tmp, "</s>") == 0 || strcmp(tmp, "<|im_end|>") == 0)
                 break;
             printf("%s", tmp);
@@ -202,7 +228,8 @@ static void process_prompt(struct internvl_context * ctx_internvl, struct intern
         }
     }
 
-    llama_sampling_free(ctx_sampling);
+    gpt_sampler_free(smpl);
     printf("\n");
 }
 
@@ -225,14 +252,14 @@ static struct llama_context * llama_init_context(gpt_params * params, llama_mode
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
-    if (ctx_llama == NULL) {
+    if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
         return NULL;
     }
 
-    return ctx_llama;
+    return ctx;
 }
 
 static struct internvl_context * internvl_init_context(gpt_params * params, llama_model * model) {
@@ -248,7 +275,7 @@ static struct internvl_context * internvl_init_context(gpt_params * params, llam
 
     auto ctx_internvl = (struct internvl_context *)malloc(sizeof(internvl_context));
 
-    ctx_internvl->ctx_llama = NULL;
+    ctx_internvl->ctx = NULL;
     ctx_internvl->ctx_clip = ctx_clip;
     ctx_internvl->model = model;
     return ctx_internvl;
@@ -260,7 +287,7 @@ static void internvl_free(struct internvl_context * ctx_internvl) {
         ctx_internvl->ctx_clip = NULL;
     }
 
-    llama_free(ctx_internvl->ctx_llama);
+    llama_free(ctx_internvl->ctx);
     llama_free_model(ctx_internvl->model);
     llama_backend_free();
 }
@@ -276,8 +303,10 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
+        LOG_TEE("Parameter parsing failed.\n");
         return 1;
     }
 
@@ -289,7 +318,8 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv, params);
+        print_usage(argc, argv);
         return 1;
     }
     // printf("[debug by cxt] use prompt: %s\n", params.prompt.c_str());
@@ -316,19 +346,19 @@ int main(int argc, char ** argv) {
     //     }
     //     printf("\n");
     // }
-    // auto ctx_llama = llama_init_context(&params, model);
+    // auto ctx = llama_init_context(&params, model);
     auto ctx_internvl = internvl_init_context(&params, model);
-    ctx_internvl->ctx_llama = llama_init_context(&params, model);
+    ctx_internvl->ctx = llama_init_context(&params, model);
 
     for (auto & image : params.image) {
         for (int i=0; i<15; i++) {
-            ctx_internvl->ctx_llama = llama_init_context(&params, model);
+            ctx_internvl->ctx = llama_init_context(&params, model);
             // // clear kv cache
-            // llama_kv_cache_clear(ctx_internvl->ctx_llama);
+            // llama_kv_cache_clear(ctx_internvl->ctx);
             const int64_t t_e2e_start_us = ggml_time_us();
             auto image_embed = load_image(ctx_internvl, &params, image);
             if (!image_embed) {
                 std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
                 return 1;
@@ -341,7 +371,8 @@ int main(int argc, char ** argv) {
             float t_e2e_cost_us = (t_e2e_end_us - t_e2e_start_us) / 1000.0;
             LOG_TEE("\n%s: %d e2e in %8.2f ms\n", __func__, i, t_e2e_cost_us);
 
-            llama_print_timings(ctx_internvl->ctx_llama);
+            // llama_print_timings(ctx_internvl->ctx); // disabled for now
 
             // internvl_adaptor_embed_free(prompt_embed);
diff --git a/examples/internvl/internvl.cpp b/examples/internvl/internvl.cpp
index 42ee35a95c7..24d4fe0c5b0 100644
--- a/examples/internvl/internvl.cpp
+++ b/examples/internvl/internvl.cpp
@@ -265,7 +265,7 @@ bool internvl_eval_image_embed(llama_context * ctx_llama, const struct internvl_
             n_eval = n_batch;
         }
         llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
+        if (llama_decode(ctx_llama, batch)) { // embeddings-only batch: relies on the guarded token check in llama.cpp below
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return false;
         }
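The aggregate initializer above builds an embeddings-only batch: `token` stays null and `embd` points at the precomputed image features. An annotated equivalent, assuming the `llama_batch` field layout of this (pre-batch-refactor) llama.cpp revision; `make_embd_batch` is a hypothetical helper:

```cpp
#include "llama.h"

// Sketch: embeddings-only batch; positions are implicit via all_pos_0/all_pos_1.
static llama_batch make_embd_batch(float * embd, int32_t n_eval, llama_pos pos0) {
    llama_batch batch = {
        /*.n_tokens   =*/ n_eval,
        /*.token      =*/ nullptr,  // no token ids: this is an embeddings batch
        /*.embd       =*/ embd,     // n_eval rows of n_embd floats
        /*.pos        =*/ nullptr,  // derived from all_pos_0 + i * all_pos_1
        /*.n_seq_id   =*/ nullptr,
        /*.seq_id     =*/ nullptr,
        /*.logits     =*/ nullptr,
        /*.all_pos_0  =*/ pos0,     // current n_past
        /*.all_pos_1  =*/ 1,        // position stride
        /*.all_seq_id =*/ 0,        // single sequence
    };
    return batch;
}
```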
diff --git a/examples/internvl/vision_model_to_gguf.py b/examples/internvl/vision_model_to_gguf.py
index 2e0531363a1..183c5c0be97 100644
--- a/examples/internvl/vision_model_to_gguf.py
+++ b/examples/internvl/vision_model_to_gguf.py
@@ -1,9 +1,13 @@
 import argparse
 import os
-
+import sys
+from pathlib import Path
 import torch
 from safetensors.torch import load_file
 import numpy as np
+
+sys.path.insert(1, str(Path(__file__).parent.parent.parent / 'gguf-py'))
+
 from gguf import *
 
 VISION = "clip.vision"
@@ -68,6 +72,14 @@ def get_tensor_name(name: str) -> str:
     config = json.load(config_file)
     hparams = config["vision_config"]
 
+# add parameters that the HF vision_config does not provide
+fout.add_bool("clip.has_text_encoder", False)
+fout.add_bool("clip.has_vision_encoder", True)
+fout.add_bool("clip.has_llava_projector", True)
+fout.add_bool("clip.use_gelu", True)
+fout.add_uint32("clip.vision.projection_dim", 768)
+
 fout.add_uint32("clip.vision.image_size", hparams["image_size"])
 fout.add_uint32("clip.vision.patch_size", hparams["patch_size"])
 fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), hparams["hidden_size"])
@@ -91,6 +103,14 @@ def get_tensor_name(name: str) -> str:
     if name.find('language_model') != -1:
         continue
     name = get_tensor_name(name)
+
+    # alternative tensor renaming, kept for reference:
+    # if name.startswith("mlp1"): name = name.replace("mlp1", "mm")
+    # if "attn.proj" in name: name = name.replace("attn.proj", "attn_out")
+    # if "mlp.fc1" in name: name = name.replace("mlp.fc1", "ffn_down")
+    # if "mlp.fc2" in name: name = name.replace("mlp.fc2", "ffn_up")
+
     data = data.float().numpy()
     # pw and dw conv ndim==4
     if (data.ndim == 2 or data.ndim == 4) and ftype == 1:
@@ -109,13 +129,20 @@ def get_tensor_name(name: str) -> str:
 
         print(f"{name} shape {data.shape} split into {len(qkv)} shape: {qkv[0].shape}, {qkv[1].shape}, {qkv[2].shape}")
 
         fout.add_tensor(name.replace(".attn.qkv", ".attn.q"), qkv[0])
         fout.add_tensor(name.replace(".attn.qkv", ".attn.k"), qkv[1])
         fout.add_tensor(name.replace(".attn.qkv", ".attn.v"), qkv[2])
+        # alternative naming (".attn_q" / ".attn_k" / ".attn_v"), kept for reference:
+        # fout.add_tensor(name.replace(".attn.qkv", ".attn_q"), qkv[0])
+        # fout.add_tensor(name.replace(".attn.qkv", ".attn_k"), qkv[1])
+        # fout.add_tensor(name.replace(".attn.qkv", ".attn_v"), qkv[2])
     else:
         fout.add_tensor(name, data)
 
 fout.write_header_to_file()
 fout.write_kv_data_to_file()
 fout.write_tensors_to_file()
-fout.close()
\ No newline at end of file
+fout.close()
diff --git a/src/llama.cpp b/src/llama.cpp
index 39e20440eea..fc6fd2a0248 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16076,12 +16076,6 @@ static int llama_decode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t) batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
 
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
@@ -16089,6 +16083,15 @@ static int llama_decode_internal(
 
     GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t) batch_all.token[i] >= lctx.model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
     GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
     GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16375,12 +16378,6 @@ static int llama_encode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
 
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
@@ -16388,6 +16385,15 @@ static int llama_encode_internal(
 
     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= lctx.model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
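The relocation above fixes the crash that embeddings-only batches triggered: `batch.token` is null for them, so the old unconditional loop dereferenced a null pointer before the token/embd exclusivity assert ever ran. The guarded pattern in isolation (`validate_tokens_sketch` and the `n_vocab` parameter are illustrative, not patch code):

```cpp
#include "ggml.h"
#include "llama.h"

// Sketch: validate token ids only when the batch actually carries tokens.
static int validate_tokens_sketch(const llama_batch & batch, int32_t n_vocab) {
    // a batch carries either token ids or embeddings, never both
    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd));
    if (batch.token) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            if (batch.token[i] < 0 || batch.token[i] >= n_vocab) {
                return -1; // invalid token id
            }
        }
    }
    return 0; // valid tokens, or an embeddings batch (nothing to check)
}
```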