Alusus language bindings for llama.cpp, providing a complete interface for running LLM inference locally. This library supports both CPU and Vulkan GPU backends.
import "Apm";
Apm.importFile("Alusus/Llama");
import "Srl/Console";
import "Apm";
Apm.importFile("Alusus/Llama");
// Quick-start example: load a model, decode a short prompt, and sample one token.
use Srl;
use Llama;
// Load backends. The CPU backend is required; the Vulkan backend is optional.
Ggml.Backend.cpuLoad();
Ggml.Backend.vkLoad(); // Optional: for Vulkan GPU support
// Load model and create an inference context, both with default parameters.
def model: ref[Model](Model.load("model.gguf", Model.getDefaultParams()));
def ctx: ref[Context](Context.initFromModel(model, Context.getDefaultParams()));
// Tokenize input. "Hello" has length 5; `tokens` holds up to 512 tokens.
// The trailing `true, true` are the addSpecial / parseSpecial flags.
def tokens: array[Token, 512];
def nTokens: Int = tokenize(model.vocab, "Hello", 5, tokens, 512, true, true);
// Decode and generate: wrap the prompt tokens in one batch and run it.
def batch: Batch = Batch.getOne(tokens, nTokens);
ctx.decode(batch);
// Sample next token with a greedy sampler.
// NOTE(review): idx -1 presumably selects the last decoded position, as in
// llama_sampler_sample — confirm against the binding implementation.
def sampler: ref[Sampler](Sampler.initGreedy());
def nextToken: Token = sampler.sample(ctx, -1);
// Cleanup — free in reverse order of creation: sampler, context, model.
Sampler.free(sampler);
Context.free(ctx);
Model.free(model);
Set the GGML_USE_VULKAN environment variable to 1 before running to enable Vulkan GPU acceleration:
GGML_USE_VULKAN=1 alusus your_script.alusus

| Alusus Type | llama.cpp Type | Description |
|---|---|---|
| `Token` | `llama_token` | Token ID (alias for `Int`) |
| `Pos` | `llama_pos` | Token position (alias for `Int`) |
| `SeqId` | `llama_seq_id` | Sequence ID (alias for `Int`) |
| `ProgressCallback` | `llama_progress_callback` | Progress callback function pointer |
| Alusus Constant | llama.cpp Constant | Value | Description |
|---|---|---|---|
| `DEFAULT_SEED` | `LLAMA_DEFAULT_SEED` | `0xFFFFFFFF` | Default random seed |
| `TOKEN_NULL` | `LLAMA_TOKEN_NULL` | `-1` | Null token value |
Controls how model layers are distributed across GPUs.
| Value | llama.cpp Value | Description |
|---|---|---|
| `NONE` | `LLAMA_SPLIT_MODE_NONE` | Single GPU only |
| `LAYER` | `LLAMA_SPLIT_MODE_LAYER` | Split layers and KV cache across GPUs |
| `ROW` | `LLAMA_SPLIT_MODE_ROW` | Split with tensor parallelism if supported |
RoPE (Rotary Position Embedding) scaling type.
| Value | llama.cpp Value | Description |
|---|---|---|
| `UNSPECIFIED` | `LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED` | Not specified |
| `NONE` | `LLAMA_ROPE_SCALING_TYPE_NONE` | No scaling |
| `LINEAR` | `LLAMA_ROPE_SCALING_TYPE_LINEAR` | Linear scaling |
| `YARN` | `LLAMA_ROPE_SCALING_TYPE_YARN` | YaRN scaling |
| `LONGROPE` | `LLAMA_ROPE_SCALING_TYPE_LONGROPE` | LongRoPE scaling |
Pooling type for embeddings.
| Value | llama.cpp Value | Description |
|---|---|---|
| `UNSPECIFIED` | `LLAMA_POOLING_TYPE_UNSPECIFIED` | Not specified |
| `NONE` | `LLAMA_POOLING_TYPE_NONE` | No pooling |
| `MEAN` | `LLAMA_POOLING_TYPE_MEAN` | Mean pooling |
| `CLS` | `LLAMA_POOLING_TYPE_CLS` | CLS token pooling |
| `LAST` | `LLAMA_POOLING_TYPE_LAST` | Last token pooling |
| `RANK` | `LLAMA_POOLING_TYPE_RANK` | Rank pooling |
Attention mechanism type.
| Value | llama.cpp Value | Description |
|---|---|---|
| `UNSPECIFIED` | `LLAMA_ATTENTION_TYPE_UNSPECIFIED` | Not specified |
| `CAUSAL` | `LLAMA_ATTENTION_TYPE_CAUSAL` | Causal attention (autoregressive) |
| `NON_CAUSAL` | `LLAMA_ATTENTION_TYPE_NON_CAUSAL` | Non-causal attention |
Flash attention configuration.
| Value | llama.cpp Value | Description |
|---|---|---|
| `AUTO` | `-1` | Automatic selection |
| `DISABLED` | `0` | Flash attention disabled |
| `ENABLED` | `1` | Flash attention enabled |
Token data with probability information.
| Field | Type | llama.cpp Field | Description |
|---|---|---|---|
| `id` | `Token` | `id` | Token ID |
| `logit` | `Float` | `logit` | Log probability |
| `p` | `Float` | `p` | Probability |
Maps to: llama_token_data
Array of token data for sampling.
| Field | Type | llama.cpp Field | Description |
|---|---|---|---|
| `data` | `ptr[TokenData]` | `data` | Pointer to token data array |
| `size` | `ArchWord` | `size` | Number of elements |
| `selected` | `Int[64]` | `selected` | Selected token index |
| `sorted` | `Bool` | `sorted` | Whether array is sorted |
Maps to: llama_token_data_array
Chat message structure for chat templates.
| Field | Type | llama.cpp Field | Description |
|---|---|---|---|
| `role` | `CharsPtr` | `role` | Message role ("user", "assistant", "system") |
| `content` | `CharsPtr` | `content` | Message content |
Maps to: llama_chat_message
Performance data for context operations.
| Field | Type | llama.cpp Field | Description |
|---|---|---|---|
| `tStartMs` | `Float[64]` | `t_start_ms` | Start time in milliseconds |
| `tLoadMs` | `Float[64]` | `t_load_ms` | Load time in milliseconds |
| `tPEvalMs` | `Float[64]` | `t_p_eval_ms` | Prompt evaluation time |
| `tEvalMs` | `Float[64]` | `t_eval_ms` | Token evaluation time |
| `nPEval` | `Int` | `n_p_eval` | Number of prompt evaluations |
| `nEval` | `Int` | `n_eval` | Number of token evaluations |
| `nReused` | `Int` | `n_reused` | Number of reused evaluations |
Maps to: llama_perf_context_data
Performance data for sampler operations.
| Field | Type | llama.cpp Field | Description |
|---|---|---|---|
| `tSampleMs` | `Float[64]` | `t_sample_ms` | Sampling time in milliseconds |
| `nSample` | `Int` | `n_sample` | Number of samples |
Maps to: llama_perf_sampler_data
Represents a loaded LLM model. Maps to llama_model.
Model loading parameters. Maps to llama_model_params.
Fields:
- `devices` (`ptr`) - Device list. Maps to `devices`.
- `tensorBuftOverrides` (`ref[array[TensorBuftOverride]]`) - Tensor buffer type overrides. Maps to `tensor_buft_overrides`.
- `nGpuLayers` (`Int`) - Number of layers to offload to GPU. Maps to `n_gpu_layers`.
- `splitMode` (`SplitMode`) - GPU split mode. Maps to `split_mode`.
- `mainGpu` (`Int`) - Main GPU index. Maps to `main_gpu`.
- `tensorSplit` (`ref[array[Float]]`) - Tensor split ratios. Maps to `tensor_split`.
- `progressCallback` (`ProgressCallback`) - Progress callback function. Maps to `progress_callback`.
- `progressCallbackUserData` (`ptr`) - User data for callback. Maps to `progress_callback_user_data`.
- `kvOverrides` (`ptr`) - KV cache overrides. Maps to `kv_overrides`.
- `vocabOnly` (`Bool`) - Load vocabulary only. Maps to `vocab_only`.
- `useMmap` (`Bool`) - Use memory mapping. Maps to `use_mmap`.
- `useDirectIo` (`Bool`) - Use direct I/O. Maps to `use_direct_io`.
- `useMlock` (`Bool`) - Lock memory. Maps to `use_mlock`.
- `checkTensors` (`Bool`) - Validate tensor data. Maps to `check_tensors`.
- `useExtraBufts` (`Bool`) - Use extra buffer types. Maps to `use_extra_bufts`.
- `noHost` (`Bool`) - No host buffer allocation. Maps to `no_host`.
- `noAlloc` (`Bool`) - No memory allocation. Maps to `no_alloc`.
- `Model.getDefaultParams(): Params` - Get default model parameters. Maps to `llama_model_default_params`.
- `Model.load(path: CharsPtr, params: Params): ref[Model]` - Load model from file. Maps to `llama_model_load_from_file`.
- `Model.loadSplits(paths: ref[array[CharsPtr]], count: Word, params: Params): ref[Model]` - Load model from split files. Maps to `llama_model_load_from_splits`.
- `Model.free(model: ref[Model])` - Free model resources. Maps to `llama_model_free`.
- `model.save(path: CharsPtr)` - Save model to file. Maps to `llama_model_save_to_file`.
- `model.hasEncoder: Bool` - Check if model has encoder. Maps to `llama_model_has_encoder`.
- `model.hasDecoder: Bool` - Check if model has decoder. Maps to `llama_model_has_decoder`.
- `model.nCtxTrain: Int` - Training context size. Maps to `llama_model_n_ctx_train`.
- `model.nEmbd: Int` - Embedding dimension. Maps to `llama_model_n_embd`.
- `model.nLayer: Int` - Number of layers. Maps to `llama_model_n_layer`.
- `model.nHead: Int` - Number of attention heads. Maps to `llama_model_n_head`.
- `model.vocab: ref[Vocab]` - Get vocabulary. Maps to `llama_model_get_vocab`.
- `model.metaValStr(key: CharsPtr, buf: CharsPtr, bufSize: ArchWord): Int` - Get metadata value as string. Maps to `llama_model_meta_val_str`.
- `model.metaCount: Int` - Get metadata count. Maps to `llama_model_meta_count`.
- `model.metaKeyByIndex(idx: Int, buf: CharsPtr, bufSize: ArchWord): Int` - Get metadata key by index. Maps to `llama_model_meta_key_by_index`.
Inference context for a model. Maps to llama_context.
Context creation parameters. Maps to llama_context_params.
Fields:
- `nCtx` (`Word`) - Context size (0 = use model default). Maps to `n_ctx`.
- `nBatch` (`Word`) - Logical batch size. Maps to `n_batch`.
- `nUbatch` (`Word`) - Physical batch size. Maps to `n_ubatch`.
- `nSeqMax` (`Word`) - Maximum sequences. Maps to `n_seq_max`.
- `nThreads` (`Int`) - Number of threads for generation. Maps to `n_threads`.
- `nThreadsBatch` (`Int`) - Number of threads for batch processing. Maps to `n_threads_batch`.
- `ropeScalingType` (`RopeScalingType`) - RoPE scaling type. Maps to `rope_scaling_type`.
- `poolingType` (`PoolingType`) - Pooling type. Maps to `pooling_type`.
- `attentionType` (`AttentionType`) - Attention type. Maps to `attention_type`.
- `flashAttnType` (`FlashAttnType`) - Flash attention setting. Maps to `flash_attn`.
- `ropeFreqBase` (`Float`) - RoPE base frequency. Maps to `rope_freq_base`.
- `ropeFreqScale` (`Float`) - RoPE frequency scale. Maps to `rope_freq_scale`.
- `yarnExtFactor` (`Float`) - YaRN extrapolation factor. Maps to `yarn_ext_factor`.
- `yarnAttnFactor` (`Float`) - YaRN attention factor. Maps to `yarn_attn_factor`.
- `yarnBetaFast` (`Float`) - YaRN beta fast. Maps to `yarn_beta_fast`.
- `yarnBetaSlow` (`Float`) - YaRN beta slow. Maps to `yarn_beta_slow`.
- `yarnOrigCtx` (`Word`) - YaRN original context size. Maps to `yarn_orig_ctx`.
- `defragThold` (`Float`) - Defragmentation threshold. Maps to `defrag_thold`.
- `cbEval` (function ptr) - Evaluation callback. Maps to `cb_eval`.
- `cbEvalData` (`ptr`) - Evaluation callback user data. Maps to `cb_eval_user_data`.
- `typeK` (`Ggml.Type`) - K cache data type. Maps to `type_k`.
- `typeV` (`Ggml.Type`) - V cache data type. Maps to `type_v`.
- `abortCb` (function ptr) - Abort callback. Maps to `abort_callback`.
- `abortCbData` (`ptr`) - Abort callback user data. Maps to `abort_callback_data`.
- `embeddings` (`Bool`) - Enable embeddings output. Maps to `embeddings`.
- `offloadKqv` (`Bool`) - Offload KQV to GPU. Maps to `offload_kqv`.
- `noPerf` (`Bool`) - Disable performance counters. Maps to `no_perf`.
- `opOffload` (`Bool`) - Enable operation offloading. Maps to `op_offload`.
- `swaFull` (`Bool`) - Full sliding window attention. Maps to `swa_full`.
- `kvUnified` (`Bool`) - Unified KV cache. Maps to `kv_unified`.
- `samplers` (`ref[array[SamplerSeqConfig]]`) - Per-sequence samplers. Maps to `samplers`.
- `nSamplers` (`ArchWord`) - Number of samplers. Maps to `n_samplers`.
- `Context.getDefaultParams(): Params` - Get default context parameters. Maps to `llama_context_default_params`.
- `Context.initFromModel(model: ref[Model], params: Params): ref[Context]` - Create context from model. Maps to `llama_init_from_model`.
- `Context.free(ctx: ref[Context])` - Free context resources. Maps to `llama_free`.
- `ctx.attachThreadpool(tp: ref[Ggml.Threadpool], tpBatch: ref[Ggml.Threadpool])` - Attach thread pool. Maps to `llama_attach_threadpool`.
- `ctx.detachThreadpool()` - Detach thread pool. Maps to `llama_detach_threadpool`.
- `ctx.getEmbeddingsSeq(seqId: SeqId): ref[array[Float]]` - Get sequence embeddings. Maps to `llama_get_embeddings_seq`.
- `ctx.getModel(): ref[Model]` - Get associated model. Maps to `llama_get_model`.
- `ctx.nCtx: Word` - Get context size. Maps to `llama_n_ctx`.
- `ctx.nBatch: Word` - Get logical batch size. Maps to `llama_n_batch`.
- `ctx.nUbatch: Word` - Get physical batch size. Maps to `llama_n_ubatch`.
- `ctx.nSeqMax: Word` - Get max sequences. Maps to `llama_n_seq_max`.
- `ctx.getMemory(): ptr[Memory]` - Get KV cache memory. Maps to `llama_get_memory`.
- `ctx.poolingType: PoolingType` - Get pooling type. Maps to `llama_pooling_type`.
- `ctx.encode(batch: Batch): Int` - Encode batch (for encoder models). Maps to `llama_encode`.
- `ctx.decode(batch: Batch): Int` - Decode batch. Maps to `llama_decode`.
- `ctx.getLogitsIth(i: Int): ref[array[Float]]` - Get logits for token at index. Maps to `llama_get_logits_ith`.
- `ctx.getEmbeddingsIth(i: Int): ref[array[Float]]` - Get embeddings for token at index. Maps to `llama_get_embeddings_ith`.
- `ctx.stateSize: ArchWord` - Get state size in bytes. Maps to `llama_state_get_size`.
- `ctx.stateGetData(dst: ref[array[Word[8]]], size: ArchWord): ArchWord` - Get state data. Maps to `llama_state_get_data`.
- `ctx.stateSetData(src: ref[array[Word[8]]], size: ArchWord): ArchWord` - Set state data. Maps to `llama_state_set_data`.
- `ctx.stateLoadFile(path: CharsPtr, tokensOut: ref[array[Token]], cap: ArchWord, outCount: ref[ArchWord]): Bool` - Load state from file. Maps to `llama_state_load_file`.
- `ctx.stateSaveFile(path: CharsPtr, tokens: ref[array[Token]], count: ArchWord): Bool` - Save state to file. Maps to `llama_state_save_file`.
- `ctx.setAdapterLora(ad: ref[Adapter], scale: Float): Int` - Apply LoRA adapter. Maps to `llama_set_adapter_lora`.
- `ctx.removeAdapterLora(ad: ref[Adapter]): Int` - Remove LoRA adapter. Maps to `llama_rm_adapter_lora`.
- `ctx.clearAdapterLora()` - Clear all LoRA adapters. Maps to `llama_clear_adapter_lora`.
- `ctx.perfContext: PerfContextData` - Get performance data. Maps to `llama_perf_context`.
- `ctx.perfContextPrint()` - Print performance data. Maps to `llama_perf_context_print`.
- `ctx.perfContextReset()` - Reset performance counters. Maps to `llama_perf_context_reset`.
Token batch for processing. Maps to llama_batch.
Fields:
- `nTokens` (`Int`) - Number of tokens. Maps to `n_tokens`.
- `token` (`ref[array[Token]]`) - Token IDs. Maps to `token`.
- `embd` (`ref[array[Float]]`) - Embeddings (alternative to tokens). Maps to `embd`.
- `pos` (`ref[array[Pos]]`) - Token positions. Maps to `pos`.
- `nSeqId` (`ref[array[Int]]`) - Number of sequence IDs per token. Maps to `n_seq_id`.
- `seqId` (`ref[array[ref[array[SeqId]]]]`) - Sequence IDs. Maps to `seq_id`.
- `output` (`ref[array[Int[8]]]`) - Output flags. Maps to `logits`.
- `Batch.getOne(tokens: ref[array[Token]], nTokens: Int): Batch` - Create batch from token array. Maps to `llama_batch_get_one`.
- `Batch.init(nTokens: Int, embd: Int, nSeqMax: Int): Batch` - Initialize empty batch. Maps to `llama_batch_init`.
Token sampler for generation. Maps to llama_sampler.
Sampler chain parameters. Maps to llama_sampler_chain_params.
Fields:
- `noPerf` (`Bool`) - Disable performance counters. Maps to `no_perf`.
- `Sampler.init(iface: ptr, ctx: ptr): ref[Sampler]` - Initialize custom sampler. Maps to `llama_sampler_init`.
- `Sampler.chainInit(params: ChainParams): ref[Sampler]` - Create sampler chain. Maps to `llama_sampler_chain_init`.
- `Sampler.initGreedy(): ref[Sampler]` - Create greedy sampler. Maps to `llama_sampler_init_greedy`.
- `Sampler.initDist(seed: Word): ref[Sampler]` - Create distribution sampler. Maps to `llama_sampler_init_dist`.
- `Sampler.initTopK(k: Int): ref[Sampler]` - Create top-k sampler. Maps to `llama_sampler_init_top_k`.
- `Sampler.initTopP(p: Float, minKeep: ArchWord): ref[Sampler]` - Create top-p (nucleus) sampler. Maps to `llama_sampler_init_top_p`.
- `Sampler.initPenalties(penaltyLastN: Int, penaltyRepeat: Float, penaltyFreq: Float, penaltyPresent: Float): ref[Sampler]` - Create penalty sampler. Maps to `llama_sampler_init_penalties`.
  - `penaltyLastN`: Last n tokens to penalize (0 = disable, -1 = context size)
  - `penaltyRepeat`: Repeat penalty (1.0 = disabled)
  - `penaltyFreq`: Frequency penalty (0.0 = disabled)
  - `penaltyPresent`: Presence penalty (0.0 = disabled)
- `Sampler.clone(s: ref[Sampler]): ref[Sampler]` - Clone sampler. Maps to `llama_sampler_clone`.
- `Sampler.free(s: ref[Sampler])` - Free sampler resources. Maps to `llama_sampler_free`.
- `sampler.reset()` - Reset sampler state. Maps to `llama_sampler_reset`.
- `sampler.sample(ctx: ref[Context], idx: Int): Token` - Sample next token. Maps to `llama_sampler_sample`.
- `sampler.name: CharsPtr` - Get sampler name. Maps to `llama_sampler_name`.
- `sampler.accept(token: Token)` - Accept token for tracking. Maps to `llama_sampler_accept`.
- `sampler.apply(cand: ref[TokenDataArray])` - Apply sampler to candidates. Maps to `llama_sampler_apply`.
- `sampler.chainAdd(s: ref[Sampler])` - Add sampler to chain. Maps to `llama_sampler_chain_add`.
- `sampler.chainGet(idx: Int): ref[Sampler]` - Get sampler from chain. Maps to `llama_sampler_chain_get`.
- `sampler.chainCount: Int` - Get chain length. Maps to `llama_sampler_chain_n`.
- `sampler.chainRemove(idx: Int): ref[Sampler]` - Remove sampler from chain. Maps to `llama_sampler_chain_remove`.
- `sampler.perfSampler: PerfSamplerData` - Get performance data. Maps to `llama_perf_sampler`.
- `sampler.perfSamplerPrint()` - Print performance data. Maps to `llama_perf_sampler_print`.
- `sampler.perfSamplerReset()` - Reset performance counters. Maps to `llama_perf_sampler_reset`.
Model vocabulary. Maps to llama_vocab.
- `vocab.tokenToPiece(token: Token, buf: CharsPtr, length: Int, lstrip: Int, special: Bool): Int` - Convert token to text. Maps to `llama_token_to_piece`.
- `vocab.eos: Token` - Get end-of-sequence token. Maps to `llama_vocab_eos`.
- `vocab.isEog(token: Token): Bool` - Check if token is end-of-generation. Maps to `llama_vocab_is_eog`.
KV cache memory management. Maps to llama_memory.
- `memory.clear(data: Bool)` - Clear KV cache. Maps to `llama_memory_clear`.
- `memory.seqRemove(seq: SeqId, p0: Pos, p1: Pos): Bool` - Remove sequence range. Maps to `llama_memory_seq_rm`.
- `memory.seqCopy(src: SeqId, dst: SeqId, p0: Pos, p1: Pos)` - Copy sequence range. Maps to `llama_memory_seq_cp`.
- `memory.seqKeep(seq: SeqId)` - Keep only specified sequence. Maps to `llama_memory_seq_keep`.
- `memory.seqAdd(seq: SeqId, p0: Pos, p1: Pos, delta: Pos)` - Add position delta to sequence. Maps to `llama_memory_seq_add`.
- `memory.seqDiv(seq: SeqId, p0: Pos, p1: Pos, d: Int)` - Divide positions in sequence. Maps to `llama_memory_seq_div`.
LoRA adapter support. Maps to llama_adapter_lora.
- `Adapter.loraInit(model: ref[Model], path: CharsPtr): ref[Adapter]` - Load LoRA adapter. Maps to `llama_adapter_lora_init`.
- `Adapter.loraFree(ad: ref[Adapter])` - Free LoRA adapter. Maps to `llama_adapter_lora_free`.
- `backendInit()` - Initialize llama backend. Maps to `llama_backend_init`.
- `backendFree()` - Free llama backend. Maps to `llama_backend_free`.
- `numaInit(strategy: Ggml.NumaStrategy)` - Initialize NUMA. Maps to `llama_numa_init`.
- `timeUs(): Int[64]` - Get current time in microseconds. Maps to `llama_time_us`.
- `maxDevices(): ArchWord` - Get maximum number of devices. Maps to `llama_max_devices`.
- `maxParallelSequences(): ArchWord` - Get max parallel sequences. Maps to `llama_max_parallel_sequences`.
- `supportsMmap(): Bool` - Check mmap support. Maps to `llama_supports_mmap`.
- `supportsMlock(): Bool` - Check mlock support. Maps to `llama_supports_mlock`.
- `supportsGpuOffload(): Bool` - Check GPU offload support. Maps to `llama_supports_gpu_offload`.
- `supportsRpc(): Bool` - Check RPC support. Maps to `llama_supports_rpc`.
- `printSystemInfo(): CharsPtr` - Get system info string. Maps to `llama_print_system_info`.
- `tokenize(vocab: ref[Vocab], text: CharsPtr, textLen: Int, tokens: ref[array[Token]], nTokensMax: Int, addSpecial: Bool, parseSpecial: Bool): Int` - Convert text to tokens. Returns the number of tokens written. Maps to `llama_tokenize`.
- `detokenize(vocab: ref[Vocab], tokens: ref[array[Token]], nTokens: Int, text: CharsPtr, textLenMax: Int, removeSpecial: Bool, unparseSpecial: Bool): Int` - Convert tokens to text. Returns the number of characters written. Maps to `llama_detokenize`.
- `chatApplyTemplate(tmpl: CharsPtr, chat: ref[array[ChatMessage]], nMsg: ArchWord, addAssistant: Bool, buf: CharsPtr, bufLen: Int): Int` - Apply chat template to messages. Pass `0` for `tmpl` to use the model's default template. Returns the number of characters written. Maps to `llama_chat_apply_template`.
- `chatBuiltinTemplates(out: ref[CharsPtr], len: ArchWord): Int` - Get built-in template names. Maps to `llama_chat_builtin_templates`.
- `logSet(cb: ptr[function (level: Int, text: CharsPtr, userData: ptr)], userData: ptr)` - Set logging callback. Maps to `llama_log_set`.
See Examples/completion.alusus for a complete text completion example.
See Examples/chat.alusus for a multi-turn chat example with chat templates.
import "Apm";
Apm.importFile("Alusus/Llama");
use Llama;
// 1. Initialize backend (CPU only in this example)
Ggml.Backend.cpuLoad();
// 2. Load model; nGpuLayers = 0 keeps every layer on the CPU.
def modelParams: Model.Params = Model.getDefaultParams();
modelParams.nGpuLayers = 0; // CPU only
def model: ref[Model](Model.load("model.gguf", modelParams));
// 3. Create context with a 2048-token window.
def ctxParams: Context.Params = Context.getDefaultParams();
ctxParams.nCtx = 2048;
def ctx: ref[Context](Context.initFromModel(model, ctxParams));
// 4. Create sampler chain: repeat penalty -> top-k -> top-p -> seeded distribution.
def chainParams: Sampler.ChainParams;
chainParams.noPerf = true;
def sampler: ref[Sampler](Sampler.chainInit(chainParams));
sampler.chainAdd(Sampler.initPenalties(64, 1.1, 0.0, 0.0));
sampler.chainAdd(Sampler.initTopK(40));
sampler.chainAdd(Sampler.initTopP(0.9, 1));
sampler.chainAdd(Sampler.initDist(0));
// 5. Tokenize and decode prompt ("Hello world" has length 11).
def tokens: array[Token, 512];
def nTokens: Int = tokenize(model.vocab, "Hello world", 11, tokens, 512, true, true);
def batch: Batch = Batch.getOne(tokens, nTokens);
ctx.decode(batch);
// 6. Generate tokens until an end-of-generation token is sampled.
while true {
def id: Token = sampler.sample(ctx, -1);
if model.vocab.isEog(id) break;
// Convert token to text and print.
// NOTE(review): the trailing 0, 0 fill the removeSpecial/unparseSpecial Bool
// parameters (false) — confirm Int-to-Bool coercion is intended here.
def buf: array[Char, 64];
detokenize(model.vocab, id~ptr~cast[ref[array[Token]]], 1, buf~ptr, 64, 0, 0);
// ... print buf
// Feed the sampled token back as a one-token batch and decode it.
def nextBatch: Batch = Batch.getOne(id~ptr~cast[ref[array[Token]]], 1);
ctx.decode(nextBatch);
}
// 7. Cleanup — free in reverse order of creation.
Sampler.free(sampler);
Context.free(ctx);
Model.free(model);
Copyright (c) 2023-2024 The ggml authors Copyright (c) 2026 Alusus Software Ltd. for the Alusus language bindings.
This library follows the same license as llama.cpp (MIT License).