-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathsam3.cpp
More file actions
14439 lines (12574 loc) · 611 KB
/
sam3.cpp
File metadata and controls
14439 lines (12574 loc) · 611 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#define _USE_MATH_DEFINES
#include "sam3.h"
/* ggml */
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
/* stb (implementation compiled here -- order is pinned) */
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#ifdef _WIN32
#include <direct.h>
#include <io.h>
#define popen _popen
#define pclose _pclose
#define mkdir(path, mode) _mkdir(path)
#define SAM3_NULL_DEV "NUL"
#define SAM3_POPEN_READ "rb"
#else
#include <sys/stat.h>
#define SAM3_NULL_DEV "/dev/null"
#define SAM3_POPEN_READ "r"
#endif
/* C++ standard library */
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
#include "stb_image_write.h"
/* Logging: 0=silent, 1=summary timing, 2=verbose progress. Override with
-DSAM3_LOG_LEVEL=0 at build time for zero-overhead silent builds. */
#ifndef SAM3_LOG_LEVEL
#define SAM3_LOG_LEVEL 1
#endif
#define SAM3_LOG(level, ...) \
do { if ((level) <= SAM3_LOG_LEVEL) fprintf(stderr, __VA_ARGS__); } while (0)
/*****************************************************************************
** Constants
*****************************************************************************/
// File-format magics: the ASCII tag packed into a uint32_t, high byte first.
// 0x73616D33 = 's','a','m','3'; 0x73616D32 = 's','a','m','2';
// 0x746F6B00 = 't','o','k','\0'.
static constexpr uint32_t SAM3_MAGIC = 0x73616D33; // "sam3"
static constexpr uint32_t SAM2_MAGIC = 0x73616D32; // "sam2"
static constexpr uint32_t SAM3_TOK_MAGIC = 0x746F6B00; // "tok\0"
// Container versions accepted by the loader for each model family.
static constexpr int SAM3_FILE_VERSION = 3;
static constexpr int SAM2_VERSION = 1;
/*****************************************************************************
** Internal Data Types -- Hyperparameters
*****************************************************************************/
// All model hyperparameters for the three supported families (SAM3, SAM2,
// EdgeTAM). Defaults correspond to the reference SAM3 checkpoint; the loader
// overwrites fields per model file. Values kept as int32_t (x100 fixed-point
// for the two float-valued ones) so they can be serialized verbatim.
struct sam3_hparams {
    // ── Model type ───────────────────────────────────────────────────────
    sam3_model_type model_type = SAM3_MODEL_SAM3;
    // ── SAM3 fields (unchanged) ─────────────────────────────────────────
    int32_t img_size = 1008;
    int32_t patch_size = 14;
    int32_t vit_embed_dim = 1024;
    int32_t vit_depth = 32;
    int32_t vit_num_heads = 16;
    int32_t vit_mlp_dim = 4736; // 1024 * 4.625
    int32_t vit_window_size = 24;
    int32_t n_global_attn = 4;
    int32_t global_attn_idx[4] = {7, 15, 23, 31};
    int32_t text_width = 1024;
    int32_t text_heads = 16;
    int32_t text_layers = 24;
    int32_t text_ctx_len = 32;
    int32_t text_vocab_size = 49408;
    int32_t text_out_dim = 256;
    int32_t neck_dim = 256;
    int32_t fenc_layers = 6;
    int32_t fenc_heads = 8;
    int32_t fenc_ffn_dim = 2048;
    int32_t ddec_layers = 6;
    int32_t ddec_heads = 8;
    int32_t ddec_ffn_dim = 2048;
    int32_t ddec_num_queries = 200;
    int32_t geom_layers = 3;
    int32_t n_presence_tokens = 1;
    int32_t n_geom_queries = 4;
    int32_t sam_embed_dim = 256;
    int32_t sam_dec_depth = 2;
    int32_t sam_n_multimask = 3;
    int32_t sam_iou_head_depth = 3;
    int32_t mem_out_dim = 64;
    int32_t mem_attn_layers = 4;
    int32_t num_maskmem = 7;
    int32_t max_obj_ptrs = 16;
    int32_t n_amb_experts = 2;
    int32_t visual_only = 0; // 1 = no text encoder / detector path
    // ── SAM2-specific Hiera backbone fields ─────────────────────────────
    int32_t hiera_embed_dim = 144;
    int32_t hiera_num_heads = 2;
    int32_t hiera_num_stages = 4;
    int32_t hiera_stages[4] = {2, 6, 36, 4};
    int32_t hiera_q_pool = 3;
    int32_t hiera_window_spec[4] = {8, 4, 16, 8};
    int32_t hiera_global_n = 3;
    int32_t hiera_global_idx[8] = {23, 33, 43, 0, 0, 0, 0, 0};
    int32_t hiera_pos_embed_bkg_h = 7;
    int32_t hiera_pos_embed_bkg_w = 7;
    int32_t fpn_top_down_n = 2;
    int32_t fpn_top_down_levels[4] = {2, 3, 0, 0};
    int32_t scalp = 1;
    // ── SAM2-specific memory/tracking flags ─────────────────────────────
    int32_t sigmoid_scale_x100 = 2000;   // 20.00 after /100 (see sigmoid_scale())
    int32_t sigmoid_bias_x100 = -1000;   // -10.00 after /100 (see sigmoid_bias())
    int32_t use_high_res_features = 1;
    int32_t use_obj_ptrs_in_encoder = 1;
    int32_t pred_obj_scores = 1;
    int32_t use_multimask_token_for_obj_ptr = 1;
    int32_t directly_add_no_mem_embed = 1;
    int32_t non_overlap_masks_for_mem_enc = 1;
    int32_t binarize_mask_from_pts = 0;
    int32_t multimask_output_for_tracking = 1;
    int32_t multimask_min_pt_num = 0;
    int32_t multimask_max_pt_num = 1;
    int32_t fixed_no_obj_ptr = 1;
    int32_t iou_prediction_use_sigmoid = 1;
    int32_t use_mask_input_as_output = 1;
    int32_t multimask_output_in_sam = 1;
    int32_t is_sam2_1 = 1; // 0 = SAM2.0, 1 = SAM2.1
    // ── EdgeTAM-specific fields ─────────────────────────────────────
    int32_t backbone_type = 1; // 1=hiera, 2=repvit
    int32_t repvit_num_stages = 4;
    int32_t repvit_stages[4] = {2, 2, 14, 2};
    int32_t repvit_channels[4] = {48, 96, 192, 384};
    int32_t repvit_se_ratio_x100 = 25;
    int32_t has_perceiver = 0;
    int32_t perceiver_depth = 0;
    int32_t perceiver_dim = 0;
    int32_t perceiver_n_latents_1d = 0;
    int32_t perceiver_n_latents_2d = 0;
    int32_t perceiver_ff_mult = 0;
    int32_t mem_attn_ca_type = 0; // 0=RoPEv1, 1=RoPEv2
    int32_t mem_attn_ca_q_size = 32;
    int32_t mem_attn_ca_k_size = 32;
    // ── SAM3 derived helpers ────────────────────────────────────────────
    // Side length of the ViT token grid, total token count, per-head dim.
    int32_t n_img_embd() const { return img_size / patch_size; } // 72
    int32_t n_img_tokens() const { return n_img_embd() * n_img_embd(); } // 5184
    int32_t vit_head_dim() const { return vit_embed_dim / vit_num_heads; } // 64
    // True when `layer` is one of the global-attention ViT layers.
    bool is_global_attn(int layer) const {
        for (int i = 0; i < n_global_attn; ++i) {
            if (global_attn_idx[i] == layer) return true;
        }
        return false;
    }
    // ── EdgeTAM derived helpers ────────────────────────────────────────
    bool is_edgetam() const { return model_type == SAM3_MODEL_EDGETAM; }
    int32_t edgetam_feat_size() const { return img_size / 16; } // 1024/16 = 64
    // ── SAM2 derived helpers ────────────────────────────────────────────
    bool is_sam2() const { return model_type == SAM3_MODEL_SAM2; }
    int32_t hiera_total_blocks() const {
        int s = 0;
        for (int i = 0; i < hiera_num_stages; ++i) s += hiera_stages[i];
        return s;
    }
    // Output spatial size of the Hiera backbone: patch stem is /4, each
    // q-pool halves, and each `scalp` level doubles back.
    // Uses std::min like sam3_effective_feat_size() so the two stay in sync
    // (the previous `(a < b) ? a : b - 1` ternary is equivalent for ints).
    int32_t hiera_feat_size() const {
        int s = img_size / 4;
        int n_pools = std::min(hiera_q_pool, hiera_num_stages - 1);
        for (int i = 0; i < n_pools; ++i) s /= 2;
        for (int i = 0; i < scalp; ++i) s *= 2;
        return s;
    }
    // Channel width / head count double at every stage transition.
    int32_t hiera_stage_dim(int stage) const {
        int d = hiera_embed_dim;
        for (int i = 0; i < stage; ++i) d *= 2;
        return d;
    }
    int32_t hiera_stage_heads(int stage) const {
        int h = hiera_num_heads;
        for (int i = 0; i < stage; ++i) h *= 2;
        return h;
    }
    // Spatial side length entering `stage` (halved once per pooled stage).
    int32_t hiera_stage_spatial(int stage) const {
        int s = img_size / 4;
        for (int i = 1; i <= stage && i <= hiera_q_pool; ++i) s /= 2;
        return s;
    }
    // Fixed-point x100 fields decoded to float.
    float sigmoid_scale() const { return sigmoid_scale_x100 / 100.0f; }
    float sigmoid_bias() const { return sigmoid_bias_x100 / 100.0f; }
    // Feature size for the active backbone
    int32_t feat_size() const {
        if (is_edgetam()) return edgetam_feat_size();
        if (is_sam2()) return hiera_feat_size();
        return n_img_embd();
    }
    // True when `block_idx` is a global-attention Hiera block.
    bool is_hiera_global_attn(int block_idx) const {
        for (int i = 0; i < hiera_global_n; ++i) {
            if (hiera_global_idx[i] == block_idx) return true;
        }
        return false;
    }
};
// Backbone output grid size for an arbitrary input resolution `img_size`
// (mirrors sam3_hparams::feat_size(), which is fixed to hp.img_size).
static int sam3_effective_feat_size(const sam3_hparams& hp, int img_size) {
    // EdgeTAM's RepViT backbone has a fixed total stride of 16.
    if (hp.is_edgetam()) {
        return img_size / 16;
    }
    // SAM2 Hiera: /4 stem, one halving per pooled stage, one doubling per
    // scalp level. Shifts are exact halving/doubling for positive sizes.
    if (hp.is_sam2()) {
        const int pooled_stages = std::min(hp.hiera_q_pool, hp.hiera_num_stages - 1);
        int side = img_size / 4;
        side >>= pooled_stages;
        side <<= hp.scalp;
        return side;
    }
    // SAM3 ViT: one token per patch.
    return img_size / hp.patch_size;
}
/*****************************************************************************
** Internal Data Types -- Layer Weight Structs
**
** These structs only hold non-owning ggml_tensor pointers; the tensors live
** in the model's ggml context and are filled in by the loader by name.
*****************************************************************************/
/*
** ── ViT Backbone ─────────────────────────────────────────────────────────
*/
// Weights for one ViT block: two LayerNorms, fused-QKV attention with output
// projection, a two-layer MLP, and per-block RoPE frequencies.
struct sam3_vit_block {
struct ggml_tensor* norm1_w = nullptr;
struct ggml_tensor* norm1_b = nullptr;
struct ggml_tensor* qkv_w = nullptr;
struct ggml_tensor* qkv_b = nullptr;
struct ggml_tensor* proj_w = nullptr;
struct ggml_tensor* proj_b = nullptr;
struct ggml_tensor* norm2_w = nullptr;
struct ggml_tensor* norm2_b = nullptr;
struct ggml_tensor* mlp_fc1_w = nullptr;
struct ggml_tensor* mlp_fc1_b = nullptr;
struct ggml_tensor* mlp_fc2_w = nullptr;
struct ggml_tensor* mlp_fc2_b = nullptr;
struct ggml_tensor* freqs_cis = nullptr; // [N, 32, 2] RoPE
};
// ViT image encoder: patch embedding, learned positional embedding,
// pre-LayerNorm, and the stack of transformer blocks.
struct sam3_vit {
struct ggml_tensor* patch_embed_w = nullptr; // [patch, patch, 3, embed]
struct ggml_tensor* pos_embed = nullptr; // [embed, 24, 24, 1]
struct ggml_tensor* ln_pre_w = nullptr;
struct ggml_tensor* ln_pre_b = nullptr;
std::vector<sam3_vit_block> blocks;
};
/*
** ── Neck (SimpleFPN) ─────────────────────────────────────────────────────
*/
// Per-scale neck weights: optional deconvolutions for upscaled levels
// followed by a 1x1 and a 3x3 convolution.
struct sam3_neck_scale {
struct ggml_tensor* deconv1_w = nullptr;
struct ggml_tensor* deconv1_b = nullptr;
struct ggml_tensor* deconv2_w = nullptr; // only for 4x scale
struct ggml_tensor* deconv2_b = nullptr;
struct ggml_tensor* conv1x1_w = nullptr;
struct ggml_tensor* conv1x1_b = nullptr;
struct ggml_tensor* conv3x3_w = nullptr;
struct ggml_tensor* conv3x3_b = nullptr;
};
// Four scale branches plus one normalization pair per output level.
struct sam3_neck {
sam3_neck_scale scales[4];
struct ggml_tensor* norms_w[4] = {};
struct ggml_tensor* norms_b[4] = {};
};
/*
** ── Text Encoder ─────────────────────────────────────────────────────────
*/
// One CLIP-style text transformer block: fused in-projection attention,
// two LayerNorms, MLP, and optional LayerScale tensors.
struct sam3_text_block {
struct ggml_tensor* attn_in_proj_w = nullptr;
struct ggml_tensor* attn_in_proj_b = nullptr;
struct ggml_tensor* attn_out_proj_w = nullptr;
struct ggml_tensor* attn_out_proj_b = nullptr;
struct ggml_tensor* ln1_w = nullptr;
struct ggml_tensor* ln1_b = nullptr;
struct ggml_tensor* ln2_w = nullptr;
struct ggml_tensor* ln2_b = nullptr;
struct ggml_tensor* mlp_fc1_w = nullptr;
struct ggml_tensor* mlp_fc1_b = nullptr;
struct ggml_tensor* mlp_fc2_w = nullptr;
struct ggml_tensor* mlp_fc2_b = nullptr;
struct ggml_tensor* ls1 = nullptr; // LayerScale
struct ggml_tensor* ls2 = nullptr;
};
// Token/positional embeddings, final LayerNorm, and the `resizer` linear
// that maps the text width down to the fusion dimension.
struct sam3_text_encoder {
struct ggml_tensor* token_embed_w = nullptr; // [vocab, width]
struct ggml_tensor* pos_embed = nullptr; // [ctx_len, width]
struct ggml_tensor* ln_final_w = nullptr;
struct ggml_tensor* ln_final_b = nullptr;
struct ggml_tensor* resizer_w = nullptr; // [out_dim, width]
struct ggml_tensor* resizer_b = nullptr;
// Note: text_projection ([width, proj_dim]) exists in the checkpoint but is
// intentionally not loaded. In SAM3, VETextEncoder discards the pooled output
// that text_projection operates on — only the full token sequence (through
// resizer) is used for downstream fusion/decoding.
std::vector<sam3_text_block> blocks;
};
/*
** ── Fusion Encoder ───────────────────────────────────────────────────────
*/
// One fusion-encoder layer: image-token self-attention, cross-attention
// from image tokens to prompt tokens, and an FFN — each with its own norm.
struct sam3_fenc_layer {
// self-attention
struct ggml_tensor* sa_in_proj_w = nullptr;
struct ggml_tensor* sa_in_proj_b = nullptr;
struct ggml_tensor* sa_out_proj_w = nullptr;
struct ggml_tensor* sa_out_proj_b = nullptr;
struct ggml_tensor* norm1_w = nullptr;
struct ggml_tensor* norm1_b = nullptr;
// cross-attention to prompt tokens
struct ggml_tensor* ca_q_w = nullptr;
struct ggml_tensor* ca_q_b = nullptr;
struct ggml_tensor* ca_kv_w = nullptr;
struct ggml_tensor* ca_kv_b = nullptr;
struct ggml_tensor* ca_out_w = nullptr;
struct ggml_tensor* ca_out_b = nullptr;
struct ggml_tensor* norm2_w = nullptr;
struct ggml_tensor* norm2_b = nullptr;
// FFN
struct ggml_tensor* ffn_fc1_w = nullptr;
struct ggml_tensor* ffn_fc1_b = nullptr;
struct ggml_tensor* ffn_fc2_w = nullptr;
struct ggml_tensor* ffn_fc2_b = nullptr;
struct ggml_tensor* norm3_w = nullptr;
struct ggml_tensor* norm3_b = nullptr;
};
// The fusion encoder is just a stack of sam3_fenc_layer.
struct sam3_fusion_encoder {
std::vector<sam3_fenc_layer> layers;
};
/*
** ── DETR Decoder ─────────────────────────────────────────────────────────
*/
// One DETR decoder layer: query self-attention, cross-attention to image
// features, cross-attention to text tokens, FFN, and a 3-layer box-refine MLP.
struct sam3_ddec_layer {
// self-attention
struct ggml_tensor* sa_in_proj_w = nullptr;
struct ggml_tensor* sa_in_proj_b = nullptr;
struct ggml_tensor* sa_out_proj_w = nullptr;
struct ggml_tensor* sa_out_proj_b = nullptr;
struct ggml_tensor* norm1_w = nullptr;
struct ggml_tensor* norm1_b = nullptr;
// cross-attention to image
struct ggml_tensor* ca_q_w = nullptr;
struct ggml_tensor* ca_q_b = nullptr;
struct ggml_tensor* ca_kv_w = nullptr;
struct ggml_tensor* ca_kv_b = nullptr;
struct ggml_tensor* ca_out_w = nullptr;
struct ggml_tensor* ca_out_b = nullptr;
struct ggml_tensor* norm2_w = nullptr;
struct ggml_tensor* norm2_b = nullptr;
// cross-attention to text
struct ggml_tensor* ca_text_q_w = nullptr;
struct ggml_tensor* ca_text_q_b = nullptr;
struct ggml_tensor* ca_text_kv_w = nullptr;
struct ggml_tensor* ca_text_kv_b = nullptr;
struct ggml_tensor* ca_text_out_w = nullptr;
struct ggml_tensor* ca_text_out_b = nullptr;
struct ggml_tensor* norm3_w = nullptr;
struct ggml_tensor* norm3_b = nullptr;
// FFN
struct ggml_tensor* ffn_fc1_w = nullptr;
struct ggml_tensor* ffn_fc1_b = nullptr;
struct ggml_tensor* ffn_fc2_w = nullptr;
struct ggml_tensor* ffn_fc2_b = nullptr;
struct ggml_tensor* norm4_w = nullptr;
struct ggml_tensor* norm4_b = nullptr;
// box refinement MLP (3 layers)
struct ggml_tensor* bbox_w[3] = {};
struct ggml_tensor* bbox_b[3] = {};
};
// Decoder-level weights: learned query/presence embeddings, the scoring MLP,
// the presence head, and the stack of decoder layers.
struct sam3_detr_decoder {
struct ggml_tensor* query_embed = nullptr; // [num_queries, 512]
struct ggml_tensor* presence_token = nullptr; // [1, 256]
// DotProductScoring MLP
struct ggml_tensor* score_mlp_w[2] = {};
struct ggml_tensor* score_mlp_b[2] = {};
struct ggml_tensor* score_ln_w = nullptr;
struct ggml_tensor* score_ln_b = nullptr;
// Presence head
struct ggml_tensor* presence_head_w[2] = {};
struct ggml_tensor* presence_head_b[2] = {};
std::vector<sam3_ddec_layer> layers;
};
/*
** ── Geometry / Exemplar Encoder ──────────────────────────────────────────
*/
// One geometry-encoder transformer layer (self-attn, cross-attn, FFN),
// same layout as sam3_fenc_layer.
struct sam3_geom_layer {
struct ggml_tensor* sa_in_proj_w = nullptr;
struct ggml_tensor* sa_in_proj_b = nullptr;
struct ggml_tensor* sa_out_proj_w = nullptr;
struct ggml_tensor* sa_out_proj_b = nullptr;
struct ggml_tensor* norm1_w = nullptr;
struct ggml_tensor* norm1_b = nullptr;
struct ggml_tensor* ca_q_w = nullptr;
struct ggml_tensor* ca_q_b = nullptr;
struct ggml_tensor* ca_kv_w = nullptr;
struct ggml_tensor* ca_kv_b = nullptr;
struct ggml_tensor* ca_out_w = nullptr;
struct ggml_tensor* ca_out_b = nullptr;
struct ggml_tensor* norm2_w = nullptr;
struct ggml_tensor* norm2_b = nullptr;
struct ggml_tensor* ffn_fc1_w = nullptr;
struct ggml_tensor* ffn_fc1_b = nullptr;
struct ggml_tensor* ffn_fc2_w = nullptr;
struct ggml_tensor* ffn_fc2_b = nullptr;
struct ggml_tensor* norm3_w = nullptr;
struct ggml_tensor* norm3_b = nullptr;
};
// Encodes point/box geometric prompts into tokens: coordinate projections,
// pooled-feature projections, positional projections, type/CLS embeddings,
// and a small transformer stack with pre/post norms.
struct sam3_geom_encoder {
// Direct projections
struct ggml_tensor* point_proj_w = nullptr; // Linear(2, D)
struct ggml_tensor* point_proj_b = nullptr;
struct ggml_tensor* box_proj_w = nullptr; // Linear(4, D)
struct ggml_tensor* box_proj_b = nullptr;
// Pooling projections
struct ggml_tensor* point_pool_proj_w = nullptr; // Linear(D, D)
struct ggml_tensor* point_pool_proj_b = nullptr;
struct ggml_tensor* box_pool_proj_w = nullptr; // Conv2d(D, D, 7)
struct ggml_tensor* box_pool_proj_b = nullptr;
// Positional encoding projections
struct ggml_tensor* point_pos_proj_w = nullptr; // Linear(D, D)
struct ggml_tensor* point_pos_proj_b = nullptr;
struct ggml_tensor* box_pos_proj_w = nullptr; // Linear(258, 256)
struct ggml_tensor* box_pos_proj_b = nullptr;
// Label and CLS embeddings
struct ggml_tensor* type_embed = nullptr; // Embedding(2, D)
struct ggml_tensor* cls_token = nullptr; // Embedding(1, D)
// Final projection + norms
struct ggml_tensor* post_proj_w = nullptr; // Linear(D, D)
struct ggml_tensor* post_proj_b = nullptr;
struct ggml_tensor* norm_w = nullptr; // LayerNorm final_proj
struct ggml_tensor* norm_b = nullptr;
struct ggml_tensor* encode_norm_w = nullptr; // LayerNorm after xfmr
struct ggml_tensor* encode_norm_b = nullptr;
struct ggml_tensor* img_pre_norm_w = nullptr; // LayerNorm before pool
struct ggml_tensor* img_pre_norm_b = nullptr;
std::vector<sam3_geom_layer> layers;
};
/*
** ── Segmentation Head (MaskFormer) ───────────────────────────────────────
*/
// Mask head weights: three upsampling conv stages (each with a norm pair),
// a cross-attention over prompt tokens, and the final mask-embedding linear.
struct sam3_seg_head {
struct ggml_tensor* up_conv_w[3] = {};
struct ggml_tensor* up_conv_b[3] = {};
struct ggml_tensor* up_norm_w[3] = {};
struct ggml_tensor* up_norm_b[3] = {};
struct ggml_tensor* ca_prompt_q_w = nullptr;
struct ggml_tensor* ca_prompt_q_b = nullptr;
struct ggml_tensor* ca_prompt_kv_w = nullptr;
struct ggml_tensor* ca_prompt_kv_b = nullptr;
struct ggml_tensor* ca_prompt_out_w = nullptr;
struct ggml_tensor* ca_prompt_out_b = nullptr;
struct ggml_tensor* mask_embed_w = nullptr;
struct ggml_tensor* mask_embed_b = nullptr;
};
/*
** ── SAM Prompt Encoder (Tracker Path) ────────────────────────────────────
*/
// Prompt-encoder weights: Gaussian PE matrix, four point-type embeddings
// (indexed: neg, pos, box_tl, box_br), sentinel embeddings, and the mask
// downscaling conv stack.
struct sam3_sam_prompt_enc {
struct ggml_tensor* pe_gaussian = nullptr; // [2, 128]
struct ggml_tensor* point_embed[4] = {}; // neg, pos, box_tl, box_br
struct ggml_tensor* not_a_point_embed = nullptr; // [256]
struct ggml_tensor* no_mask_embed = nullptr; // [256]
struct ggml_tensor* mask_ds_conv_w[3] = {};
struct ggml_tensor* mask_ds_conv_b[3] = {};
struct ggml_tensor* mask_ds_norm_w[2] = {};
struct ggml_tensor* mask_ds_norm_b[2] = {};
};
/*
** ── SAM Mask Decoder (Tracker Path) ──────────────────────────────────────
*/
// Generic attention weight bundle with separate Q/K/V/out projections.
struct sam3_sam_attn {
struct ggml_tensor* q_w = nullptr;
struct ggml_tensor* q_b = nullptr;
struct ggml_tensor* k_w = nullptr;
struct ggml_tensor* k_b = nullptr;
struct ggml_tensor* v_w = nullptr;
struct ggml_tensor* v_b = nullptr;
struct ggml_tensor* out_w = nullptr;
struct ggml_tensor* out_b = nullptr;
};
// One two-way transformer block: token self-attention, token→image and
// image→token cross-attention, four norms, and an MLP.
struct sam3_twoway_block {
sam3_sam_attn self_attn;
sam3_sam_attn ca_tok2img;
sam3_sam_attn ca_img2tok;
struct ggml_tensor* norm1_w = nullptr;
struct ggml_tensor* norm1_b = nullptr;
struct ggml_tensor* norm2_w = nullptr;
struct ggml_tensor* norm2_b = nullptr;
struct ggml_tensor* norm3_w = nullptr;
struct ggml_tensor* norm3_b = nullptr;
struct ggml_tensor* norm4_w = nullptr;
struct ggml_tensor* norm4_b = nullptr;
struct ggml_tensor* mlp_fc1_w = nullptr;
struct ggml_tensor* mlp_fc1_b = nullptr;
struct ggml_tensor* mlp_fc2_w = nullptr;
struct ggml_tensor* mlp_fc2_b = nullptr;
};
// SAM mask decoder: learned tokens, the two-way transformer, a final
// attention + norm, mask upscaling convs, high-res feature convs, per-mask
// hypernetwork MLPs, and the IoU / object-score heads.
struct sam3_sam_mask_dec {
struct ggml_tensor* iou_token = nullptr; // [1, 256]
struct ggml_tensor* mask_tokens = nullptr; // [4, 256]
struct ggml_tensor* obj_score_token = nullptr; // [1, 256]
std::vector<sam3_twoway_block> twoway_blocks; // [2]
sam3_sam_attn final_attn;
struct ggml_tensor* final_norm_w = nullptr;
struct ggml_tensor* final_norm_b = nullptr;
// upscaling
struct ggml_tensor* up1_w = nullptr;
struct ggml_tensor* up1_b = nullptr;
struct ggml_tensor* up1_norm_w = nullptr;
struct ggml_tensor* up1_norm_b = nullptr;
struct ggml_tensor* up2_w = nullptr;
struct ggml_tensor* up2_b = nullptr;
// high-res feature convolutions
struct ggml_tensor* conv_s0_w = nullptr;
struct ggml_tensor* conv_s0_b = nullptr;
struct ggml_tensor* conv_s1_w = nullptr;
struct ggml_tensor* conv_s1_b = nullptr;
// hypernetwork MLPs: 4 masks x 3 layers
struct ggml_tensor* hyper_w[4][3] = {};
struct ggml_tensor* hyper_b[4][3] = {};
// IoU prediction head (3 layers)
struct ggml_tensor* iou_head_w[3] = {};
struct ggml_tensor* iou_head_b[3] = {};
// object score head (3 layers)
struct ggml_tensor* obj_head_w[3] = {};
struct ggml_tensor* obj_head_b[3] = {};
};
/*
** ── Memory Encoder ───────────────────────────────────────────────────────
*/
// Memory encoder weights: a mask downsampler, a pixel-feature projection,
// a two-layer CXBlock fuser, an output projection, and one temporal
// positional encoding per memory slot (7, matching num_maskmem).
struct sam3_mem_enc {
// mask downsampler (4 conv stages + final 1x1)
struct ggml_tensor* ds_conv_w[5] = {};
struct ggml_tensor* ds_conv_b[5] = {};
struct ggml_tensor* ds_norm_w[4] = {};
struct ggml_tensor* ds_norm_b[4] = {};
// pixel feature projection
struct ggml_tensor* pix_proj_w = nullptr;
struct ggml_tensor* pix_proj_b = nullptr;
// fuser (2 CXBlock layers)
struct ggml_tensor* fuser_dw_w[2] = {};
struct ggml_tensor* fuser_dw_b[2] = {};
struct ggml_tensor* fuser_norm_w[2] = {};
struct ggml_tensor* fuser_norm_b[2] = {};
struct ggml_tensor* fuser_fc1_w[2] = {};
struct ggml_tensor* fuser_fc1_b[2] = {};
struct ggml_tensor* fuser_fc2_w[2] = {};
struct ggml_tensor* fuser_fc2_b[2] = {};
struct ggml_tensor* fuser_gamma[2] = {};
// output projection
struct ggml_tensor* out_proj_w = nullptr;
struct ggml_tensor* out_proj_b = nullptr;
// temporal pos encodings
struct ggml_tensor* tpos[7] = {};
};
/*
** ── Memory Attention (Tracker Transformer) ───────────────────────────────
*/
// One memory-attention layer: RoPE self-attention on the current frame,
// RoPE cross-attention into the (smaller, 64-dim) memory bank, and an FFN.
struct sam3_mem_attn_layer {
// self-attention (RoPE, 1 head, 256-dim)
struct ggml_tensor* sa_q_w = nullptr;
struct ggml_tensor* sa_q_b = nullptr;
struct ggml_tensor* sa_k_w = nullptr;
struct ggml_tensor* sa_k_b = nullptr;
struct ggml_tensor* sa_v_w = nullptr;
struct ggml_tensor* sa_v_b = nullptr;
struct ggml_tensor* sa_out_w = nullptr;
struct ggml_tensor* sa_out_b = nullptr;
struct ggml_tensor* norm1_w = nullptr;
struct ggml_tensor* norm1_b = nullptr;
// cross-attention (RoPE, kv_dim=64)
struct ggml_tensor* ca_q_w = nullptr;
struct ggml_tensor* ca_q_b = nullptr;
struct ggml_tensor* ca_k_w = nullptr; // [256, 64]
struct ggml_tensor* ca_k_b = nullptr;
struct ggml_tensor* ca_v_w = nullptr; // [256, 64]
struct ggml_tensor* ca_v_b = nullptr;
struct ggml_tensor* ca_out_w = nullptr;
struct ggml_tensor* ca_out_b = nullptr;
struct ggml_tensor* norm2_w = nullptr;
struct ggml_tensor* norm2_b = nullptr;
// FFN
struct ggml_tensor* ffn_fc1_w = nullptr;
struct ggml_tensor* ffn_fc1_b = nullptr;
struct ggml_tensor* ffn_fc2_w = nullptr;
struct ggml_tensor* ffn_fc2_b = nullptr;
struct ggml_tensor* norm3_w = nullptr;
struct ggml_tensor* norm3_b = nullptr;
};
// The memory-attention module is a stack of sam3_mem_attn_layer.
struct sam3_mem_attn {
std::vector<sam3_mem_attn_layer> layers;
};
/*
** ── BPE Tokenizer ────────────────────────────────────────────────────────
*/
// CLIP-style byte-pair-encoding tokenizer state. Defaults for the
// start/end-of-text ids match the 49408-entry CLIP vocabulary.
struct sam3_bpe_tokenizer {
std::unordered_map<std::string, int> encoder;              // token string → id
std::unordered_map<int, std::string> decoder;              // id → token string
std::vector<std::pair<std::string, std::string>> merges;   // ordered BPE merge pairs
std::unordered_map<std::string, int> merge_ranks; // "a\x1fb" → rank
std::unordered_map<uint8_t, std::string> byte_encoder; // byte → unicode UTF-8
std::unordered_map<std::string, std::string> cache;        // word → merged-token memo
int sot_token = 49406;
int eot_token = 49407;
};
/*
** ── SAM2 Hiera Backbone ─────────────────────────────────────────────────
*/
// One Hiera block. The trailing metadata fields are not checkpoint tensors;
// they are derived during loading from the stage layout.
struct sam2_hiera_block {
struct ggml_tensor* norm1_w = nullptr;
struct ggml_tensor* norm1_b = nullptr;
struct ggml_tensor* qkv_w = nullptr; // [3*dim_out, dim_in]
struct ggml_tensor* qkv_b = nullptr; // [3*dim_out]
struct ggml_tensor* proj_w = nullptr; // [dim_out, dim_out]
struct ggml_tensor* proj_b = nullptr; // [dim_out]
struct ggml_tensor* norm2_w = nullptr;
struct ggml_tensor* norm2_b = nullptr;
struct ggml_tensor* mlp_fc1_w = nullptr;
struct ggml_tensor* mlp_fc1_b = nullptr;
struct ggml_tensor* mlp_fc2_w = nullptr;
struct ggml_tensor* mlp_fc2_b = nullptr;
struct ggml_tensor* dim_proj_w = nullptr; // stage transition only
struct ggml_tensor* dim_proj_b = nullptr;
// metadata (set during loading)
int stage_idx = -1;
int dim_in = 0;
int dim_out = 0;
int num_heads = 0;
int window_size = 0; // 0 = global attention
bool has_q_stride = false;
};
// Hiera backbone: conv patch embedding, background + window positional
// embeddings, the block stack, and the last-block index of each stage.
struct sam2_hiera {
struct ggml_tensor* patch_embed_w = nullptr; // [embed_dim, 3, 7, 7]
struct ggml_tensor* patch_embed_b = nullptr; // [embed_dim]
struct ggml_tensor* pos_embed = nullptr; // [1, embed_dim, bkg_H, bkg_W]
struct ggml_tensor* pos_embed_window = nullptr; // [1, embed_dim, W0, W0]
std::vector<sam2_hiera_block> blocks;
int stage_ends[4] = {};
};
// One FPN lateral projection (1x1 conv from backbone channels to d_model).
struct sam2_fpn_level {
struct ggml_tensor* conv_w = nullptr; // Conv2d(backbone_ch, d_model, k=1)
struct ggml_tensor* conv_b = nullptr;
};
// Four lateral levels, one per backbone stage.
struct sam2_fpn_neck {
sam2_fpn_level levels[4];
};
/*
** ── EdgeTAM RepViT Backbone ─────────────────────────────────────────────
*/
// One RepViT block: fused depthwise token mixer, optional SE module,
// and a 1x1-expand / 1x1-project channel mixer.
struct edgetam_repvit_block {
// Token mixer: single fused DW 3×3 (after RepVGG reparameterization)
struct ggml_tensor* tm_w = nullptr; // [3, 3, 1, ch]
struct ggml_tensor* tm_b = nullptr; // [ch]
// Squeeze-and-excitation (only on even-indexed blocks)
bool has_se = false;
struct ggml_tensor* se_fc1_w = nullptr; // [1, 1, ch, ch_rd]
struct ggml_tensor* se_fc1_b = nullptr; // [ch_rd]
struct ggml_tensor* se_fc2_w = nullptr; // [1, 1, ch_rd, ch]
struct ggml_tensor* se_fc2_b = nullptr; // [ch]
// Channel mixer: 1×1 expand → GELU → 1×1 project
struct ggml_tensor* cm_conv1_w = nullptr; // [1, 1, ch, ch*2]
struct ggml_tensor* cm_conv1_b = nullptr; // [ch*2]
struct ggml_tensor* cm_conv2_w = nullptr; // [1, 1, ch*2, ch]
struct ggml_tensor* cm_conv2_b = nullptr; // [ch]
};
// Stage-transition module: a pre-block at the incoming width, a strided
// depthwise spatial downsample, a 1x1 channel expansion, and an FFN.
struct edgetam_repvit_downsample {
// Pre-block (RepViT block at prev-stage channels, no SE)
edgetam_repvit_block pre_block;
// Spatial downsample: DW Conv 3×3, stride=2
struct ggml_tensor* spatial_w = nullptr; // [3, 3, 1, ch_in]
struct ggml_tensor* spatial_b = nullptr; // [ch_in]
// Channel expand: 1×1 Conv
struct ggml_tensor* channel_w = nullptr; // [1, 1, ch_in, ch_out]
struct ggml_tensor* channel_b = nullptr; // [ch_out]
// FFN: 1×1 expand → GELU → 1×1 project
struct ggml_tensor* ffn_conv1_w = nullptr; // [1, 1, ch_out, ch_out*2]
struct ggml_tensor* ffn_conv1_b = nullptr; // [ch_out*2]
struct ggml_tensor* ffn_conv2_w = nullptr; // [1, 1, ch_out*2, ch_out]
struct ggml_tensor* ffn_conv2_b = nullptr; // [ch_out]
};
// A stage is a run of blocks, optionally preceded by a downsample module.
struct edgetam_repvit_stage {
std::vector<edgetam_repvit_block> blocks;
bool has_downsample = false;
edgetam_repvit_downsample downsample;
};
// Full RepViT backbone: two-conv stem followed by four stages.
struct edgetam_repvit {
// Stem: 2 conv layers (3→24→48, each stride 2)
struct ggml_tensor* stem_conv1_w = nullptr; // [3, 3, 3, 24]
struct ggml_tensor* stem_conv1_b = nullptr; // [24]
struct ggml_tensor* stem_conv2_w = nullptr; // [3, 3, 24, 48]
struct ggml_tensor* stem_conv2_b = nullptr; // [48]
edgetam_repvit_stage stages[4];
};
/*
** ── EdgeTAM Spatial Perceiver ───────────────────────────────────────────
*/
// One perceiver layer: latent→feature cross-attention with FFN, then
// latent self-attention with FFN. Attention projections carry no biases.
struct edgetam_perceiver_layer {
// Cross-attention (latents attend to features)
struct ggml_tensor* ca_norm_latents_w = nullptr;
struct ggml_tensor* ca_norm_latents_b = nullptr;
struct ggml_tensor* ca_norm_x_w = nullptr;
struct ggml_tensor* ca_norm_x_b = nullptr;
struct ggml_tensor* ca_q_w = nullptr; // [64, 64] no bias
struct ggml_tensor* ca_kv_w = nullptr; // [128, 64] no bias
struct ggml_tensor* ca_out_w = nullptr; // [64, 64] no bias
// FFN after cross-attention
struct ggml_tensor* ff_norm_w = nullptr;
struct ggml_tensor* ff_norm_b = nullptr;
struct ggml_tensor* ff_fc1_w = nullptr; // [256, 64] no bias
struct ggml_tensor* ff_fc2_w = nullptr; // [64, 256] no bias
// Self-attention on latents
struct ggml_tensor* sa_norm_w = nullptr;
struct ggml_tensor* sa_norm_b = nullptr;
struct ggml_tensor* sa_q_w = nullptr; // [64, 64] no bias
struct ggml_tensor* sa_kv_w = nullptr; // [128, 64] no bias
struct ggml_tensor* sa_out_w = nullptr; // [64, 64] no bias
// FFN after self-attention
struct ggml_tensor* sa_ff_norm_w = nullptr;
struct ggml_tensor* sa_ff_norm_b = nullptr;
struct ggml_tensor* sa_ff_fc1_w = nullptr; // [256, 64]
struct ggml_tensor* sa_ff_fc2_w = nullptr; // [64, 256]
};
// Perceiver module: learned 1D/2D latent arrays, a final norm, and the
// layer stack.
struct edgetam_perceiver {
struct ggml_tensor* latents_1d = nullptr; // [256, 64]
struct ggml_tensor* latents_2d = nullptr; // [256, 64]
struct ggml_tensor* norm_w = nullptr; // [64]
struct ggml_tensor* norm_b = nullptr; // [64]
std::vector<edgetam_perceiver_layer> layers;
};
/*****************************************************************************
** Top-Level Opaque Types (defined here, forward-declared in sam3.h)
*****************************************************************************/
// The full model: weight structs for every sub-module (only the subset for
// the active model_type is populated by the loader), the ggml context and
// backend that own the tensor memory, a name→tensor map, and the tokenizer.
struct sam3_model {
sam3_hparams hparams;
ggml_type weight_type = GGML_TYPE_F16;
// ── SAM3-specific (loaded only when model_type != SAM2) ──────────────
sam3_vit vit;
sam3_neck neck_det;
sam3_neck neck_trk;
sam3_text_encoder text_enc;
sam3_fusion_encoder fenc;
sam3_detr_decoder ddec;
sam3_geom_encoder geom_enc;
sam3_seg_head seg_head;
// ── SAM2-specific (loaded only when model_type == SAM2) ──────────────
sam2_hiera hiera;
sam2_fpn_neck fpn_neck;
// ── EdgeTAM-specific (loaded only when model_type == EDGETAM) ───────
edgetam_repvit repvit;
edgetam_perceiver perceiver;
// ── Shared (loaded for both SAM2 and SAM3) ──────────────────────────
sam3_sam_prompt_enc sam_pe;
sam3_sam_mask_dec sam_dec;
sam3_mem_enc mem_enc;
sam3_mem_attn mem_attn;
// object pointer projection
struct ggml_tensor* obj_ptr_proj_w[3] = {};
struct ggml_tensor* obj_ptr_proj_b[3] = {};
struct ggml_tensor* no_obj_ptr = nullptr;
struct ggml_tensor* obj_ptr_tpos_w = nullptr;
struct ggml_tensor* obj_ptr_tpos_b = nullptr;
// standalone tracker/SAM2 top-level tensors
struct ggml_tensor* no_mem_embed = nullptr; // [1, 1, 256]
struct ggml_tensor* no_mem_pos_enc = nullptr; // [1, 1, 256]
struct ggml_tensor* no_obj_embed_spatial = nullptr; // [1, 64]
struct ggml_tensor* mem_attn_norm_w = nullptr;
struct ggml_tensor* mem_attn_norm_b = nullptr;
// precomputed RoPE frequencies (SAM3 only)
struct ggml_tensor* rope_freqs = nullptr; // [n_img_tokens, head_dim]
// ggml backend
struct ggml_context* ctx = nullptr;
ggml_backend_t backend = nullptr;
ggml_backend_buffer_t buffer = nullptr;
// tensor lookup
std::map<std::string, struct ggml_tensor*> tensors;
// tokenizer
sam3_bpe_tokenizer tokenizer;
};
// Per-image inference state: cached backbone/neck outputs for the most
// recently encoded image, backend/allocator handles for graph execution,
// and host-side caches of prompt-encoder embeddings so they are read back
// from the backend only once.
struct sam3_state {
// cached backbone outputs
struct ggml_tensor* vit_output = nullptr; // [1, embed, H, W]
struct ggml_tensor* neck_det[4] = {}; // FPN levels (det path)
struct ggml_tensor* neck_trk[4] = {}; // FPN levels (trk path)
struct ggml_tensor* neck_det_pe[4] = {}; // sinusoidal PE
struct ggml_tensor* neck_trk_pe[4] = {};
int orig_width = 0; // input image size before resize
int orig_height = 0;
int n_threads = 4; // threads for CPU graph compute
int encode_img_size = 0; // effective img_size for encoding (0 = hp.img_size)
int encode_feat_size = 0; // effective feat_size for the active backbone
struct ggml_context* ctx = nullptr;
ggml_backend_t backend = nullptr;
ggml_backend_buffer_t buffer = nullptr;
struct ggml_gallocr* galloc = nullptr; // graph allocator for compute graphs
// PE buffer: holds sinusoidal PE tensors for neck outputs
struct ggml_context* pe_ctx = nullptr;
ggml_backend_buffer_t pe_buf = nullptr;
// Cached SAM prompt encoder embeddings (read from GPU once, reused)
bool pe_cache_valid = false; // set once the caches below are populated
std::vector<float> pe_gauss_cache; // [2 * num_pos_feats]
float point_emb_cache[4][256] = {}; // per-label point embeddings
float not_a_point_cache[256] = {};
float no_mask_emb_cache[256] = {};
std::vector<float> dense_pe_cache; // [D * H * H] -- PE grid
std::vector<float> dense_nomask_cache; // [D * H * H] -- no-mask tiled
};
/*
** ── Video Tracker State ──────────────────────────────────────────────────
*/
// One tracked object instance ("masklet") in the video tracker: identity,
// bookkeeping of when it was first/last observed, and its most recent
// mask-logits / object-pointer outputs.
struct sam3_masklet {
int instance_id = -1; // unique id assigned by the tracker (-1 = unassigned)
int first_frame = -1; // frame index where this instance first appeared
int last_seen = -1; // most recent frame index with a detection
float last_score = 0.0f; // score of the most recent prediction
bool confirmed = false; // promoted from pending to confirmed tracking
int mds_sum = 0; // NOTE(review): accumulator — presumably a running
// detection-score/match count; confirm against usage
// last predicted mask logits (owned by tracker ctx)
struct ggml_tensor* mask_logits = nullptr; // [1, 1, 288, 288]
struct ggml_tensor* obj_ptr = nullptr; // [1, 256]
};
// One entry of a per-object memory bank: the encoded spatial memory features
// and their positional encoding for a single past frame.
struct sam3_memory_slot {
struct ggml_tensor* spatial_feats = nullptr; // [64, 72, 72]
struct ggml_tensor* spatial_pe = nullptr; // [64, 72, 72]
int frame_index = -1; // frame this memory was encoded from
bool is_cond_frame = false; // true if the frame carried user prompts (conditioning frame)
};
// Video tracker state: the set of active/pending masklets, per-instance
// memory and object-pointer banks, backend buffers owning their tensor data,
// and host-side PE/RoPE caches that depend only on fixed hyperparameters.
struct sam3_tracker {
sam3_video_params params;
int frame_index = 0; // index of the next frame to process
int next_inst_id = 1; // next instance_id to hand out
std::vector<sam3_masklet> masklets; // confirmed tracked instances
std::vector<sam3_masklet> pending; // candidates not yet confirmed
// per-instance memory bank: instance_id -> ordered memory slots
std::map<int, std::vector<sam3_memory_slot>> mem_banks;
// per-instance object pointers: instance_id -> (frame_index, obj_ptr) pairs
std::map<int, std::vector<std::pair<int, struct ggml_tensor*>>> ptr_banks;
struct ggml_context* ctx = nullptr;
ggml_backend_buffer_t buffer = nullptr;
// Per-tensor backend buffers allocated by sam3_encode_memory / sam3_store_obj_ptr.
// Tracked here so they can be freed on tracker reset.
std::vector<ggml_backend_buffer_t> owned_buffers;
// Cached PE / RoPE data — pure functions of fixed hyperparameters, computed once.
bool pe_caches_valid = false;
std::vector<float> cached_sinpe_256; // sam3_sinusoidal_pe_2d(72, 72, 256)
std::vector<float> cached_sinpe_64; // sam3_sinusoidal_pe_2d(72, 72, 64)
std::vector<float> cached_axial_cis_reord; // reordered axial CIS for RoPE Q
// EdgeTAM-specific: RoPE for 16x16 grid (cross-attn K on perceiver 2D latents)
std::vector<float> cached_axial_cis_k16_reord; // [2, 128, 256] for 16x16 grid
};
// Resolve effective img_size / feat_size from state (which may override hp defaults).
// Effective image size for encoding: the per-state override when one has
// been set (> 0), otherwise the model's hyperparameter default.
static int sam3_eff_img_size(const sam3_state& s, const sam3_hparams& hp) {
    if (s.encode_img_size > 0) {
        return s.encode_img_size;
    }
    return hp.img_size;
}
// Effective feature-map size for the active backbone: the per-state override
// when one has been set (> 0), otherwise the value derived from hparams.
static int sam3_eff_feat_size(const sam3_state& s, const sam3_hparams& hp) {
    if (s.encode_feat_size > 0) {
        return s.encode_feat_size;
    }
    return hp.feat_size();
}
/*****************************************************************************
** Internal Helper Declarations
*****************************************************************************/
// graph execution
static bool sam3_graph_compute(ggml_backend_t backend, struct ggml_cgraph* graph, int n_threads);