From f660833814ec7a580098ad56c054fedf4ffe450b Mon Sep 17 00:00:00 2001 From: priyadarshini75 Date: Thu, 12 Mar 2026 18:04:35 +0530 Subject: [PATCH 1/2] feat: Enable dynamic image size processing and enhance benchmarking in the SAM3 application, updating documentation and gitignore. --- .gitignore | 4 +++ README.md | 12 ++++++-- cpp/include/sam3.cuh | 2 +- cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp | 37 +++++++++++++++++-------- cpp/src/sam3/sam3_trt/sam3.cu | 35 ++++++++++++++++++----- 5 files changed, 67 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index ee1bdf6..f93b4f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ *.DS_Store *build +.results +*.plan +onnx_weights/ +test_images/ \ No newline at end of file diff --git a/README.md b/README.md index 4f2ef02..2c5f3c6 100755 --- a/README.md +++ b/README.md @@ -112,11 +112,11 @@ docker run -it --rm \ ```bash python python/onnxexport.py ``` -This produces `onnx_weights/sam3_static.onnx` plus external weight shards. +This produces `onnx_weights/sam3_dynamic.onnx` plus external weight shards. 5) Build a TensorRT engine ```bash -trtexec --onnx=onnx_weights/sam3_static.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose +trtexec --onnx=onnx_weights/sam3_dynamic.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose ``` 6) Build the C++/CUDA library and sample app @@ -181,4 +181,10 @@ TensorRT + CUDA (benchmark mode disables output writes): If this saved you time, drop a ⭐ so others can find it and ship SAM-3 faster. # Disclaimer -All views expressed here are my own. This project is not affiliated with my employer. \ No newline at end of file +All views expressed here are my own. This project is not affiliated with my employer. 
+ +cd /workspace/cpp/build +make +./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan + +cd /workspace/cpp/build && make && ./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan 1 diff --git a/cpp/include/sam3.cuh b/cpp/include/sam3.cuh index 03efa19..118c057 100755 --- a/cpp/include/sam3.cuh +++ b/cpp/include/sam3.cuh @@ -52,7 +52,7 @@ private: cudaStream_t sam3_stream; dim3 bsize; dim3 gsize; - int in_width, in_height, opencv_inbytes; + int in_width, in_height, opencv_inbytes = 0; std::vector input_cpu; std::vector input_gpu; diff --git a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp index 3f7c3d8..97f6c4c 100755 --- a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp +++ b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp @@ -95,7 +95,7 @@ int main(int argc, char* argv[]) SAM3_PCS pcs(epath, vis_alpha, probability_threshold); cv::Mat img, result; - char* raw_bytes; + int prev_width = 0, prev_height = 0; std::filesystem::create_directories("results"); int num_images_read=0; @@ -111,24 +111,26 @@ int main(int argc, char* argv[]) pcs.set_prompt(iid, iam); + const int MAX_BENCHMARK_IMAGES = 100; + for (const auto& fname : std::filesystem::directory_iterator(in_dir)) { if (std::filesystem::is_regular_file(fname.path())) { std::filesystem::path outfile = std::filesystem::path("results") / fname.path().filename(); - if (num_images_read==0) + img = cv::imread(fname.path(), cv::IMREAD_COLOR); + if (img.empty()) continue; + + result.create(img.rows, img.cols, img.type()); + + if (img.cols != prev_width || img.rows != prev_height) { - cv::Mat tmp = cv::imread(fname.path(), cv::IMREAD_COLOR); - raw_bytes = (char *)malloc(tmp.total()*tmp.elemSize()); - read_image_into_buffer(fname.path(), raw_bytes, img); - result = cv::imread(fname.path(), cv::IMREAD_COLOR); pcs.pin_opencv_matrices(img, result); + prev_width = img.cols; + prev_height = img.rows; } - else - { - read_image_into_buffer(fname.path(), raw_bytes, img); - } + start = 
std::chrono::system_clock::now(); infer_one_image(pcs, img, result, visualize, outfile, benchmark); num_images_read++; @@ -138,9 +140,20 @@ int main(int argc, char* argv[]) if (num_images_read>0 && num_images_read%10==0) { - float msec_per_image = millis_elapsed/num_images_read; - printf("Processed %d images at %f msec/image\n", num_images_read, msec_per_image); + printf("Processed %d images...\n", num_images_read); } + + if (num_images_read >= MAX_BENCHMARK_IMAGES) break; } } + + if (num_images_read > 0) + { + float msec_per_image = millis_elapsed/num_images_read; + float est_1000_min = msec_per_image * 1000.0f / 1000.0f / 60.0f; + printf("\n=== Benchmark Results ===\n"); + printf("Processed %d images in %.1f s\n", num_images_read, millis_elapsed/1000.0f); + printf("Average: %.2f msec/image\n", msec_per_image); + printf("Estimated time for 1000 images: %.1f min\n", est_1000_min); + } } diff --git a/cpp/src/sam3/sam3_trt/sam3.cu b/cpp/src/sam3/sam3_trt/sam3.cu index 9bcab00..e892b47 100755 --- a/cpp/src/sam3/sam3_trt/sam3.cu +++ b/cpp/src/sam3/sam3_trt/sam3.cu @@ -1,5 +1,21 @@ #include "sam3.cuh" +static size_t datatype_size(nvinfer1::DataType dtype) +{ + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kINT64: return 8; + case nvinfer1::DataType::kBOOL: return 1; + case nvinfer1::DataType::kBF16: return 2; + case nvinfer1::DataType::kFP8: return 1; + default: return 4; + } +} + SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const float prob_threshold) : _engine_path(engine_path) , _overlay_alpha(vis_alpha) @@ -23,19 +39,24 @@ SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const f void SAM3_PCS::pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat) { + // Free previous GPU buffers if re-allocating (image size changed) + if 
(!is_zerocopy && opencv_inbytes > 0) + { + if (opencv_input) { cudaFree(opencv_input); opencv_input = nullptr; } + if (gpu_result) { cudaFree(gpu_result); gpu_result = nullptr; } + } + opencv_inbytes = input_mat.total() * input_mat.elemSize(); - cuda_check(cudaHostRegister( + if (is_zerocopy) + { + cuda_check(cudaHostRegister( input_mat.data, opencv_inbytes, cudaHostRegisterDefault), " pinning opencv input Mat on host" ); - // for most purposes the default flag is good enough, in my benchmarking - // using others say readonly flag did not improve performance - if (is_zerocopy) - { cuda_check(cudaHostRegister( result_mat.data, opencv_inbytes, @@ -53,7 +74,7 @@ void SAM3_PCS::pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat) } else { - // on dGPU allocate additional memory for input + // on dGPU allocate GPU-side buffers for image data cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " allocating opencv input memory on a dGPU system"); cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " allocating result memory on a dGPU system"); cudaMemset(opencv_input, 0, opencv_inbytes); @@ -274,7 +295,7 @@ void SAM3_PCS::allocate_io_buffers() nvinfer1::TensorIOMode mode = trt_engine->getTensorIOMode(name); nvinfer1::Dims dims = trt_engine->getTensorShape(name); - size_t nbytes = sizeof(trt_engine->getTensorDataType(name)); + size_t nbytes = datatype_size(trt_engine->getTensorDataType(name)); for (int idx=0;idx < MAX_DIMS; idx++) { From 23f47f69355f3db3e1fe0d08dd04301629ba71de Mon Sep 17 00:00:00 2001 From: priyadarshini75 Date: Fri, 13 Mar 2026 12:34:40 +0530 Subject: [PATCH 2/2] added-native-bbox-detection --- README.md | 25 ++++- cpp/CMakeLists.txt | 2 + cpp/include/prepost.cuh | 14 +++ cpp/include/sam3.hpp | 3 +- cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp | 140 +++++++++++++++--------- cpp/src/sam3/sam3_trt/prepost.cu | 65 +++++++++++ cpp/src/sam3/sam3_trt/sam3.cu | 45 ++++++++ python/export_tokenizer.py | 16 +++ python/onnxexport.py | 4 +- 
python/tokenize_prompt.py | 43 ++++++++ 10 files changed, 300 insertions(+), 57 deletions(-) create mode 100644 python/export_tokenizer.py create mode 100644 python/tokenize_prompt.py diff --git a/README.md b/README.md index 2c5f3c6..b423034 100755 --- a/README.md +++ b/README.md @@ -183,8 +183,27 @@ If this saved you time, drop a ⭐ so others can find it and ship SAM-3 faster. # Disclaimer All views expressed here are my own. This project is not affiliated with my employer. +## Dynamic Bounding Box Detection (New!) +The application has been extended to support **Native Bounding Box Detection** directly from the SAM3 model outputs, as well as **Dynamic Text Prompting** without hardcoded tokens. + +### Setup Tokenizer +Because the C++ application relies on HuggingFace tokenization, you must first export the tokenizer files: +```bash +python3 python/export_tokenizer.py +``` +*This will create `tokenizer.json` in the `onnx_weights/` directory for the Python script to use.* + +### Run Bounding Box Visualization +Run the application with your target prompt as the 3rd argument. The C++ application will dynamically tokenize the prompt, run the TensorRT engine, and draw green bounding boxes with the text label above them. 
+```bash cd /workspace/cpp/build -make -./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan +make -j +./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet" +``` -cd /workspace/cpp/build && make && ./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan 1 +### Benchmark Bounding Box Inference +To test the raw speed of the `.plan` engine executing the prompt and calculating bounding boxes (without the latency of OpenCV drawing and saving the images), append `1` to the end of the command: +```bash +cd /workspace/cpp/build +./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet" 1 +``` diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 077a7d2..dcfc53a 100755 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -30,6 +30,8 @@ include_directories( ${CUDNN_ROOT_DIR}/include ) + + add_library(sam3_trt SHARED src/sam3/sam3_trt/sam3.cu src/sam3/sam3_trt/prepost.cu diff --git a/cpp/include/prepost.cuh b/cpp/include/prepost.cuh index b8d7329..2a9a476 100755 --- a/cpp/include/prepost.cuh +++ b/cpp/include/prepost.cuh @@ -51,6 +51,20 @@ __global__ void draw_instance_seg_mask( float3* color_palette ); +__global__ void draw_bounding_box( + float* boxes, + float* logits, + uint8_t* result, + int src_width, + int src_height, + int src_channels, + int max_boxes, + int box_idx, + float prob_threshold, + float3* color_palette, + int thickness +); + static std::vector colpal = { make_float3( 0, 185, 118), // teal (your original) make_float3(230, 159, 0), // orange diff --git a/cpp/include/sam3.hpp b/cpp/include/sam3.hpp index 8245084..e05d373 100755 --- a/cpp/include/sam3.hpp +++ b/cpp/include/sam3.hpp @@ -15,7 +15,8 @@ typedef enum { typedef enum { VIS_NONE, VIS_SEMANTIC_SEGMENTATION, - VIS_INSTANCE_SEGMENTATION + VIS_INSTANCE_SEGMENTATION, + VIS_BBOX } SAM3_VISUALIZATION; typedef struct { diff --git a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp index 97f6c4c..30b2957 100755 --- 
a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp +++ b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp @@ -3,33 +3,27 @@ #include #include #include - -void read_image_into_buffer(const std::string imgpath, char* raw_buffer, cv::Mat& buffer) -{ - size_t file_size = std::filesystem::file_size(imgpath); - if (file_size==0) - { - std::stringstream err; - err << "Image file is empty"; - throw std::runtime_error(err.str()); +#include +#include +#include +#include + +// Helper to execute bash command and read stdout +std::string exec_python_tokenizer(const std::string& prompt) { + // Assuming the app is run from /workspace/cpp/build or /workspace + std::string cmd = "python3 /workspace/python/tokenize_prompt.py \"" + prompt + "\""; + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) { + throw std::runtime_error("popen() failed!"); } - - std::ifstream file(imgpath, std::ios::binary); - - if (!file.is_open()) - { - std::stringstream err; - err << "File " << imgpath << " could not be opened. 
Please check permissions\n"; - throw std::runtime_error(err.str()); + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + result += buffer.data(); } - - file.read(raw_buffer, file_size); - file.close(); - - cv::Mat raw_mat(1, static_cast(file_size), CV_8UC1, raw_buffer); - // just a wrapper, minimal allocation - - cv::imdecode(raw_mat, cv::IMREAD_COLOR, &buffer); + // Remove trailing newlines + result.erase(std::remove(result.begin(), result.end(), '\n'), result.end()); + return result; } void infer_one_image(SAM3_PCS& pcs, @@ -37,13 +31,35 @@ void infer_one_image(SAM3_PCS& pcs, cv::Mat& result, const SAM3_VISUALIZATION vis, const std::string outfile, + const std::string prompt, bool benchmark_run) { bool success = pcs.infer_on_image(img, result, vis); - if (benchmark_run) + if (benchmark_run) return; + + if (vis == SAM3_VISUALIZATION::VIS_BBOX) { - return; + // CPU-side box coordinates and logits copied over by sam3.cu + float* boxes = static_cast(pcs.output_cpu[2]); + float* logits = static_cast(pcs.output_cpu[3]); + int num_boxes = 200; + + for (int i=0; i < num_boxes; i++) { + float logit = logits[i]; + float prob = 1.0f / (1.0f + std::exp(-logit)); + if (prob > 0.5f) { // threshold match + float x1 = boxes[i * 4 + 0]; + float y1 = boxes[i * 4 + 1]; + int x_min = std::max(0, (int)(x1 * img.cols)); + int y_min = std::max(0, (int)(y1 * img.rows)); + + // Draw text slightly above the bounding box + // Reduced font scale from 0.9 to 0.5, thickness from 2 to 1 + cv::putText(result, prompt, cv::Point(x_min, std::max(15, y_min - 6)), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1); + } + } } if (vis == SAM3_VISUALIZATION::VIS_NONE) @@ -59,38 +75,32 @@ void infer_one_image(SAM3_PCS& pcs, int main(int argc, char* argv[]) { - if (argc < 3) + if (argc < 4) { - std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine " << std::endl; + std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine prompt " << std::endl; return 0; } const 
std::string in_dir = argv[1]; std::string epath = argv[2]; - bool benchmark=false; // in benchmarking mode we dont save output images + std::string prompt = argv[3]; + bool benchmark = false; // in benchmarking mode we dont save output images - if (argc==4) + if (argc == 5) { - std::string b_arg = argv[3]; // should be 0 or 1 - try - { - benchmark = (b_arg == "1"); - } - catch(const std::exception) - { - std::cout << "Unrecognized benchmark type " << argv[3] << std::endl; - } + benchmark = (std::string(argv[4]) == "1"); } + std::cout << "Target Prompt: " << prompt << std::endl; std::cout << "Benchmarking: " << benchmark << std::endl; auto start = std::chrono::system_clock::now(); auto end = std::chrono::system_clock::now(); std::chrono::duration diff; - float millis_elapsed = 0.0; // int will overflow after ~650 hours + float millis_elapsed = 0.0; const float vis_alpha = 0.3; const float probability_threshold = 0.5; - const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION; + const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_BBOX; SAM3_PCS pcs(epath, vis_alpha, probability_threshold); @@ -100,15 +110,43 @@ int main(int argc, char* argv[]) std::filesystem::create_directories("results"); int num_images_read=0; - // tokenized version of 'person' - std::vector iid={49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - 49407, 49407}; - - std::vector iam={1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; + // Tokenize the prompt + std::vector iid(32, 49407); // 49407 is usually the PAD/EOS token for SAM3 text encoder + std::vector iam(32, 0); + + try { + std::cout << "Calling Python to tokenize prompt: '" << prompt << "'..." 
<< std::endl; + std::string py_out = exec_python_tokenizer(prompt); + + if (py_out == "-1" || py_out.empty()) { + throw std::runtime_error("Python tokenizer script failed."); + } + // Parse comma separated string + std::vector ids; + std::stringstream ss(py_out); + std::string token; + while (std::getline(ss, token, ',')) { + ids.push_back(std::stoi(token)); + } + + for (size_t i = 0; i < ids.size() && i < 32; ++i) { + iid[i] = ids[i]; + iam[i] = 1; // 1 for real tokens, 0 for pad + } + std::cout << "Successfully tokenized prompt into " << ids.size() << " tokens." << std::endl; + + } catch (const std::exception& e) { + std::cerr << "Tokenizer error: " << e.what() << std::endl; + std::cout << "Falling back to 'person' tokens.\n"; + iid = {49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, + 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, + 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, + 49407, 49407}; + iam = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + } + pcs.set_prompt(iid, iam); const int MAX_BENCHMARK_IMAGES = 100; @@ -132,7 +170,7 @@ int main(int argc, char* argv[]) } start = std::chrono::system_clock::now(); - infer_one_image(pcs, img, result, visualize, outfile, benchmark); + infer_one_image(pcs, img, result, visualize, outfile, prompt, benchmark); num_images_read++; end = std::chrono::system_clock::now(); diff = end - start; diff --git a/cpp/src/sam3/sam3_trt/prepost.cu b/cpp/src/sam3/sam3_trt/prepost.cu index fb29316..0ee7b00 100755 --- a/cpp/src/sam3/sam3_trt/prepost.cu +++ b/cpp/src/sam3/sam3_trt/prepost.cu @@ -174,4 +174,69 @@ __global__ void draw_instance_seg_mask( } } } +} + +__global__ void draw_bounding_box( + float* boxes, + float* logits, + uint8_t* result, + int src_width, + int src_height, + int src_channels, + int max_boxes, + int box_idx, + float prob_threshold, + float3* color_palette, + int thickness +) +{ + // One thread 
per pixel in the image. Block size e.g., 16x16 + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= src_width || y >= src_height) return; + + // Check if the box is valid (prob > threshold) + // Logits: 1 / (1 + exp(-logit)) + float logit = logits[box_idx]; + float prob = 1.0f / (1.0f + exp(-logit)); + if (prob <= prob_threshold) return; + + // Get color from palette + float3 color = color_palette[box_idx % 20]; + + // Read box coords [x_min, y_min, x_max, y_max] normalized 0-1 + float x1 = boxes[box_idx * 4 + 0]; + float y1 = boxes[box_idx * 4 + 1]; + float x2 = boxes[box_idx * 4 + 2]; + float y2 = boxes[box_idx * 4 + 3]; + + // Convert to pixel coordinates + int x_min = max(0, (int)(x1 * src_width)); + int y_min = max(0, (int)(y1 * src_height)); + int x_max = min(src_width - 1, (int)(x2 * src_width)); + int y_max = min(src_height - 1, (int)(y2 * src_height)); + + // Check if current thread pixel is ON the border of the bounding box + bool is_border = false; + + // Check horizontal borders (top and bottom) + if (x >= x_min && x <= x_max) { + if (abs(y - y_min) < thickness || abs(y - y_max) < thickness) { + is_border = true; + } + } + // Check vertical borders (left and right) + if (y >= y_min && y <= y_max) { + if (abs(x - x_min) < thickness || abs(x - x_max) < thickness) { + is_border = true; + } + } + + if (is_border) { + int res_loc = (y * src_width + x) * src_channels; + result[res_loc] = (uint8_t)color.x; + result[res_loc + 1] = (uint8_t)color.y; + result[res_loc + 2] = (uint8_t)color.z; + } } \ No newline at end of file diff --git a/cpp/src/sam3/sam3_trt/sam3.cu b/cpp/src/sam3/sam3_trt/sam3.cu index e892b47..95800d6 100755 --- a/cpp/src/sam3/sam3_trt/sam3.cu +++ b/cpp/src/sam3/sam3_trt/sam3.cu @@ -145,12 +145,57 @@ void SAM3_PCS::visualize_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VIS gpu_colpal); } } + else if (vis_type == SAM3_VISUALIZATION::VIS_BBOX) + { + // First copy the original image 
to the result + cuda_check(cudaMemcpyAsync((void *)gpu_result, + (void *)input_ptr, + opencv_inbytes, + cudaMemcpyDeviceToDevice, + sam3_stream), " async memcpy for result during bbox visualization"); + + dim3 bbsize(16, 16); + dim3 bgsize; + bgsize.x = (input.cols + bbsize.x - 1) / bbsize.x; + bgsize.y = (input.rows + bbsize.y - 1) / bbsize.y; + + int num_boxes = 200; // SAM3 predicts 200 boxes max generally + + // Loop over all boxes and launch drawing kernel for each + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + draw_bounding_box<<>>( + static_cast(output_gpu[2]), // pred_boxes + static_cast(output_gpu[3]), // pred_logits + gpu_result, + input.cols, + input.rows, + input.channels(), + num_boxes, + box_idx, + _probability_threshold, + gpu_colpal, + 2); // thickness 2 + } + } if (!is_zerocopy && vis_type == SAM3_VISUALIZATION::VIS_NONE) { cudaMemcpyAsync(output_cpu[0], output_gpu[0],output_sizes[0], cudaMemcpyDeviceToHost, sam3_stream); cudaMemcpyAsync(output_cpu[1], output_gpu[1],output_sizes[1], cudaMemcpyDeviceToHost, sam3_stream); } + else if (!is_zerocopy && vis_type == SAM3_VISUALIZATION::VIS_BBOX) + { + // Copy the small tensor outputs (boxes and logits) to CPU for drawing text labels + cudaMemcpyAsync(output_cpu[2], output_gpu[2], output_sizes[2], cudaMemcpyDeviceToHost, sam3_stream); + cudaMemcpyAsync(output_cpu[3], output_gpu[3], output_sizes[3], cudaMemcpyDeviceToHost, sam3_stream); + + cudaMemcpyAsync( + (void*)result.data, + (void*)gpu_result, + opencv_inbytes, + cudaMemcpyDeviceToHost, + sam3_stream); + } else if (!is_zerocopy) { cudaMemcpyAsync( diff --git a/python/export_tokenizer.py b/python/export_tokenizer.py new file mode 100644 index 0000000..b786d43 --- /dev/null +++ b/python/export_tokenizer.py @@ -0,0 +1,16 @@ +from transformers import AutoProcessor +import os + +def main(): + if not os.path.exists('onnx_weights'): + os.makedirs('onnx_weights') + + print("Loading SAM3 Processor...") + processor = 
AutoProcessor.from_pretrained("facebook/sam3") + + print("Saving tokenizer configuration to onnx_weights/tokenizer.json") + processor.tokenizer.save_pretrained("onnx_weights/") + print("Done!") + +if __name__ == "__main__": + main() diff --git a/python/onnxexport.py b/python/onnxexport.py index e0432cc..1d8aeb9 100755 --- a/python/onnxexport.py +++ b/python/onnxexport.py @@ -42,7 +42,7 @@ def forward(self, pixel_values, input_ids, attention_mask): input_ids=input_ids, attention_mask=attention_mask) - return outputs.pred_masks, outputs.semantic_seg + return outputs.pred_masks, outputs.semantic_seg, outputs.pred_boxes, outputs.pred_logits wrapper = Sam3ONNXWrapper(model).to(device).eval() @@ -56,7 +56,7 @@ def forward(self, pixel_values, input_ids, attention_mask): (pixel_values, input_ids, attention_mask), onnx_path, input_names=["pixel_values", "input_ids", "attention_mask"], - output_names=["instance_masks", "semantic_seg"], + output_names=["instance_masks", "semantic_seg", "pred_boxes", "pred_logits"], dynamo=False, opset_version=17, ) diff --git a/python/tokenize_prompt.py b/python/tokenize_prompt.py new file mode 100644 index 0000000..7fea4db --- /dev/null +++ b/python/tokenize_prompt.py @@ -0,0 +1,43 @@ +import os +import sys + +def main(): + if len(sys.argv) < 2: + print("Usage: python3 tokenize.py 'your prompt text'") + sys.exit(1) + + prompt = sys.argv[1] + + # Check if transformers is available + try: + from transformers import AutoProcessor + except ImportError as e: + import traceback + traceback.print_exc(file=sys.stderr) + print("-1") # Signal failure to C++ + sys.exit(1) + + # We load standard SAM3 tokenizer + # We suppress huggingface warnings so they dont pollute stdout + os.environ["TRANSFORMERS_VERBOSITY"] = "error" + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + try: + processor = AutoProcessor.from_pretrained("facebook/sam3") + # Call the text tokenizer directly to bypass the Image/Video extractor requirements + inputs = 
processor.tokenizer(text=prompt, return_tensors="pt") + + # input_ids is typically shape [1, N] + ids = inputs.input_ids[0].tolist() + + # print comma separated string to stdout for C++ to read + print(",".join(map(str, ids))) + + except Exception as e: + import traceback + traceback.print_exc(file=sys.stderr) + print("-1") # Signal failure + sys.exit(1) + +if __name__ == "__main__": + main()