From f660833814ec7a580098ad56c054fedf4ffe450b Mon Sep 17 00:00:00 2001 From: priyadarshini75 Date: Thu, 12 Mar 2026 18:04:35 +0530 Subject: [PATCH 1/2] feat: Enable dynamic image size processing and enhance benchmarking in the SAM3 application, updating documentation and gitignore. --- .gitignore | 4 +++ README.md | 12 ++++++-- cpp/include/sam3.cuh | 2 +- cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp | 37 +++++++++++++++++-------- cpp/src/sam3/sam3_trt/sam3.cu | 35 ++++++++++++++++++----- 5 files changed, 67 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index ee1bdf6..f93b4f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ *.DS_Store *build +.results +*.plan +onnx_weights/ +test_images/ \ No newline at end of file diff --git a/README.md b/README.md index 4f2ef02..2c5f3c6 100755 --- a/README.md +++ b/README.md @@ -112,11 +112,11 @@ docker run -it --rm \ ```bash python python/onnxexport.py ``` -This produces `onnx_weights/sam3_static.onnx` plus external weight shards. +This produces `onnx_weights/sam3_dynamic.onnx` plus external weight shards. 5) Build a TensorRT engine ```bash -trtexec --onnx=onnx_weights/sam3_static.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose +trtexec --onnx=onnx_weights/sam3_dynamic.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose ``` 6) Build the C++/CUDA library and sample app @@ -181,4 +181,10 @@ TensorRT + CUDA (benchmark mode disables output writes): If this saved you time, drop a ⭐ so others can find it and ship SAM-3 faster. # Disclaimer -All views expressed here are my own. This project is not affiliated with my employer. \ No newline at end of file +All views expressed here are my own. This project is not affiliated with my employer. 
+ +cd /workspace/cpp/build +make +./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan + +cd /workspace/cpp/build && make && ./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan 1 diff --git a/cpp/include/sam3.cuh b/cpp/include/sam3.cuh index 03efa19..118c057 100755 --- a/cpp/include/sam3.cuh +++ b/cpp/include/sam3.cuh @@ -52,7 +52,7 @@ private: cudaStream_t sam3_stream; dim3 bsize; dim3 gsize; - int in_width, in_height, opencv_inbytes; + int in_width, in_height, opencv_inbytes = 0; std::vector input_cpu; std::vector input_gpu; diff --git a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp index 3f7c3d8..97f6c4c 100755 --- a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp +++ b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp @@ -95,7 +95,7 @@ int main(int argc, char* argv[]) SAM3_PCS pcs(epath, vis_alpha, probability_threshold); cv::Mat img, result; - char* raw_bytes; + int prev_width = 0, prev_height = 0; std::filesystem::create_directories("results"); int num_images_read=0; @@ -111,24 +111,26 @@ int main(int argc, char* argv[]) pcs.set_prompt(iid, iam); + const int MAX_BENCHMARK_IMAGES = 100; + for (const auto& fname : std::filesystem::directory_iterator(in_dir)) { if (std::filesystem::is_regular_file(fname.path())) { std::filesystem::path outfile = std::filesystem::path("results") / fname.path().filename(); - if (num_images_read==0) + img = cv::imread(fname.path(), cv::IMREAD_COLOR); + if (img.empty()) continue; + + result.create(img.rows, img.cols, img.type()); + + if (img.cols != prev_width || img.rows != prev_height) { - cv::Mat tmp = cv::imread(fname.path(), cv::IMREAD_COLOR); - raw_bytes = (char *)malloc(tmp.total()*tmp.elemSize()); - read_image_into_buffer(fname.path(), raw_bytes, img); - result = cv::imread(fname.path(), cv::IMREAD_COLOR); pcs.pin_opencv_matrices(img, result); + prev_width = img.cols; + prev_height = img.rows; } - else - { - read_image_into_buffer(fname.path(), raw_bytes, img); - } + start = 
std::chrono::system_clock::now(); infer_one_image(pcs, img, result, visualize, outfile, benchmark); num_images_read++; @@ -138,9 +140,20 @@ int main(int argc, char* argv[]) if (num_images_read>0 && num_images_read%10==0) { - float msec_per_image = millis_elapsed/num_images_read; - printf("Processed %d images at %f msec/image\n", num_images_read, msec_per_image); + printf("Processed %d images...\n", num_images_read); } + + if (num_images_read >= MAX_BENCHMARK_IMAGES) break; } } + + if (num_images_read > 0) + { + float msec_per_image = millis_elapsed/num_images_read; + float est_1000_min = msec_per_image * 1000.0f / 1000.0f / 60.0f; + printf("\n=== Benchmark Results ===\n"); + printf("Processed %d images in %.1f s\n", num_images_read, millis_elapsed/1000.0f); + printf("Average: %.2f msec/image\n", msec_per_image); + printf("Estimated time for 1000 images: %.1f min\n", est_1000_min); + } } diff --git a/cpp/src/sam3/sam3_trt/sam3.cu b/cpp/src/sam3/sam3_trt/sam3.cu index 9bcab00..e892b47 100755 --- a/cpp/src/sam3/sam3_trt/sam3.cu +++ b/cpp/src/sam3/sam3_trt/sam3.cu @@ -1,5 +1,21 @@ #include "sam3.cuh" +static size_t datatype_size(nvinfer1::DataType dtype) +{ + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kINT64: return 8; + case nvinfer1::DataType::kBOOL: return 1; + case nvinfer1::DataType::kBF16: return 2; + case nvinfer1::DataType::kFP8: return 1; + default: return 4; + } +} + SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const float prob_threshold) : _engine_path(engine_path) , _overlay_alpha(vis_alpha) @@ -23,19 +39,24 @@ SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const f void SAM3_PCS::pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat) { + // Free previous GPU buffers if re-allocating (image size changed) + if 
(!is_zerocopy && opencv_inbytes > 0) + { + if (opencv_input) { cudaFree(opencv_input); opencv_input = nullptr; } + if (gpu_result) { cudaFree(gpu_result); gpu_result = nullptr; } + } + opencv_inbytes = input_mat.total() * input_mat.elemSize(); - cuda_check(cudaHostRegister( + if (is_zerocopy) + { + cuda_check(cudaHostRegister( input_mat.data, opencv_inbytes, cudaHostRegisterDefault), " pinning opencv input Mat on host" ); - // for most purposes the default flag is good enough, in my benchmarking - // using others say readonly flag did not improve performance - if (is_zerocopy) - { cuda_check(cudaHostRegister( result_mat.data, opencv_inbytes, @@ -53,7 +74,7 @@ void SAM3_PCS::pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat) } else { - // on dGPU allocate additional memory for input + // on dGPU allocate GPU-side buffers for image data cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " allocating opencv input memory on a dGPU system"); cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " allocating result memory on a dGPU system"); cudaMemset(opencv_input, 0, opencv_inbytes); @@ -274,7 +295,7 @@ void SAM3_PCS::allocate_io_buffers() nvinfer1::TensorIOMode mode = trt_engine->getTensorIOMode(name); nvinfer1::Dims dims = trt_engine->getTensorShape(name); - size_t nbytes = sizeof(trt_engine->getTensorDataType(name)); + size_t nbytes = datatype_size(trt_engine->getTensorDataType(name)); for (int idx=0;idx < MAX_DIMS; idx++) { From 23f47f69355f3db3e1fe0d08dd04301629ba71de Mon Sep 17 00:00:00 2001 From: priyadarshini75 Date: Fri, 13 Mar 2026 12:34:40 +0530 Subject: [PATCH 2/2] added-native-bbox-detection --- README.md | 25 ++++- cpp/CMakeLists.txt | 2 + cpp/include/prepost.cuh | 14 +++ cpp/include/sam3.hpp | 3 +- cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp | 140 +++++++++++++++--------- cpp/src/sam3/sam3_trt/prepost.cu | 65 +++++++++++ cpp/src/sam3/sam3_trt/sam3.cu | 45 ++++++++ python/export_tokenizer.py | 16 +++ python/onnxexport.py | 4 +- 
python/tokenize_prompt.py | 43 ++++++++ 10 files changed, 300 insertions(+), 57 deletions(-) create mode 100644 python/export_tokenizer.py create mode 100644 python/tokenize_prompt.py diff --git a/README.md b/README.md index 2c5f3c6..b423034 100755 --- a/README.md +++ b/README.md @@ -183,8 +183,27 @@ If this saved you time, drop a ⭐ so others can find it and ship SAM-3 faster. # Disclaimer All views expressed here are my own. This project is not affiliated with my employer. +## Dynamic Bounding Box Detection (New!) +The application has been extended to support **Native Bounding Box Detection** directly from the SAM3 model outputs, as well as **Dynamic Text Prompting** without hardcoded tokens. + +### Setup Tokenizer +Because the C++ application relies on HuggingFace tokenization, you must first export the tokenizer files: +```bash +python3 python/export_tokenizer.py +``` +*This will create `tokenizer.json` in the `onnx_weights/` directory for the Python script to use.* + +### Run Bounding Box Visualization +Run the application with your target prompt as the 3rd argument. The C++ application will dynamically tokenize the prompt, run the TensorRT engine, and draw green bounding boxes with the text label above them. 
+```bash cd /workspace/cpp/build -make -./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan +make -j +./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet" +``` -cd /workspace/cpp/build && make && ./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan 1 +### Benchmark Bounding Box Inference +To test the raw speed of the `.plan` engine executing the prompt and calculating bounding boxes (without the latency of OpenCV drawing and saving the images), append `1` to the end of the command: +```bash +cd /workspace/cpp/build +./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet" 1 +``` diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 077a7d2..dcfc53a 100755 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -30,6 +30,8 @@ include_directories( ${CUDNN_ROOT_DIR}/include ) + + add_library(sam3_trt SHARED src/sam3/sam3_trt/sam3.cu src/sam3/sam3_trt/prepost.cu diff --git a/cpp/include/prepost.cuh b/cpp/include/prepost.cuh index b8d7329..2a9a476 100755 --- a/cpp/include/prepost.cuh +++ b/cpp/include/prepost.cuh @@ -51,6 +51,20 @@ __global__ void draw_instance_seg_mask( float3* color_palette ); +__global__ void draw_bounding_box( + float* boxes, + float* logits, + uint8_t* result, + int src_width, + int src_height, + int src_channels, + int max_boxes, + int box_idx, + float prob_threshold, + float3* color_palette, + int thickness +); + static std::vector colpal = { make_float3( 0, 185, 118), // teal (your original) make_float3(230, 159, 0), // orange diff --git a/cpp/include/sam3.hpp b/cpp/include/sam3.hpp index 8245084..e05d373 100755 --- a/cpp/include/sam3.hpp +++ b/cpp/include/sam3.hpp @@ -15,7 +15,8 @@ typedef enum { typedef enum { VIS_NONE, VIS_SEMANTIC_SEGMENTATION, - VIS_INSTANCE_SEGMENTATION + VIS_INSTANCE_SEGMENTATION, + VIS_BBOX } SAM3_VISUALIZATION; typedef struct { diff --git a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp index 97f6c4c..30b2957 100755 --- 
a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp +++ b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp @@ -3,33 +3,27 @@ #include #include #include - -void read_image_into_buffer(const std::string imgpath, char* raw_buffer, cv::Mat& buffer) -{ - size_t file_size = std::filesystem::file_size(imgpath); - if (file_size==0) - { - std::stringstream err; - err << "Image file is empty"; - throw std::runtime_error(err.str()); +#include +#include +#include +#include + +// Helper to execute bash command and read stdout +std::string exec_python_tokenizer(const std::string& prompt) { + // Assuming the app is run from /workspace/cpp/build or /workspace + std::string cmd = "python3 /workspace/python/tokenize_prompt.py \"" + prompt + "\""; + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) { + throw std::runtime_error("popen() failed!"); } - - std::ifstream file(imgpath, std::ios::binary); - - if (!file.is_open()) - { - std::stringstream err; - err << "File " << imgpath << " could not be opened. 
Please check permissions\n"; - throw std::runtime_error(err.str()); + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + result += buffer.data(); } - - file.read(raw_buffer, file_size); - file.close(); - - cv::Mat raw_mat(1, static_cast(file_size), CV_8UC1, raw_buffer); - // just a wrapper, minimal allocation - - cv::imdecode(raw_mat, cv::IMREAD_COLOR, &buffer); + // Remove trailing newlines + result.erase(std::remove(result.begin(), result.end(), '\n'), result.end()); + return result; } void infer_one_image(SAM3_PCS& pcs, @@ -37,13 +31,35 @@ void infer_one_image(SAM3_PCS& pcs, cv::Mat& result, const SAM3_VISUALIZATION vis, const std::string outfile, + const std::string prompt, bool benchmark_run) { bool success = pcs.infer_on_image(img, result, vis); - if (benchmark_run) + if (benchmark_run) return; + + if (vis == SAM3_VISUALIZATION::VIS_BBOX) { - return; + // CPU-side box coordinates and logits copied over by sam3.cu + float* boxes = static_cast(pcs.output_cpu[2]); + float* logits = static_cast(pcs.output_cpu[3]); + int num_boxes = 200; + + for (int i=0; i < num_boxes; i++) { + float logit = logits[i]; + float prob = 1.0f / (1.0f + std::exp(-logit)); + if (prob > 0.5f) { // threshold match + float x1 = boxes[i * 4 + 0]; + float y1 = boxes[i * 4 + 1]; + int x_min = std::max(0, (int)(x1 * img.cols)); + int y_min = std::max(0, (int)(y1 * img.rows)); + + // Draw text slightly above the bounding box + // Reduced font scale from 0.9 to 0.5, thickness from 2 to 1 + cv::putText(result, prompt, cv::Point(x_min, std::max(15, y_min - 6)), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1); + } + } } if (vis == SAM3_VISUALIZATION::VIS_NONE) @@ -59,38 +75,32 @@ void infer_one_image(SAM3_PCS& pcs, int main(int argc, char* argv[]) { - if (argc < 3) + if (argc < 4) { - std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine " << std::endl; + std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine prompt " << std::endl; return 0; } const 
std::string in_dir = argv[1]; std::string epath = argv[2]; - bool benchmark=false; // in benchmarking mode we dont save output images + std::string prompt = argv[3]; + bool benchmark = false; // in benchmarking mode we dont save output images - if (argc==4) + if (argc == 5) { - std::string b_arg = argv[3]; // should be 0 or 1 - try - { - benchmark = (b_arg == "1"); - } - catch(const std::exception) - { - std::cout << "Unrecognized benchmark type " << argv[3] << std::endl; - } + benchmark = (std::string(argv[4]) == "1"); } + std::cout << "Target Prompt: " << prompt << std::endl; std::cout << "Benchmarking: " << benchmark << std::endl; auto start = std::chrono::system_clock::now(); auto end = std::chrono::system_clock::now(); std::chrono::duration diff; - float millis_elapsed = 0.0; // int will overflow after ~650 hours + float millis_elapsed = 0.0; const float vis_alpha = 0.3; const float probability_threshold = 0.5; - const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION; + const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_BBOX; SAM3_PCS pcs(epath, vis_alpha, probability_threshold); @@ -100,15 +110,43 @@ int main(int argc, char* argv[]) std::filesystem::create_directories("results"); int num_images_read=0; - // tokenized version of 'person' - std::vector iid={49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, - 49407, 49407}; - - std::vector iam={1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; + // Tokenize the prompt + std::vector iid(32, 49407); // 49407 is usually the PAD/EOS token for SAM3 text encoder + std::vector iam(32, 0); + + try { + std::cout << "Calling Python to tokenize prompt: '" << prompt << "'..." 
<< std::endl; + std::string py_out = exec_python_tokenizer(prompt); + + if (py_out == "-1" || py_out.empty()) { + throw std::runtime_error("Python tokenizer script failed."); + } + // Parse comma separated string + std::vector ids; + std::stringstream ss(py_out); + std::string token; + while (std::getline(ss, token, ',')) { + ids.push_back(std::stoi(token)); + } + + for (size_t i = 0; i < ids.size() && i < 32; ++i) { + iid[i] = ids[i]; + iam[i] = 1; // 1 for real tokens, 0 for pad + } + std::cout << "Successfully tokenized prompt into " << ids.size() << " tokens." << std::endl; + + } catch (const std::exception& e) { + std::cerr << "Tokenizer error: " << e.what() << std::endl; + std::cout << "Falling back to 'person' tokens.\n"; + iid = {49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, + 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, + 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, + 49407, 49407}; + iam = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + } + pcs.set_prompt(iid, iam); const int MAX_BENCHMARK_IMAGES = 100; @@ -132,7 +170,7 @@ int main(int argc, char* argv[]) } start = std::chrono::system_clock::now(); - infer_one_image(pcs, img, result, visualize, outfile, benchmark); + infer_one_image(pcs, img, result, visualize, outfile, prompt, benchmark); num_images_read++; end = std::chrono::system_clock::now(); diff = end - start; diff --git a/cpp/src/sam3/sam3_trt/prepost.cu b/cpp/src/sam3/sam3_trt/prepost.cu index fb29316..0ee7b00 100755 --- a/cpp/src/sam3/sam3_trt/prepost.cu +++ b/cpp/src/sam3/sam3_trt/prepost.cu @@ -174,4 +174,69 @@ __global__ void draw_instance_seg_mask( } } } +} + +__global__ void draw_bounding_box( + float* boxes, + float* logits, + uint8_t* result, + int src_width, + int src_height, + int src_channels, + int max_boxes, + int box_idx, + float prob_threshold, + float3* color_palette, + int thickness +) +{ + // One thread 
per pixel in the image. Block size e.g., 16x16 + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= src_width || y >= src_height) return; + + // Check if the box is valid (prob > threshold) + // Logits: 1 / (1 + exp(-logit)) + float logit = logits[box_idx]; + float prob = 1.0f / (1.0f + exp(-logit)); + if (prob <= prob_threshold) return; + + // Get color from palette + float3 color = color_palette[box_idx % 20]; + + // Read box coords [x_min, y_min, x_max, y_max] normalized 0-1 + float x1 = boxes[box_idx * 4 + 0]; + float y1 = boxes[box_idx * 4 + 1]; + float x2 = boxes[box_idx * 4 + 2]; + float y2 = boxes[box_idx * 4 + 3]; + + // Convert to pixel coordinates + int x_min = max(0, (int)(x1 * src_width)); + int y_min = max(0, (int)(y1 * src_height)); + int x_max = min(src_width - 1, (int)(x2 * src_width)); + int y_max = min(src_height - 1, (int)(y2 * src_height)); + + // Check if current thread pixel is ON the border of the bounding box + bool is_border = false; + + // Check horizontal borders (top and bottom) + if (x >= x_min && x <= x_max) { + if (abs(y - y_min) < thickness || abs(y - y_max) < thickness) { + is_border = true; + } + } + // Check vertical borders (left and right) + if (y >= y_min && y <= y_max) { + if (abs(x - x_min) < thickness || abs(x - x_max) < thickness) { + is_border = true; + } + } + + if (is_border) { + int res_loc = (y * src_width + x) * src_channels; + result[res_loc] = (uint8_t)color.x; + result[res_loc + 1] = (uint8_t)color.y; + result[res_loc + 2] = (uint8_t)color.z; + } } \ No newline at end of file diff --git a/cpp/src/sam3/sam3_trt/sam3.cu b/cpp/src/sam3/sam3_trt/sam3.cu index e892b47..95800d6 100755 --- a/cpp/src/sam3/sam3_trt/sam3.cu +++ b/cpp/src/sam3/sam3_trt/sam3.cu @@ -145,12 +145,57 @@ void SAM3_PCS::visualize_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VIS gpu_colpal); } } + else if (vis_type == SAM3_VISUALIZATION::VIS_BBOX) + { + // First copy the original image 
to the result + cuda_check(cudaMemcpyAsync((void *)gpu_result, + (void *)input_ptr, + opencv_inbytes, + cudaMemcpyDeviceToDevice, + sam3_stream), " async memcpy for result during bbox visualization"); + + dim3 bbsize(16, 16); + dim3 bgsize; + bgsize.x = (input.cols + bbsize.x - 1) / bbsize.x; + bgsize.y = (input.rows + bbsize.y - 1) / bbsize.y; + + int num_boxes = 200; // SAM3 predicts 200 boxes max generally + + // Loop over all boxes and launch drawing kernel for each + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + draw_bounding_box<<>>( + static_cast(output_gpu[2]), // pred_boxes + static_cast(output_gpu[3]), // pred_logits + gpu_result, + input.cols, + input.rows, + input.channels(), + num_boxes, + box_idx, + _probability_threshold, + gpu_colpal, + 2); // thickness 2 + } + } if (!is_zerocopy && vis_type == SAM3_VISUALIZATION::VIS_NONE) { cudaMemcpyAsync(output_cpu[0], output_gpu[0],output_sizes[0], cudaMemcpyDeviceToHost, sam3_stream); cudaMemcpyAsync(output_cpu[1], output_gpu[1],output_sizes[1], cudaMemcpyDeviceToHost, sam3_stream); } + else if (!is_zerocopy && vis_type == SAM3_VISUALIZATION::VIS_BBOX) + { + // Copy the small tensor outputs (boxes and logits) to CPU for drawing text labels + cudaMemcpyAsync(output_cpu[2], output_gpu[2], output_sizes[2], cudaMemcpyDeviceToHost, sam3_stream); + cudaMemcpyAsync(output_cpu[3], output_gpu[3], output_sizes[3], cudaMemcpyDeviceToHost, sam3_stream); + + cudaMemcpyAsync( + (void*)result.data, + (void*)gpu_result, + opencv_inbytes, + cudaMemcpyDeviceToHost, + sam3_stream); + } else if (!is_zerocopy) { cudaMemcpyAsync( diff --git a/python/export_tokenizer.py b/python/export_tokenizer.py new file mode 100644 index 0000000..b786d43 --- /dev/null +++ b/python/export_tokenizer.py @@ -0,0 +1,16 @@ +from transformers import AutoProcessor +import os + +def main(): + if not os.path.exists('onnx_weights'): + os.makedirs('onnx_weights') + + print("Loading SAM3 Processor...") + processor = 
AutoProcessor.from_pretrained("facebook/sam3") + + print("Saving tokenizer configuration to onnx_weights/tokenizer.json") + processor.tokenizer.save_pretrained("onnx_weights/") + print("Done!") + +if __name__ == "__main__": + main() diff --git a/python/onnxexport.py b/python/onnxexport.py index e0432cc..1d8aeb9 100755 --- a/python/onnxexport.py +++ b/python/onnxexport.py @@ -42,7 +42,7 @@ def forward(self, pixel_values, input_ids, attention_mask): input_ids=input_ids, attention_mask=attention_mask) - return outputs.pred_masks, outputs.semantic_seg + return outputs.pred_masks, outputs.semantic_seg, outputs.pred_boxes, outputs.pred_logits wrapper = Sam3ONNXWrapper(model).to(device).eval() @@ -56,7 +56,7 @@ def forward(self, pixel_values, input_ids, attention_mask): (pixel_values, input_ids, attention_mask), onnx_path, input_names=["pixel_values", "input_ids", "attention_mask"], - output_names=["instance_masks", "semantic_seg"], + output_names=["instance_masks", "semantic_seg", "pred_boxes", "pred_logits"], dynamo=False, opset_version=17, ) diff --git a/python/tokenize_prompt.py b/python/tokenize_prompt.py new file mode 100644 index 0000000..7fea4db --- /dev/null +++ b/python/tokenize_prompt.py @@ -0,0 +1,43 @@ +import os +import sys + +def main(): + if len(sys.argv) < 2: + print("Usage: python3 tokenize.py 'your prompt text'") + sys.exit(1) + + prompt = sys.argv[1] + + # Check if transformers is available + try: + from transformers import AutoProcessor + except ImportError as e: + import traceback + traceback.print_exc(file=sys.stderr) + print("-1") # Signal failure to C++ + sys.exit(1) + + # We load standard SAM3 tokenizer + # We suppress huggingface warnings so they dont pollute stdout + os.environ["TRANSFORMERS_VERBOSITY"] = "error" + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + try: + processor = AutoProcessor.from_pretrained("facebook/sam3") + # Call the text tokenizer directly to bypass the Image/Video extractor requirements + inputs = 
processor.tokenizer(text=prompt, return_tensors="pt") + + # input_ids is typically shape [1, N] + ids = inputs.input_ids[0].tolist() + + # print comma separated string to stdout for C++ to read + print(",".join(map(str, ids))) + + except Exception as e: + import traceback + traceback.print_exc(file=sys.stderr) + print("-1") # Signal failure + sys.exit(1) + +if __name__ == "__main__": + main()