From e3e1899206b77ef700f66d4fe89cde23f715b63b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 23 Feb 2026 14:02:32 -0800 Subject: [PATCH 01/25] initial commit --- cuda_core/cuda/core/__init__.py | 8 + cuda_core/cuda/core/_kernel_arg_handler.pyx | 24 + cuda_core/cuda/core/_tensor_map.pxd | 12 + cuda_core/cuda/core/_tensor_map.pyx | 524 ++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 195 ++++++++ cuda_core/tests/test_tensor_map.py | 279 +++++++++++ 6 files changed, 1042 insertions(+) create mode 100644 cuda_core/cuda/core/_tensor_map.pxd create mode 100644 cuda_core/cuda/core/_tensor_map.pyx create mode 100644 cuda_core/examples/tma_tensor_map.py create mode 100644 cuda_core/tests/test_tensor_map.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index dfba887144..84e4f3d356 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,3 +68,11 @@ Stream, StreamOptions, ) +from cuda.core._tensor_map import ( + TensorMapDataType, + TensorMapDescriptor, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 882ca5eaab..e88def13cd 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -6,6 +6,7 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdint cimport (intptr_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t,) +from libc.string cimport memcpy from libcpp cimport bool as cpp_bool from libcpp.complex cimport complex as cpp_complex from libcpp cimport nullptr @@ -16,6 +17,8 @@ import ctypes import numpy from 
cuda.core._memory import Buffer +from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py +from cuda.core._tensor_map cimport TensorMapDescriptor from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver @@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64 cdef object numpy_complex128 = numpy.complex128 +cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py + + # limitation due to cython/cython#534 ctypedef void* voidptr @@ -124,6 +130,18 @@ cdef inline int prepare_arg( return 0 +cdef inline int prepare_tensor_map_arg( + vector.vector[void*]& data, + vector.vector[void*]& data_addresses, + TensorMapDescriptor arg, + const size_t idx) except -1: + cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + data_addresses[idx] = ptr + data[idx] = ptr + return 0 + + cdef inline int prepare_ctypes_arg( vector.vector[void*]& data, vector.vector[void*]& data_addresses, @@ -273,6 +291,9 @@ cdef class ParamHolder: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) continue + elif arg_type is tensor_map_descriptor_type: + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue elif arg_type is bool: prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i) continue @@ -322,6 +343,9 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue + elif isinstance(arg, tensor_map_descriptor_type): + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd new file mode 100644 index 0000000000..3cfd571d89 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -0,0 +1,12 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class TensorMapDescriptor: + cdef cydriver.CUtensorMap _tensor_map + cdef object _source_ref + + cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx new file mode 100644 index 0000000000..9375acf944 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t, uint32_t, uint64_t +from cuda.bindings cimport cydriver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +import enum + +import numpy + +from cuda.core._memoryview import StridedMemoryView + + +try: + from ml_dtypes import bfloat16 as ml_bfloat16 +except ImportError: + ml_bfloat16 = None + + +class TensorMapDataType(enum.IntEnum): + """Data types for tensor map descriptors. + + These correspond to the ``CUtensorMapDataType`` driver enum values. + """ + UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8 + UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16 + UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32 + INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32 + UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64 + INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64 + FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16 + FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32 + FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64 + BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 + FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ + TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 + TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ + + +class TensorMapInterleave(enum.IntEnum): + """Interleave layout for tensor map descriptors. 
+ + These correspond to the ``CUtensorMapInterleave`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE + INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B + INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B + + +class TensorMapSwizzle(enum.IntEnum): + """Swizzle mode for tensor map descriptors. + + These correspond to the ``CUtensorMapSwizzle`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE + SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B + SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B + SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B + + +class TensorMapL2Promotion(enum.IntEnum): + """L2 promotion mode for tensor map descriptors. + + These correspond to the ``CUtensorMapL2promotion`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE + L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B + L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B + L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B + + +class TensorMapOOBFill(enum.IntEnum): + """Out-of-bounds fill mode for tensor map descriptors. + + These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values. 
+ """ + NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + + +# Mapping from numpy dtype to TMA data type +_NUMPY_DTYPE_TO_TMA = { + numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, + numpy.dtype(numpy.uint16): TensorMapDataType.UINT16, + numpy.dtype(numpy.uint32): TensorMapDataType.UINT32, + numpy.dtype(numpy.int32): TensorMapDataType.INT32, + numpy.dtype(numpy.uint64): TensorMapDataType.UINT64, + numpy.dtype(numpy.int64): TensorMapDataType.INT64, + numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16, + numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32, + numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64, +} + +if ml_bfloat16 is not None: + _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16 + + +# Mapping from TMA data type to element size in bytes +_TMA_DATA_TYPE_SIZE = { + TensorMapDataType.UINT8: 1, + TensorMapDataType.UINT16: 2, + TensorMapDataType.UINT32: 4, + TensorMapDataType.INT32: 4, + TensorMapDataType.UINT64: 8, + TensorMapDataType.INT64: 8, + TensorMapDataType.FLOAT16: 2, + TensorMapDataType.FLOAT32: 4, + TensorMapDataType.FLOAT64: 8, + TensorMapDataType.BFLOAT16: 2, + TensorMapDataType.FLOAT32_FTZ: 4, + TensorMapDataType.TFLOAT32: 4, + TensorMapDataType.TFLOAT32_FTZ: 4, +} + + +def _resolve_data_type(view, data_type): + """Resolve the TMA data type from an explicit value or the view's dtype.""" + + if data_type is not None: + if not isinstance(data_type, TensorMapDataType): + raise TypeError( + f"data_type must be a TensorMapDataType, got {type(data_type)}") + return data_type + + dt = view.dtype + if dt is None: + raise ValueError( + "Cannot infer TMA data type from the tensor; " + "please specify data_type explicitly") + + tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) + if tma_dt is None: + raise ValueError( + f"Unsupported dtype {dt} for TMA; " + f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}. 
" + "You may also specify data_type explicitly.") + + return tma_dt + + +cdef class TensorMapDescriptor: + """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. + + A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct + used by the hardware TMA unit for efficient bulk data movement between + global and shared memory. + + Instances are created via the class methods :meth:`from_tiled` and + :meth:`from_im2col`, and can be passed directly to + :func:`~cuda.core.launch` as a kernel argument. + """ + + def __init__(self): + raise RuntimeError( + "TensorMapDescriptor cannot be instantiated directly. " + "Use TensorMapDescriptor.from_tiled() or " + "TensorMapDescriptor.from_im2col().") + + cdef void* _get_data_ptr(self): + return &self._tensor_map + + @staticmethod + def from_tiled(tensor, box_dim, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create a tiled TMA descriptor from a tensor object. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + box_dim : tuple of int + The size of each tile dimension (in elements). Must have the + same rank as the tensor and each value must be in [1, 256]. + Specified in the same (row-major) order as the tensor shape. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + Specified in the same (row-major) order as the tensor shape. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. 
+ l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [1, 5], the pointer is not + 16-byte aligned, or dimension/stride constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + # Keep a strong reference to prevent GC + desc._source_ref = tensor + + # Resolve data type + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + # Get tensor metadata + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides # in elements, can be None for C-contiguous + + cdef int rank = len(shape) + if rank < 1 or rank > 5: + raise ValueError( + f"Tensor rank must be between 1 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") + + if len(box_dim) != rank: + raise ValueError( + f"box_dim must have {rank} elements (same as tensor rank), " + f"got {len(box_dim)}") + + for i, bd in enumerate(box_dim): + if bd < 1 or bd > 256: + raise ValueError( + f"box_dim[{i}] must be in [1, 256], got {bd}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + # Compute byte strides from element strides + cdef int elem_size = 
_TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + # C-contiguous: strides in bytes, row-major + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse dimensions for column-major cuTensorMap convention + # Python/DLPack: row-major (dim 0 = outermost) + # cuTensorMap: column-major (dim 0 = innermost) + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides # rank - 1 elements + cdef uint32_t[5] c_box_dim + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + # Reverse: Python dim i -> cuTensorMap dim (rank - 1 - i) + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_box_dim[i_c] = box_dim[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + # globalStrides: rank-1 elements (byte strides for dims 1..N-1 in col-major order) + # The innermost stride (dim 0) is implicit = element size + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_interleave_int = int(interleave) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeTiled( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_box_dim, + c_element_strides, + c_interleave, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + return desc + + @staticmethod + def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + channels_per_pixel, 
pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col TMA descriptor from a tensor object. + + Im2col layout is used for convolution-style data access patterns. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner : tuple of int + Lower corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + pixel_box_upper_corner : tuple of int + Upper corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. 
+ """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + cdef int n_spatial = rank - 2 + if len(pixel_box_lower_corner) != n_spatial: + raise ValueError( + f"pixel_box_lower_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_lower_corner)}") + if len(pixel_box_upper_corner) != n_spatial: + raise ValueError( + f"pixel_box_upper_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_upper_corner)}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + 
cdef int[3] c_pixel_box_lower # max 3 spatial dims (rank 5 - 2) + cdef int[3] c_pixel_box_upper + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + # Reverse spatial dimensions for lower/upper corners + for i_c in range(n_spatial): + c_pixel_box_lower[i_c] = pixel_box_lower_corner[n_spatial - 1 - i_c] + c_pixel_box_upper[i_c] = pixel_box_upper_corner[n_spatial - 1 - i_c] + + cdef uint32_t c_rank = rank + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2col( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_pixel_box_lower, + c_pixel_box_upper, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + return desc + + def replace_address(self, tensor): + """Replace the global memory address in this tensor map descriptor. + + This is useful when the tensor data has been reallocated but the + shape, strides, and other parameters remain the same. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. 
+ """ + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + cdef intptr_t global_address = view.ptr + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + self._source_ref = tensor + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( + &self._tensor_map, + global_address, + )) + + def __repr__(self): + return f"TensorMapDescriptor()" diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py new file mode 100644 index 0000000000..de51570ec4 --- /dev/null +++ b/cuda_core/examples/tma_tensor_map.py @@ -0,0 +1,195 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors +# with cuda.core on Hopper+ GPUs (compute capability >= 9.0). +# +# TMA enables efficient bulk data movement between global and shared memory using +# hardware-managed tensor map descriptors. This example shows: +# +# 1. Creating a TMA tiled descriptor from a CuPy device array +# 2. Passing the descriptor to a kernel via launch() +# 3. Using TMA to load tiles into shared memory (via inline PTX) +# 4. 
Updating the descriptor's source address with replace_address() +# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data on the device +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +# --------------------------------------------------------------------------- +# 2) Create a TMA tiled descriptor +# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# The dtype (float32) is inferred automatically from the CuPy array. 
+# --------------------------------------------------------------------------- +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +# --------------------------------------------------------------------------- +# 3) Launch the kernel +# The TensorMapDescriptor is passed directly as a kernel argument — the +# 128-byte struct is copied into kernel parameter space automatically. +# --------------------------------------------------------------------------- +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 4) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py new file mode 100644 index 0000000000..0aa5968a85 --- /dev/null +++ b/cuda_core/tests/test_tensor_map.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import numpy as np + +from cuda.core import ( + Device, + TensorMapDescriptor, + TensorMapDataType, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) + + +@pytest.fixture +def dev(init_cuda): + return Device() + + +@pytest.fixture +def skip_if_no_tma(dev): + if not dev.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + + +def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): + """Allocate a device buffer and return it with proper alignment.""" + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) + return buf + + +class TestTensorMapEnums: + """Test that enum wrappers expose the expected values.""" + + def test_data_type_values(self): + assert TensorMapDataType.UINT8 == 0 + assert TensorMapDataType.FLOAT32 == 7 + assert TensorMapDataType.FLOAT64 == 8 + assert TensorMapDataType.BFLOAT16 == 9 + + def test_interleave_values(self): + assert TensorMapInterleave.NONE == 0 + assert TensorMapInterleave.INTERLEAVE_16B == 1 + assert TensorMapInterleave.INTERLEAVE_32B == 2 + + def test_swizzle_values(self): + assert TensorMapSwizzle.NONE == 0 + assert TensorMapSwizzle.SWIZZLE_32B == 1 + assert TensorMapSwizzle.SWIZZLE_64B == 2 + assert TensorMapSwizzle.SWIZZLE_128B == 3 + + def test_l2_promotion_values(self): + assert TensorMapL2Promotion.NONE == 0 + assert TensorMapL2Promotion.L2_64B == 1 + assert TensorMapL2Promotion.L2_128B == 2 + assert TensorMapL2Promotion.L2_256B == 3 + + def test_oob_fill_values(self): + assert TensorMapOOBFill.NONE == 0 + assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + + +class TestTensorMapDescriptorCreation: + """Test TensorMapDescriptor factory methods.""" + + def test_cannot_instantiate_directly(self): + with pytest.raises(RuntimeError, match="cannot be instantiated directly"): + 
TensorMapDescriptor() + + def test_from_tiled_1d(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) # 1024 float32 elements + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + assert repr(desc) == "TensorMapDescriptor()" + + def test_from_tiled_2d(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_3d(self, dev, skip_if_no_tma): + buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(8, 8, 8), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + ) + assert desc is not None + + def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + l2_promotion=TensorMapL2Promotion.L2_128B, + ) + assert desc is not None + + def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, + ) + assert desc is not None + + +class TestTensorMapDescriptorValidation: + """Test validation in TensorMapDescriptor factory methods.""" + + def test_invalid_rank_zero(self, dev, skip_if_no_tma): + buf = dev.allocate(64) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(), + 
data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="box_dim must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_out_of_range(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(512,), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="element_strides must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(1, 1), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_invalid_data_type(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=42, + ) + + +class TestTensorMapDtypeMapping: + """Test automatic dtype inference from numpy dtypes.""" + + @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + (np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ]) + def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt + + def test_bfloat16_mapping(self): + try: + from ml_dtypes import bfloat16 + from cuda.core._tensor_map import 
_NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 + except ImportError: + pytest.skip("ml_dtypes not installed") + + +class TestTensorMapReplaceAddress: + """Test replace_address functionality.""" + + def test_replace_address(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + buf2 = dev.allocate(1024 * 4) + desc.replace_address(buf2) + # No exception means success + + def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + # Create a host-only array (not device-accessible) + host_arr = np.zeros(1024, dtype=np.float32) + with pytest.raises(ValueError, match="device-accessible"): + desc.replace_address(host_arr) + + +class TestTensorMapIm2col: + """Test im2col TMA descriptor creation.""" + + def test_from_im2col_3d(self, dev, skip_if_no_tma): + # 3D tensor: batch=1, height=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0,), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(), + pixel_box_upper_corner=(), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + + def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1 * 32 * 64 * 4) + with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): + 
TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 77a3c8ecab4734eacbf7f36af964b103ca2ecf69 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 13:53:49 -0800 Subject: [PATCH 02/25] tma wide --- cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 208 +++++++++++++++++++++++++++- cuda_core/pixi.toml | 13 +- cuda_core/tests/test_tensor_map.py | 133 +++++++++++++++++- 5 files changed, 341 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 3cfd571d89..fa2254ba70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,5 +8,6 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9375acf944..6b072f7404 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -80,6 +80,16 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA +class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. 
+ Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + + # Mapping from numpy dtype to TMA data type _NUMPY_DTYPE_TO_TMA = { numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, @@ -316,6 +326,13 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc @staticmethod @@ -483,6 +500,176 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "im2col", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc + + @staticmethod + def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + channels_per_pixel, pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + mode=TensorMapIm2ColWideMode.W, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col-wide TMA descriptor from a tensor object. + + Im2col-wide layout loads elements exclusively along the W (width) + dimension. This variant is supported on compute capability 10.0+ + (Blackwell and later). + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner_width : int + Lower corner of the pixel bounding box along the W dimension. + pixel_box_upper_corner_width : int + Upper corner of the pixel bounding box along the W dimension. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. 
+ data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + mode : TensorMapIm2ColWideMode + Im2col wide mode. Default ``W``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``SWIZZLE_128B``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = 
elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc def replace_address(self, tensor): @@ -512,13 +699,28 @@ cdef class TensorMapDescriptor: f"Global memory address must be 16-byte aligned, " f"got address 0x{global_address:x}") - self._source_ref = tensor - with nogil: 
HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( &self._tensor_map, global_address, )) + # Update the source reference only after the driver call succeeds, + # so we don't drop the old tensor (risking a dangling pointer in the + # CUtensorMap struct) if the call fails. + self._source_ref = tensor + def __repr__(self): - return f"TensorMapDescriptor()" + info = self._repr_info + if info is None: + return "TensorMapDescriptor()" + parts = [] + if "method" in info: + parts.append(info["method"]) + if "rank" in info: + parts.append(f"rank={info['rank']}") + if "data_type" in info: + parts.append(f"dtype={info['data_type'].name}") + if "swizzle" in info: + parts.append(f"swizzle={info['swizzle'].name}") + return f"TensorMapDescriptor({', '.join(parts)})" diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index 9dc6ac1ed9..e351fb63b6 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,16 +75,9 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -[feature.cu12.system-requirements] -cuda = "12" - -[feature.cu12.dependencies] -cuda-version = "12.*" - -# We keep both cu12 and cu13 because cuda.core works with either major version -# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) -# For cu12 testing, use conda-forge packages: temporarily change path to wildcard -# or skip cu12 locally: pixi run -e cu13 test +# NOTE: cu12 environment is intentionally omitted because the path dependency +# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, +# use conda-forge packages or CI workflows. 
[environments] default = { features = [ "cu13", diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 0aa5968a85..f57e73ded8 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -9,6 +9,7 @@ Device, TensorMapDescriptor, TensorMapDataType, + TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, TensorMapOOBFill, @@ -36,6 +37,23 @@ def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): return buf +class _DeviceArray: + """Wrap a Buffer with explicit shape via __cuda_array_interface__. + + dev.allocate() returns a 1D byte buffer. For multi-dimensional TMA tests + we need the tensor to report a proper shape/dtype so the TMA encoder sees + the correct rank, dimensions, and strides. + """ + def __init__(self, buf, shape, dtype=np.float32): + self._buf = buf # prevent GC + self.__cuda_array_interface__ = { + "shape": tuple(shape), + "typestr": np.dtype(dtype).str, + "data": (int(buf.handle), False), + "version": 3, + } + + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -66,6 +84,10 @@ def test_oob_fill_values(self): assert TensorMapOOBFill.NONE == 0 assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + def test_im2col_wide_mode_values(self): + assert TensorMapIm2ColWideMode.W == 0 + assert TensorMapIm2ColWideMode.W128 == 1 + class TestTensorMapDescriptorCreation: """Test TensorMapDescriptor factory methods.""" @@ -82,7 +104,7 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) assert desc is not None - assert repr(desc) == "TensorMapDescriptor()" + assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)" def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 @@ -102,6 +124,18 @@ def test_from_tiled_3d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_5d(self, dev, skip_if_no_tma): + # 5D: 
exercises all 5 c_global_dim / 4 c_global_strides slots + shape = (2, 4, 4, 4, 8) + n_bytes = 2 * 4 * 4 * 4 * 8 * 4 # float32 + buf = dev.allocate(n_bytes) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(1, 2, 2, 2, 8), + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) desc = TensorMapDescriptor.from_tiled( @@ -277,3 +311,100 @@ def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): pixels_per_column=4, data_type=TensorMapDataType.FLOAT32, ) + + def test_from_im2col_4d(self, dev, skip_if_no_tma): + # NHWC layout: N=1, H=8, W=8, C=64 — 2 spatial dims + # Exercises spatial corner reversal with n_spatial=2: + # Python [H_lower, W_lower] -> driver [W_lower, H_lower] + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4, 4), + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_5d(self, dev, skip_if_no_tma): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 — 3 spatial dims + # Exercises the full spatial corner reversal: + # Python [D, H, W] -> driver [W, H, D] + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0, 0), + pixel_box_upper_corner=(2, 4, 4), + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + +class TestTensorMapIm2colWide: + """Test im2col-wide TMA descriptor creation (compute capability 10.0+).""" + + @pytest.fixture + def skip_if_no_im2col_wide(self, dev): + cc = dev.compute_capability + if cc.major < 10: + pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + + def test_from_im2col_wide_3d(self, dev, 
skip_if_no_im2col_wide): + # 3D tensor: batch=1, width=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_wide_4d(self, dev, skip_if_no_im2col_wide): + # NHWC layout: N=1, H=8, W=8, C=64 + # Wide mode only uses scalar W corners, even with higher rank + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 + # Max rank boundary — verifies all 5 dim/stride slots are filled + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 19c4a0f6ce25b30faffec5dbb0b78dca2496af8f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:15:56 -0800 Subject: [PATCH 03/25] clean up --- cuda_core/cuda/core/_tensor_map.pxd | 2 +- cuda_core/cuda/core/_tensor_map.pyx | 185 ++++++++++----------------- 
cuda_core/examples/tma_tensor_map.py | 2 +- cuda_core/tests/test_tensor_map.py | 2 +- 4 files changed, 72 insertions(+), 119 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index fa2254ba70..b2b165b5aa 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6b072f7404..9edcccb928 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -150,6 +150,55 @@ def _resolve_data_type(view, data_type): return tma_dt +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + # stream_ptr=-1: no stream synchronization needed because descriptor + # creation only reads tensor metadata, it does not move data. + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + if view.ptr % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{view.ptr:x}") + + return view + + +def _compute_byte_strides(shape, strides, elem_size): + """Compute byte strides from element strides or C-contiguous fallback. + + Returns a tuple of byte strides in row-major order. 
+ """ + if strides is not None: + return tuple(s * elem_size for s in strides) + + # C-contiguous: compute byte strides from shape, innermost first + rank = len(shape) + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + return tuple(byte_strides) + + +def _validate_element_strides(element_strides, rank): + """Validate or default element_strides to all-ones.""" + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + return element_strides + return (1,) * rank + + cdef class TensorMapDescriptor: """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. @@ -171,8 +220,8 @@ cdef class TensorMapDescriptor: cdef void* _get_data_ptr(self): return &self._tensor_map - @staticmethod - def from_tiled(tensor, box_dim, *, + @classmethod + def from_tiled(cls, tensor, box_dim, *, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, @@ -216,40 +265,23 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [1, 5], the pointer is not 16-byte aligned, or dimension/stride constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + cdef TensorMapDescriptor desc = cls.__new__(cls) - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") - - # Keep a strong reference to prevent GC + view = _get_validated_view(tensor) desc._source_ref = tensor - # Resolve data type tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - # Get tensor metadata cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides # in elements, can be None for C-contiguous cdef int rank = len(shape) if rank < 1 or rank > 5: raise ValueError( f"Tensor rank must be between 1 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") - if len(box_dim) != rank: raise ValueError( f"box_dim must have {rank} elements (same as tensor rank), " @@ -260,25 +292,10 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) - # Compute byte strides from element strides cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - # C-contiguous: strides in bytes, row-major - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - 
stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse dimensions for column-major cuTensorMap convention # Python/DLPack: row-major (dim 0 = outermost) @@ -335,8 +352,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + @classmethod + def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -390,17 +407,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. """ - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -409,18 +418,12 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - cdef int n_spatial = rank - 2 if len(pixel_box_lower_corner) != n_spatial: raise ValueError( @@ -431,23 +434,10 @@ cdef class TensorMapDescriptor: f"pixel_box_upper_corner must have {n_spatial} elements " f"(rank - 2), got {len(pixel_box_upper_corner)}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - 
f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -509,8 +499,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + @classmethod + def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -565,17 +555,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -584,35 +566,16 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -685,19 +648,9 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + view = _get_validated_view(tensor) cdef intptr_t global_address = view.ptr - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") with nogil: HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index de51570ec4..e93c02dc08 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index f57e73ded8..790307857b 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import pytest From 35a04b97367b5cf39a493d2ab6c325e70d970cb8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:55:00 -0800 Subject: [PATCH 04/25] Add comments to prepare_tensor_map_arg explaining allocation and lifetime Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index e88def13cd..b55a4abc7a 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,8 +135,13 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + # Allocate a temporary buffer for the 128-byte CUtensorMap struct. + # We copy rather than pointing directly at arg._tensor_map for lifetime + # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, + # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
data_addresses[idx] = ptr data[idx] = ptr return 0 From bb19e4f5c03cb65fd0d53a1cd1a58a8159acaee8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:04:26 -0800 Subject: [PATCH 05/25] Address Copilot review feedback - Remove unused _alloc_device_tensor helper from tests - Add test for rank > 5 (6D tensor) to verify upper bound validation - Add NULL check for PyMem_Malloc in prepare_tensor_map_arg Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 2 ++ cuda_core/tests/test_tensor_map.py | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index b55a4abc7a..5619ba7492 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -139,6 +139,8 @@ cdef inline int prepare_tensor_map_arg( # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + if ptr is NULL: + raise MemoryError("Failed to allocate memory for CUtensorMap") memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 790307857b..96dd14a419 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -28,14 +28,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") -def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): - """Allocate a device buffer and return it with proper alignment.""" - n_elements = 1 - for s in shape: - n_elements *= s - buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) - return buf - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -179,6 +171,19 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) + def test_invalid_rank_six(self, dev, skip_if_no_tma): + shape = (2, 2, 2, 2, 2, 2) + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * 4) + arr = _DeviceArray(buf, shape) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + arr, + box_dim=(2,) * 6, + ) + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="box_dim must have 1 elements"): From 23a8900834a1b62a163664d3ab31fcbdf29bba02 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:20:05 -0800 Subject: [PATCH 06/25] Split TMA example into two focused files Move the replace_address() demonstration into its own self-contained example (tma_replace_address.py) so each file covers a single concept. 
Co-Authored-By: Claude Opus 4.6 --- cuda_core/examples/tma_replace_address.py | 189 ++++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 20 --- 2 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 cuda_core/examples/tma_replace_address.py diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py new file mode 100644 index 0000000000..7f11cf551c --- /dev/null +++ b/cuda_core/examples/tma_replace_address.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use replace_address() to repoint a TMA +# (Tensor Memory Accelerator) descriptor at a different tensor without +# rebuilding the descriptor from scratch. +# +# The workflow is: +# +# 1. Create a TMA tiled descriptor and launch a kernel to verify it works +# 2. Allocate a second tensor with different content +# 3. Call replace_address() to repoint the same descriptor at the new tensor +# 4. Re-launch the kernel and verify it reads from the new tensor +# +# This is useful when the tensor layout (shape, dtype, tile size) stays the +# same but the underlying data buffer changes, e.g. double-buffering or +# iterating over a sequence of same-shaped tensors. 
+# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data and verify the initial TMA copy +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 2) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch 
without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index e93c02dc08..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -13,7 +13,6 @@ # 1. Creating a TMA tiled descriptor from a CuPy device array # 2. Passing the descriptor to a kernel via launch() # 3. Using TMA to load tiles into shared memory (via inline PTX) -# 4. Updating the descriptor's source address with replace_address() # # Requirements: # - Hopper or later GPU (compute capability >= 9.0) @@ -174,22 +173,3 @@ assert cp.array_equal(output, a), "TMA copy produced incorrect results" print(f"TMA copy verified: {N} elements across {n_tiles} tiles") - -# --------------------------------------------------------------------------- -# 4) Demonstrate replace_address() -# Create a second tensor with different content, point the *same* -# descriptor at it, and re-launch without rebuilding the descriptor. 
-# --------------------------------------------------------------------------- -b = cp.full(N, fill_value=42.0, dtype=cp.float32) -dev.sync() - -tensor_map.replace_address(b) - -output2 = cp.zeros(N, dtype=cp.float32) -dev.sync() - -launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output2, b), "replace_address produced incorrect results" -print("replace_address verified: descriptor reused with new source tensor") From 0a1b7202b9bbaee509c8a6b44cc47edb9d51d03b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 14:14:38 -0800 Subject: [PATCH 07/25] pre-commit --- cuda_core/cuda/core/__init__.py | 1 + cuda_core/examples/tma_replace_address.py | 1 - cuda_core/examples/tma_tensor_map.py | 1 - cuda_core/tests/test_tensor_map.py | 37 ++++++++++++----------- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..84e4f3d356 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,6 +60,7 @@ StridedMemoryView, args_viewable_as_strided_memory, ) +<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..21fb91c254 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,7 +30,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..c85c5d96bd 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,7 +25,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git 
a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 96dd14a419..1d6e8195b6 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,14 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import pytest - import numpy as np - +import pytest from cuda.core import ( Device, - TensorMapDescriptor, TensorMapDataType, + TensorMapDescriptor, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -28,7 +26,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -36,6 +33,7 @@ class _DeviceArray: we need the tensor to report a proper shape/dtype so the TMA encoder sees the correct rank, dimensions, and strides. """ + def __init__(self, buf, shape, dtype=np.float32): self._buf = buf # prevent GC self.__cuda_array_interface__ = { @@ -225,25 +223,30 @@ def test_invalid_data_type(self, dev, skip_if_no_tma): class TestTensorMapDtypeMapping: """Test automatic dtype inference from numpy dtypes.""" - @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ - (np.uint8, TensorMapDataType.UINT8), - (np.uint16, TensorMapDataType.UINT16), - (np.uint32, TensorMapDataType.UINT32), - (np.int32, TensorMapDataType.INT32), - (np.uint64, TensorMapDataType.UINT64), - (np.int64, TensorMapDataType.INT64), - (np.float16, TensorMapDataType.FLOAT16), - (np.float32, TensorMapDataType.FLOAT32), - (np.float64, TensorMapDataType.FLOAT64), - ]) + @pytest.mark.parametrize( + "np_dtype,expected_tma_dt", + [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + 
(np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ], + ) def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt def test_bfloat16_mapping(self): try: - from ml_dtypes import bfloat16 from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + from ml_dtypes import bfloat16 + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 44fbdcf80f7c917bbd9ef790da0c7001cd2b747f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:07:54 -0800 Subject: [PATCH 08/25] adding stride meta data to gpu allocated memory --- cuda_core/tests/test_tensor_map.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 1d6e8195b6..3e030dd29d 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -98,8 +98,9 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -107,8 +108,9 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + tensor = _DeviceArray(buf, (16, 16, 16)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(8, 8, 8), data_type=TensorMapDataType.FLOAT32, ) @@ -128,8 +130,9 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = 
TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, swizzle=TensorMapSwizzle.SWIZZLE_128B, @@ -138,8 +141,9 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, l2_promotion=TensorMapL2Promotion.L2_128B, @@ -148,8 +152,9 @@ def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, @@ -162,9 +167,10 @@ class TestTensorMapDescriptorValidation: def test_invalid_rank_zero(self, dev, skip_if_no_tma): buf = dev.allocate(64) + tensor = _DeviceArray(buf, ()) # 0-dim tensor with pytest.raises(ValueError, match="rank must be between 1 and 5"): TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(), data_type=TensorMapDataType.FLOAT32, ) @@ -286,8 +292,9 @@ class TestTensorMapIm2col: def test_from_im2col_3d(self, dev, skip_if_no_tma): # 3D tensor: batch=1, height=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col( - buf, + tensor, pixel_box_lower_corner=(0,), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -310,9 +317,10 @@ def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) # 3D: n_spatial = 1 with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): TensorMapDescriptor.from_im2col( - buf, + tensor, 
pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4,), channels_per_pixel=64, From bdf39a24efcb528a2610967ea5f6459c0fee2472 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:59:31 -0800 Subject: [PATCH 09/25] im2col fixes --- cuda_core/tests/test_tensor_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 3e030dd29d..8b5e27ce6f 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -373,8 +373,9 @@ def skip_if_no_im2col_wide(self, dev): def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col_wide( - buf, + tensor, pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, From 96a3e84275f8d88e6476b1e71ef04bf6fedba6c8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 3 Mar 2026 18:41:07 -0500 Subject: [PATCH 10/25] Reuse CCCL TMA descriptor construction for tiled TensorMap and keep validated views alive to avoid DLPack-backed pointer lifetime hazards. Add explicit tiled element-stride coverage and acknowledge the DLPack include-layout compatibility follow-up in NVIDIA/cccl#7871. 
Made-with: Cursor --- .gitignore | 1 + cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_cpp/tensor_map.cpp | 149 +++++++++ cuda_core/cuda/core/_cpp/tensor_map_cccl.h | 43 +++ cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 358 ++++++++++++++++----- cuda_core/examples/tma_replace_address.py | 1 + cuda_core/examples/tma_tensor_map.py | 1 + cuda_core/tests/test_tensor_map.py | 27 +- 9 files changed, 497 insertions(+), 85 deletions(-) create mode 100644 cuda_core/cuda/core/_cpp/tensor_map.cpp create mode 100644 cuda_core/cuda/core/_cpp/tensor_map_cccl.h diff --git a/.gitignore b/.gitignore index 7d9bcd38c3..18c919ca5e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp +!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_cpp/tensor_map.cpp b/cuda_core/cuda/core/_cpp/tensor_map.cpp new file mode 100644 index 0000000000..af09aa1b2f --- /dev/null +++ b/cuda_core/cuda/core/_cpp/tensor_map.cpp @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "tensor_map_cccl.h" + +#include + +#include +#include + +#if defined(__has_include) +# if __has_include() +# include +# define CUDA_CORE_HAS_CUDA_TMA 1 +# else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# endif +# if __has_include() +# include +# define CUDA_CORE_HAS_DLPACK_H 1 +# else +# define CUDA_CORE_HAS_DLPACK_H 0 +# endif +#else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# define CUDA_CORE_HAS_DLPACK_H 0 +#endif + +static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept +{ + if (!err || cap == 0) + return; + if (!msg) + { + err[0] = '\0'; + return; + } + size_t n = ::strlen(msg); + if (n >= cap) + n = cap - 1; + ::memcpy(err, msg, n); + err[n] = '\0'; +} + +int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept +{ +#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H) + (void)out_tensor_map; + (void)data; + (void)device_type; + (void)device_id; + (void)ndim; + (void)shape; + (void)strides; + (void)dtype_code; + (void)dtype_bits; + (void)dtype_lanes; + (void)box_sizes; + (void)elem_strides; + (void)interleave_layout; + (void)swizzle; + (void)l2_fetch_size; + (void)oob_fill; + cuda_core_write_err(err, err_cap, "CCCL and/or not available at build time"); + return 1; +#else + try + { + if (!out_tensor_map) + { + cuda_core_write_err(err, err_cap, "out_tensor_map is NULL"); + return 1; + } + if (!data) + { + cuda_core_write_err(err, err_cap, "tensor data pointer is NULL"); + return 1; + } + if (!shape || !box_sizes || ndim <= 0) + { + cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes"); + return 1; + } + + DLTensor t{}; 
+ t.data = data;
+ t.device = {static_cast<DLDeviceType>(device_type), device_id};
+ t.ndim = ndim;
+ t.dtype.code = dtype_code;
+ t.dtype.bits = dtype_bits;
+ t.dtype.lanes = dtype_lanes;
+ // CCCL promises not to mutate the arrays, but DLPack uses non-const pointers.
+ t.shape = const_cast<int64_t*>(shape);
+ t.strides = const_cast<int64_t*>(strides);
+ t.byte_offset = 0;
+int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, // length ndim + const int64_t* strides, // length ndim, or NULL for contiguous + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, // length ndim + const int* elem_strides, // length ndim, or NULL for all-ones overload + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CUDA_CORE_TENSOR_MAP_CCCL_H_ diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index b2b165b5aa..07e8adebc9 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _view_ref cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9edcccb928..6054add9b6 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -2,9 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t, uint32_t, uint64_t +from libc.stdint cimport intptr_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t +from libc.stddef cimport size_t from cuda.bindings cimport cydriver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._dlpack cimport kDLInt, kDLUInt, kDLFloat, kDLBfloat, _kDLCUDA import enum @@ -12,6 +14,27 @@ import numpy from cuda.core._memoryview import StridedMemoryView +cdef extern from "_cpp/tensor_map_cccl.h": + int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t 
dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) nogil + try: from ml_dtypes import bfloat16 as ml_bfloat16 @@ -80,14 +103,25 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA -class TensorMapIm2ColWideMode(enum.IntEnum): - """Im2col wide mode for tensor map descriptors. +IF CUDA_CORE_BUILD_MAJOR >= 13: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. - These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. - Supported on compute capability 10.0+. - """ - W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W - W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. + Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 +ELSE: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + This enum is always defined for API stability, but the + :meth:`TensorMapDescriptor.from_im2col_wide` factory requires a CUDA 13+ + build and will raise otherwise. 
+ """ + W = 0 + W128 = 1 # Mapping from numpy dtype to TMA data type @@ -150,6 +184,65 @@ def _resolve_data_type(view, data_type): return tma_dt +cdef inline bint _tma_dtype_to_dlpack( + object tma_dt, + uint8_t* out_code, + uint8_t* out_bits, + uint16_t* out_lanes, +) noexcept: + if tma_dt == TensorMapDataType.UINT8: + out_code[0] = kDLUInt + out_bits[0] = 8 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT16: + out_code[0] = kDLUInt + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT32: + out_code[0] = kDLUInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT64: + out_code[0] = kDLUInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT32: + out_code[0] = kDLInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT64: + out_code[0] = kDLInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT16: + out_code[0] = kDLFloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT32: + out_code[0] = kDLFloat + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT64: + out_code[0] = kDLFloat + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.BFLOAT16: + out_code[0] = kDLBfloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + return False + + def _get_validated_view(tensor): """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" if isinstance(tensor, StridedMemoryView): @@ -268,7 +361,11 @@ cdef class TensorMapDescriptor: cdef TensorMapDescriptor desc = cls.__new__(cls) view = _get_validated_view(tensor) + # Keep both the original tensor object and the validated view alive. + # For DLPack exporters, the view may hold the owning capsule whose + # deleter can free the backing allocation when released. 
desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -292,8 +389,92 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") + cdef bint elem_strides_provided = element_strides is not None element_strides = _validate_element_strides(element_strides, rank) + # Reuse CCCL/libcu++'s DLPack -> CUtensorMap conversion when possible. + # This avoids maintaining a second, independent validation/encoding implementation. + cdef uint8_t dl_code + cdef uint8_t dl_bits + cdef uint16_t dl_lanes + cdef int64_t c_shape[5] + cdef int64_t c_strides[5] + cdef int c_box_sizes[5] + cdef int c_elem_strides[5] + cdef const int64_t* c_strides_ptr + cdef const int* c_elem_strides_ptr + cdef char errbuf[512] + cdef int i_cccl + cdef int device_type + cdef int c_device_id + cdef int c_cccl_interleave_int + cdef int c_cccl_swizzle_int + cdef int c_cccl_l2_promotion_int + cdef int c_cccl_oob_fill_int + cdef int rc + if _tma_dtype_to_dlpack(tma_dt, &dl_code, &dl_bits, &dl_lanes): + c_strides_ptr = NULL + c_elem_strides_ptr = NULL + errbuf[0] = 0 + + for i_cccl in range(rank): + c_shape[i_cccl] = shape[i_cccl] + c_box_sizes[i_cccl] = box_dim[i_cccl] + if elem_strides_provided: + c_elem_strides[i_cccl] = element_strides[i_cccl] + + if view.strides is not None: + for i_cccl in range(rank): + c_strides[i_cccl] = view.strides[i_cccl] + c_strides_ptr = &c_strides[0] + + if elem_strides_provided: + c_elem_strides_ptr = &c_elem_strides[0] + + device_type = _kDLCUDA + c_device_id = view.device_id + c_cccl_interleave_int = int(interleave) + c_cccl_swizzle_int = int(swizzle) + c_cccl_l2_promotion_int = int(l2_promotion) + c_cccl_oob_fill_int = int(oob_fill) + + with nogil: + rc = cuda_core_cccl_make_tma_descriptor_tiled( + &desc._tensor_map, + global_address, + device_type, + c_device_id, + rank, + &c_shape[0], + c_strides_ptr, + dl_code, + dl_bits, + dl_lanes, + &c_box_sizes[0], 
+ c_elem_strides_ptr, + c_cccl_interleave_int, + c_cccl_swizzle_int, + c_cccl_l2_promotion_int, + c_cccl_oob_fill_int, + &errbuf[0], + sizeof(errbuf), + ) + + if rc == 0: + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc + + msg = errbuf[:].split(b"\0", 1)[0].decode("utf-8", errors="replace") + # If CCCL isn't available at build time, fall back to the direct + # driver API path to preserve functionality on older toolchains. + if "not available at build time" not in msg: + raise ValueError(f"Failed to build TMA descriptor via CCCL: {msg}") + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] byte_strides = _compute_byte_strides(shape, view.strides, elem_size) @@ -411,6 +592,7 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -447,6 +629,10 @@ cdef class TensorMapDescriptor: cdef int[3] c_pixel_box_upper cdef int i_c + for i_c in range(3): + c_pixel_box_lower[i_c] = 0 + c_pixel_box_upper[i_c] = 0 + for i_c in range(rank): c_global_dim[i_c] = shape[rank - 1 - i_c] c_element_strides[i_c] = element_strides[rank - 1 - i_c] @@ -555,85 +741,90 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = cls.__new__(cls) - - view = _get_validated_view(tensor) - desc._source_ref = tensor - - tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) - cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - - cdef intptr_t global_address = view.ptr - shape = view.shape - - cdef int rank = len(shape) - if rank < 3 or rank > 5: - raise ValueError( - f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - - element_strides = _validate_element_strides(element_strides, rank) - - cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - byte_strides = _compute_byte_strides(shape, view.strides, elem_size) - - # Reverse all dimension arrays for column-major convention - cdef uint64_t[5] c_global_dim - cdef uint64_t[4] c_global_strides - cdef uint32_t[5] c_element_strides - cdef int i_c - - for i_c in range(rank): - c_global_dim[i_c] = shape[rank - 1 - i_c] - c_element_strides[i_c] = element_strides[rank - 1 - i_c] + IF CUDA_CORE_BUILD_MAJOR < 13: + raise RuntimeError( + "TensorMapDescriptor.from_im2col_wide requires a CUDA 13+ build") + ELSE: + cdef TensorMapDescriptor desc = cls.__new__(cls) - for i_c in range(rank - 1): - c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + view = _get_validated_view(tensor) + desc._source_ref = tensor + desc._view_ref = view - cdef uint32_t c_rank = rank - cdef int c_lower_w = pixel_box_lower_corner_width - cdef int c_upper_w = pixel_box_upper_corner_width - cdef uint32_t c_channels = channels_per_pixel - cdef uint32_t c_pixels = pixels_per_column - cdef int c_interleave_int = int(interleave) - cdef int c_mode_int = int(mode) - cdef int c_swizzle_int = int(swizzle) - cdef int c_l2_promotion_int = int(l2_promotion) - cdef int c_oob_fill_int = int(oob_fill) - cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int - cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int - cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int - cdef 
cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int - cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - with nogil: - HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( - &desc._tensor_map, - c_data_type, - c_rank, - global_address, - c_global_dim, - c_global_strides, - c_lower_w, - c_upper_w, - c_channels, - c_pixels, - c_element_strides, - c_interleave, - c_mode, - c_swizzle, - c_l2_promotion, - c_oob_fill, - )) + cdef intptr_t global_address = view.ptr + shape = view.shape - desc._repr_info = { - "method": "im2col_wide", - "rank": rank, - "data_type": tma_dt, - "swizzle": swizzle, - } - - return desc + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + element_strides = _validate_element_strides(element_strides, rank) + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef 
cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc def replace_address(self, tensor): """Replace the global memory address in this tensor map descriptor. @@ -662,6 +853,7 @@ cdef class TensorMapDescriptor: # so we don't drop the old tensor (risking a dangling pointer in the # CUtensorMap struct) if the call fails. 
self._source_ref = tensor + self._view_ref = view def __repr__(self): info = self._repr_info diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 21fb91c254..7f11cf551c 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,6 +30,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index c85c5d96bd..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,6 +25,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 8b5e27ce6f..4db456d863 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -3,6 +3,7 @@ import numpy as np import pytest + from cuda.core import ( Device, TensorMapDataType, @@ -128,6 +129,29 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_with_element_strides_buffer(self, dev, skip_if_no_tma): + # Use a Buffer input (DLPack path) and explicit element_strides. + buf = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(2,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): + # Use a CAI-style tensor wrapper and explicit element_strides. 
+ buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(32, 32), + element_strides=(2, 1), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) @@ -250,9 +274,10 @@ def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): def test_bfloat16_mapping(self): try: - from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA from ml_dtypes import bfloat16 + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 1a6b41608658499486fa5705eaea93551425442b Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 08:00:36 -0500 Subject: [PATCH 11/25] Skip im2col-wide TensorMap tests when runtime support is unavailable. Probe support in the fixture and skip when cuda.core is built without CUDA 13 im2col-wide support or when the driver/GPU reports CUDA_ERROR_INVALID_VALUE, so unsupported RTXPRO6000 lanes don't block unrelated changes. 
Made-with: Cursor --- cuda_core/tests/test_tensor_map.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 4db456d863..2d6dc138c0 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -395,6 +395,31 @@ def skip_if_no_im2col_wide(self, dev): if cc.major < 10: pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + # Some environments in CI exercise this test module with a cuda.core + # build that does not include im2col-wide symbols (CUDA < 13 build), + # or with driver/GPU combinations that reject im2col-wide descriptor + # encoding for otherwise valid inputs. Probe once per test invocation + # and skip only for those known unsupported cases. + buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) + try: + TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + except RuntimeError as e: + if "requires a CUDA 13+ build" in str(e): + pytest.skip("Im2col-wide requires cuda.core built with CUDA 13+") + raise + except Exception as e: + if "CUDA_ERROR_INVALID_VALUE" in str(e): + pytest.skip("Im2col-wide unsupported on this driver/GPU combination") + raise + def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) From 892ee605e500b2e9225f7077bd66e61e1fc4eb97 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:41:46 -0500 Subject: [PATCH 12/25] Align TensorMap API surface with review feedback and enforce context safety. 
Expose only TensorMapDescriptor in cuda.core, add StridedMemoryView.as_tensor_map(), remove redundant tensor-map fallback packing, and track/check descriptor context/device compatibility before replacement and kernel launch argument packing. Made-with: Cursor --- .gitignore | 1 - cuda_core/cuda/core/__init__.py | 10 +---- cuda_core/cuda/core/_kernel_arg_handler.pyx | 4 +- cuda_core/cuda/core/_memoryview.pyx | 33 ++++++++++++++ cuda_core/cuda/core/_tensor_map.pxd | 4 ++ cuda_core/cuda/core/_tensor_map.pyx | 48 +++++++++++++++++++++ cuda_core/examples/tma_replace_address.py | 4 +- cuda_core/examples/tma_tensor_map.py | 11 ++--- cuda_core/tests/test_tensor_map.py | 15 ++++++- 9 files changed, 107 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 18c919ca5e..7d9bcd38c3 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp -!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..c32ff79ecc 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,12 +68,4 @@ Stream, StreamOptions, ) -from cuda.core._tensor_map import ( - TensorMapDataType, - TensorMapDescriptor, - TensorMapIm2ColWideMode, - TensorMapInterleave, - TensorMapL2Promotion, - TensorMapOOBFill, - TensorMapSwizzle, -) +from cuda.core._tensor_map import TensorMapDescriptor diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 5619ba7492..28a981fed2 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,6 +135,7 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + arg._check_context_compat() # Allocate a temporary buffer for 
the 128-byte CUtensorMap struct. # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. @@ -350,9 +351,6 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue - elif isinstance(arg, tensor_map_descriptor_type): - prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) - continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 0e1df726c0..78002b3ff1 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -316,6 +316,39 @@ cdef class StridedMemoryView: view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly) return view + def as_tensor_map( + self, + box_dim, + *, + element_strides=None, + data_type=None, + interleave=None, + swizzle=None, + l2_promotion=None, + oob_fill=None, + ): + """Create a tiled :obj:`TensorMapDescriptor` from this view. + + This is a convenience wrapper around + :meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`. 
+ """ + from cuda.core._tensor_map import TensorMapDescriptor + + kwargs = {} + if element_strides is not None: + kwargs["element_strides"] = element_strides + if data_type is not None: + kwargs["data_type"] = data_type + if interleave is not None: + kwargs["interleave"] = interleave + if swizzle is not None: + kwargs["swizzle"] = swizzle + if l2_promotion is not None: + kwargs["l2_promotion"] = l2_promotion + if oob_fill is not None: + kwargs["oob_fill"] = oob_fill + return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs) + def copy_from( self, other : StridedMemoryView, stream : Stream, allocator = None, diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 07e8adebc9..4c60b7fc70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -3,12 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from libc.stdint cimport intptr_t cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map + cdef int _device_id + cdef intptr_t _context cdef object _source_ref cdef object _view_ref cdef object _repr_info + cdef int _check_context_compat(self) except -1 cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6054add9b6..9d83a709cc 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -263,6 +263,22 @@ def _get_validated_view(tensor): return view +cdef inline intptr_t _get_current_context_ptr() except? 
0: + cdef cydriver.CUcontext ctx + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + if ctx == NULL: + raise RuntimeError("TensorMapDescriptor requires an active CUDA context") + return ctx + + +cdef inline int _get_current_device_id() except -1: + cdef cydriver.CUdevice dev + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + return dev + + def _compute_byte_strides(shape, strides, elem_size): """Compute byte strides from element strides or C-contiguous fallback. @@ -313,6 +329,28 @@ cdef class TensorMapDescriptor: cdef void* _get_data_ptr(self): return &self._tensor_map + cdef int _check_context_compat(self) except -1: + cdef cydriver.CUcontext current_ctx + cdef cydriver.CUdevice current_dev + cdef int current_dev_id + if self._context == 0 and self._device_id < 0: + return 0 + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(¤t_ctx)) + if current_ctx == NULL: + raise RuntimeError("TensorMapDescriptor requires an active CUDA context") + if self._context != 0 and current_ctx != self._context: + raise RuntimeError( + "TensorMapDescriptor was created in a different CUDA context") + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevice(¤t_dev)) + current_dev_id = current_dev + if self._device_id >= 0 and current_dev_id != self._device_id: + raise RuntimeError( + f"TensorMapDescriptor belongs to device {self._device_id}, " + f"but current device is {current_dev_id}") + return 0 + @classmethod def from_tiled(cls, tensor, box_dim, *, element_strides=None, @@ -366,6 +404,8 @@ cdef class TensorMapDescriptor: # deleter can free the backing allocation when released. 
desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -593,6 +633,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -750,6 +792,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -839,7 +883,11 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" + self._check_context_compat() view = _get_validated_view(tensor) + if view.device_id != self._device_id: + raise ValueError( + f"replace_address expects tensor on device {self._device_id}, got {view.device_id}") cdef intptr_t global_address = view.ptr diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..a734301fd6 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -36,7 +36,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -159,7 +159,7 @@ output = cp.zeros(N, dtype=cp.float32) dev.sync() # cupy uses its own stream -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) n_tiles = N // TILE_SIZE config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..2a5ce9ad86 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -31,7 +31,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -48,8 +48,6 @@ sys.exit(0) dev.set_current() -arch_str = "".join(f"{i}" for i in arch) - # --------------------------------------------------------------------------- # CUDA kernel that uses TMA to load a 1-D tile into shared memory, then # copies the tile to an output buffer so we can verify correctness. 
@@ -141,7 +139,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") @@ -155,11 +153,10 @@ dev.sync() # cupy uses its own stream # --------------------------------------------------------------------------- -# 2) Create a TMA tiled descriptor -# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# 2) Create a TMA tiled descriptor from a StridedMemoryView. # The dtype (float32) is inferred automatically from the CuPy array. # --------------------------------------------------------------------------- -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) # --------------------------------------------------------------------------- # 3) Launch the kernel diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 2d6dc138c0..dee1f1e2e1 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -6,8 +6,11 @@ from cuda.core import ( Device, - TensorMapDataType, + StridedMemoryView, TensorMapDescriptor, +) +from cuda.core._tensor_map import ( + TensorMapDataType, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -107,6 +110,16 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + desc = view.as_tensor_map( + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) 
From 5a0e141ebf5b4d7a65818c20d4adc4648d37571f Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:54:32 -0500 Subject: [PATCH 13/25] Restore cu12 feature definitions in cuda_core pixi manifest. Bring back the cu12 feature blocks so pixi can parse the manifest and local test commands no longer fail early with a missing feature error. Made-with: Cursor --- cuda_core/pixi.toml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index e351fb63b6..9dc6ac1ed9 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,9 +75,16 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -# NOTE: cu12 environment is intentionally omitted because the path dependency -# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, -# use conda-forge packages or CI workflows. +[feature.cu12.system-requirements] +cuda = "12" + +[feature.cu12.dependencies] +cuda-version = "12.*" + +# We keep both cu12 and cu13 because cuda.core works with either major version +# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) +# For cu12 testing, use conda-forge packages: temporarily change path to wildcard +# or skip cu12 locally: pixi run -e cu13 test [environments] default = { features = [ "cu13", From eef1c7a1293b618d90c222bed5d7345e3d3f8fb5 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 10 Mar 2026 10:39:26 -0700 Subject: [PATCH 14/25] Handle TensorMap device validation by DLPack type Reject CUDA device-local tensors from a different GPU while still allowing CUDA host and managed memory. Add regression tests for descriptor creation, replace_address, and the shared validation helper. 
--- cuda_core/cuda/core/_tensor_map.pyx | 27 ++++-- cuda_core/tests/test_tensor_map.py | 123 ++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9d83a709cc..3b4d644e7b 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -263,6 +263,19 @@ def _get_validated_view(tensor): return view +def _require_view_device(view, expected_device_id, operation): + """Ensure device-local tensors match the current CUDA device. + + DLPack reports host/managed CUDA memory as ``kDLCUDAHost`` / + ``kDLCUDAManaged`` with ``device_id=0`` regardless of the current device, + so only true ``kDLCUDA`` tensors are rejected by device-id mismatch. + """ + device_type, device_id = view.__dlpack_device__() + if device_type == _kDLCUDA and device_id != expected_device_id: + raise ValueError( + f"{operation} expects tensor on device {expected_device_id}, got {device_id}") + + cdef inline intptr_t _get_current_context_ptr() except? 
0: cdef cydriver.CUcontext ctx with nogil: @@ -406,6 +419,7 @@ cdef class TensorMapDescriptor: desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() + _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_tiled") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -447,6 +461,8 @@ cdef class TensorMapDescriptor: cdef int i_cccl cdef int device_type cdef int c_device_id + cdef int dl_device_type + cdef int dl_device_id cdef int c_cccl_interleave_int cdef int c_cccl_swizzle_int cdef int c_cccl_l2_promotion_int @@ -471,8 +487,9 @@ cdef class TensorMapDescriptor: if elem_strides_provided: c_elem_strides_ptr = &c_elem_strides[0] - device_type = _kDLCUDA - c_device_id = view.device_id + dl_device_type, dl_device_id = view.__dlpack_device__() + device_type = dl_device_type + c_device_id = dl_device_id c_cccl_interleave_int = int(interleave) c_cccl_swizzle_int = int(swizzle) c_cccl_l2_promotion_int = int(l2_promotion) @@ -635,6 +652,7 @@ cdef class TensorMapDescriptor: desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() + _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -794,6 +812,7 @@ cdef class TensorMapDescriptor: desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() + _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col_wide") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -885,9 +904,7 @@ cdef class TensorMapDescriptor: """ self._check_context_compat() view = _get_validated_view(tensor) - if view.device_id != self._device_id: - raise ValueError( - f"replace_address expects tensor on device {self._device_id}, got {view.device_id}") + _require_view_device(view, self._device_id, 
"replace_address") cdef intptr_t global_address = view.ptr diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index dee1f1e2e1..a370558019 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -4,11 +4,15 @@ import numpy as np import pytest +from conftest import create_managed_memory_resource_or_skip, skip_if_managed_memory_unsupported from cuda.core import ( Device, + ManagedMemoryResourceOptions, StridedMemoryView, TensorMapDescriptor, + system, ) +from cuda.core._dlpack import DLDeviceType from cuda.core._tensor_map import ( TensorMapDataType, TensorMapIm2ColWideMode, @@ -16,6 +20,7 @@ TensorMapL2Promotion, TensorMapOOBFill, TensorMapSwizzle, + _require_view_device, ) @@ -48,6 +53,15 @@ def __init__(self, buf, shape, dtype=np.float32): } +class _MockTensorMapView: + def __init__(self, device_type, device_id): + self._device_type = device_type + self._device_id = device_id + + def __dlpack_device__(self): + return (self._device_type, self._device_id) + + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -323,6 +337,115 @@ def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): with pytest.raises(ValueError, match="device-accessible"): desc.replace_address(host_arr) + def test_replace_address_rejects_tensor_from_other_device(self, dev, skip_if_no_tma): + if system.get_num_devices() < 2: + pytest.skip("requires multi-GPU") + + dev0 = dev + dev1 = Device(1) + + dev0.set_current() + buf0 = dev0.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf0, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + dev1.set_current() + buf1 = dev1.allocate(1024 * 4) + dev0.set_current() + + with pytest.raises(ValueError, match=r"replace_address expects tensor on device 0, got 1"): + desc.replace_address(buf1) + + def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cuda): + if system.get_num_devices() < 
2: + pytest.skip("requires multi-GPU") + + dev1 = Device(1) + if not dev1.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + skip_if_managed_memory_unsupported(dev1) + + dev1.set_current() + desc = TensorMapDescriptor.from_tiled( + dev1.allocate(1024 * 4), + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + mr = create_managed_memory_resource_or_skip( + ManagedMemoryResourceOptions(preferred_location=dev1.device_id) + ) + managed_buf = mr.allocate(1024 * 4) + + desc.replace_address(managed_buf) + + +class TestTensorMapMultiDeviceValidation: + """Test multi-device validation for descriptor creation.""" + + def test_from_tiled_rejects_tensor_from_other_device(self, init_cuda): + if system.get_num_devices() < 2: + pytest.skip("requires multi-GPU") + + dev0 = Device(0) + dev1 = Device(1) + + dev1.set_current() + buf1 = dev1.allocate(1024 * 4) + dev0.set_current() + + with pytest.raises( + ValueError, + match=r"TensorMapDescriptor\.from_tiled expects tensor on device 0, got 1", + ): + TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda): + if system.get_num_devices() < 2: + pytest.skip("requires multi-GPU") + + dev1 = Device(1) + if not dev1.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + skip_if_managed_memory_unsupported(dev1) + + dev1.set_current() + mr = create_managed_memory_resource_or_skip( + ManagedMemoryResourceOptions(preferred_location=dev1.device_id) + ) + managed_buf = mr.allocate(1024 * 4) + + desc = TensorMapDescriptor.from_tiled( + managed_buf, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + +class TestTensorMapDeviceValidation: + """Test device validation behavior for tensor-map-compatible views.""" + + def 
test_require_view_device_accepts_same_cuda_device(self): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDA, 1), 1, "op") + + def test_require_view_device_rejects_different_cuda_device(self): + with pytest.raises(ValueError, match=r"op expects tensor on device 0, got 1"): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDA, 1), 0, "op") + + def test_require_view_device_allows_cuda_host_memory(self): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAHost, 0), 1, "op") + + def test_require_view_device_allows_cuda_managed_memory(self): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAManaged, 0), 1, "op") + class TestTensorMapIm2col: """Test im2col TMA descriptor creation.""" From d6c311aa9b62deb00370b102f9d7ce3909e5e1df Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 11 Mar 2026 09:26:26 -0700 Subject: [PATCH 15/25] formatting change --- cuda_core/tests/test_tensor_map.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index a370558019..b50067c9a3 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -375,9 +375,7 @@ def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cud data_type=TensorMapDataType.FLOAT32, ) - mr = create_managed_memory_resource_or_skip( - ManagedMemoryResourceOptions(preferred_location=dev1.device_id) - ) + mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id)) managed_buf = mr.allocate(1024 * 4) desc.replace_address(managed_buf) @@ -417,9 +415,7 @@ def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda): skip_if_managed_memory_unsupported(dev1) dev1.set_current() - mr = create_managed_memory_resource_or_skip( - ManagedMemoryResourceOptions(preferred_location=dev1.device_id) - ) + mr = 
create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id)) managed_buf = mr.allocate(1024 * 4) desc = TensorMapDescriptor.from_tiled( From 9673bcf9c777031ed575e74b73cd4146ab3acd1c Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 13 Mar 2026 14:20:02 -0700 Subject: [PATCH 16/25] Update cuda_core/cuda/core/_cpp/tensor_map_cccl.h Co-authored-by: Leo Fang --- cuda_core/cuda/core/_cpp/tensor_map_cccl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/tensor_map_cccl.h b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h index 71be425182..37f0e0fd8d 100644 --- a/cuda_core/cuda/core/_cpp/tensor_map_cccl.h +++ b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h @@ -5,11 +5,13 @@ #ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_ #define CUDA_CORE_TENSOR_MAP_CCCL_H_ -#include -#include - #ifdef __cplusplus +#include +#include extern "C" { +#else +#include +#include #endif // Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from ). 
From ae86192e05236fb77f039cd6665415ca01135ac1 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 13 Mar 2026 18:16:46 -0700 Subject: [PATCH 17/25] Update cuda_core/examples/tma_replace_address.py Co-authored-by: Leo Fang --- cuda_core/examples/tma_replace_address.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index a734301fd6..fd19aeca31 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -146,7 +146,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") From 232b621556492a6e2c2b8b8d28a07aa1798444a4 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 13 Mar 2026 18:17:02 -0700 Subject: [PATCH 18/25] Update cuda_core/cuda/core/__init__.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index c32ff79ecc..139078e86e 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,4 +68,4 @@ Stream, StreamOptions, ) -from cuda.core._tensor_map import TensorMapDescriptor +from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions From 358d9753685637faa0190e8baf4b6e33d0fa476a Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 08:43:11 -0400 Subject: [PATCH 19/25] Align TensorMap creation and launch behavior with the latest review guidance. Keep the public TMA entry point on StridedMemoryView and remove avoidable launch/build overhead so the reviewed API stays smaller without regressing local CUDA builds. 
Made-with: Cursor --- cuda_core/cuda/core/_cpp/tensor_map.cpp | 7 +- cuda_core/cuda/core/_kernel_arg_handler.pyx | 22 +--- cuda_core/cuda/core/_memoryview.pyx | 6 +- cuda_core/cuda/core/_tensor_map.pyx | 120 ++++++++++++-------- cuda_core/tests/test_tensor_map.py | 106 +++++++++-------- 5 files changed, 139 insertions(+), 122 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/tensor_map.cpp b/cuda_core/cuda/core/_cpp/tensor_map.cpp index af09aa1b2f..df3f7654e5 100644 --- a/cuda_core/cuda/core/_cpp/tensor_map.cpp +++ b/cuda_core/cuda/core/_cpp/tensor_map.cpp @@ -10,13 +10,18 @@ #include #if defined(__has_include) +// Older CTK releases do not ship . When it is unavailable we keep +// the CCCL helper compiled out and fall back to the direct driver path. # if __has_include() # include # define CUDA_CORE_HAS_CUDA_TMA 1 # else # define CUDA_CORE_HAS_CUDA_TMA 0 # endif -# if __has_include() +# if __has_include("dlpack.h") +# include "dlpack.h" +# define CUDA_CORE_HAS_DLPACK_H 1 +# elif __has_include() # include # define CUDA_CORE_HAS_DLPACK_H 1 # else diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 28a981fed2..35eea2de47 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -6,7 +6,6 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdint cimport (intptr_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t,) -from libc.string cimport memcpy from libcpp cimport bool as cpp_bool from libcpp.complex cimport complex as cpp_complex from libcpp cimport nullptr @@ -135,18 +134,9 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: - arg._check_context_compat() - # Allocate a temporary buffer for the 128-byte CUtensorMap struct. 
- # We copy rather than pointing directly at arg._tensor_map for lifetime - # safety: ParamHolder owns and frees its argument buffers independently. - cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) - if ptr is NULL: - raise MemoryError("Failed to allocate memory for CUtensorMap") - memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) - # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, - # data_addresses[idx] is the pointer passed to cuLaunchKernel. - data_addresses[idx] = ptr - data[idx] = ptr + # cuLaunchKernel copies argument bytes during launch, so a TensorMap + # descriptor can point directly at its internal CUtensorMap storage. + data_addresses[idx] = arg._get_data_ptr() return 0 @@ -299,9 +289,6 @@ cdef class ParamHolder: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) continue - elif arg_type is tensor_map_descriptor_type: - prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) - continue elif arg_type is bool: prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i) continue @@ -319,6 +306,9 @@ cdef class ParamHolder: elif arg_type is complex: prepare_arg[cpp_double_complex](self.data, self.data_addresses, arg, i) continue + elif arg_type is tensor_map_descriptor_type: + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i) if not_prepared: diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 78002b3ff1..e34aca424d 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -329,8 +329,8 @@ cdef class StridedMemoryView: ): """Create a tiled :obj:`TensorMapDescriptor` from this view. - This is a convenience wrapper around - :meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`. + This is the public entry point for creating tiled tensor map + descriptors in ``cuda.core``. 
""" from cuda.core._tensor_map import TensorMapDescriptor @@ -347,7 +347,7 @@ cdef class StridedMemoryView: kwargs["l2_promotion"] = l2_promotion if oob_fill is not None: kwargs["oob_fill"] = oob_fill - return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs) + return TensorMapDescriptor._from_tiled(self, box_dim, **kwargs) def copy_from( self, other : StridedMemoryView, stream : Stream, diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 3b4d644e7b..7ed69bf722 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -117,7 +117,7 @@ ELSE: """Im2col wide mode for tensor map descriptors. This enum is always defined for API stability, but the - :meth:`TensorMapDescriptor.from_im2col_wide` factory requires a CUDA 13+ + :meth:`TensorMapDescriptor._from_im2col_wide` factory requires a CUDA 13+ build and will raise otherwise. """ W = 0 @@ -163,10 +163,20 @@ def _resolve_data_type(view, data_type): """Resolve the TMA data type from an explicit value or the view's dtype.""" if data_type is not None: - if not isinstance(data_type, TensorMapDataType): + if isinstance(data_type, TensorMapDataType): + return data_type + try: + dt = numpy.dtype(data_type) + except TypeError as e: raise TypeError( - f"data_type must be a TensorMapDataType, got {type(data_type)}") - return data_type + "data_type must be a TensorMapDataType or a numpy/ml_dtypes dtype, " + f"got {type(data_type)}") from e + tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) + if tma_dt is None: + raise ValueError( + f"Unsupported dtype {dt} for TMA; " + f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}.") + return tma_dt dt = view.dtype if dt is None: @@ -243,15 +253,7 @@ cdef inline bint _tma_dtype_to_dlpack( return False -def _get_validated_view(tensor): - """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - # stream_ptr=-1: no stream 
synchronization needed because descriptor - # creation only reads tensor metadata, it does not move data. - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - +cdef inline int _validate_tensor_map_view(view) except -1: if not view.is_device_accessible: raise ValueError("The tensor must be device-accessible") @@ -259,7 +261,18 @@ def _get_validated_view(tensor): raise ValueError( f"Global memory address must be 16-byte aligned, " f"got address 0x{view.ptr:x}") + return 0 + +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + # stream_ptr=-1: no stream synchronization needed because descriptor + # creation only reads tensor metadata, it does not move data. + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + _validate_tensor_map_view(view) return view @@ -292,6 +305,17 @@ cdef inline int _get_current_device_id() except -1: return dev +cdef inline int _require_view_device( + view, + int device_id, + object caller, +) except -1: + if view.device_id != device_id: + raise ValueError( + f"{caller} expects tensor on device {device_id}, got {view.device_id}") + return 0 + + def _compute_byte_strides(shape, strides, elem_size): """Compute byte strides from element strides or C-contiguous fallback. @@ -328,16 +352,17 @@ cdef class TensorMapDescriptor: used by the hardware TMA unit for efficient bulk data movement between global and shared memory. - Instances are created via the class methods :meth:`from_tiled` and - :meth:`from_im2col`, and can be passed directly to - :func:`~cuda.core.launch` as a kernel argument. + Public tiled descriptors are created via + :meth:`cuda.core.StridedMemoryView.as_tensor_map`. Specialized + ``_from_*`` helpers remain private while this API surface settles, and + descriptors can be passed directly to :func:`~cuda.core.launch` as a + kernel argument. 
""" def __init__(self): raise RuntimeError( "TensorMapDescriptor cannot be instantiated directly. " - "Use TensorMapDescriptor.from_tiled() or " - "TensorMapDescriptor.from_im2col().") + "Use StridedMemoryView.as_tensor_map() instead.") cdef void* _get_data_ptr(self): return &self._tensor_map @@ -364,22 +389,27 @@ cdef class TensorMapDescriptor: f"but current device is {current_dev_id}") return 0 + @property + def device(self): + """Return the :obj:`~cuda.core.Device` associated with this descriptor.""" + if self._device_id >= 0: + from cuda.core._device import Device + return Device(self._device_id) + @classmethod - def from_tiled(cls, tensor, box_dim, *, + def _from_tiled(cls, view, box_dim, *, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, swizzle=TensorMapSwizzle.NONE, l2_promotion=TensorMapL2Promotion.NONE, oob_fill=TensorMapOOBFill.NONE): - """Create a tiled TMA descriptor from a tensor object. + """Create a tiled TMA descriptor from a validated view. Parameters ---------- - tensor : object - Any object supporting DLPack or ``__cuda_array_interface__``, - or a :obj:`~cuda.core.StridedMemoryView`. Must refer to - device-accessible memory with a 16-byte-aligned pointer. + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. box_dim : tuple of int The size of each tile dimension (in elements). Must have the same rank as the tensor and each value must be in [1, 256]. @@ -411,15 +441,15 @@ cdef class TensorMapDescriptor: """ cdef TensorMapDescriptor desc = cls.__new__(cls) - view = _get_validated_view(tensor) + _validate_tensor_map_view(view) # Keep both the original tensor object and the validated view alive. # For DLPack exporters, the view may hold the owning capsule whose # deleter can free the backing allocation when released. 
- desc._source_ref = tensor + desc._source_ref = view.exporting_obj desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() - _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_tiled") + _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_tiled") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -591,7 +621,7 @@ cdef class TensorMapDescriptor: return desc @classmethod - def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner, + def _from_im2col(cls, view, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -599,16 +629,14 @@ cdef class TensorMapDescriptor: swizzle=TensorMapSwizzle.NONE, l2_promotion=TensorMapL2Promotion.NONE, oob_fill=TensorMapOOBFill.NONE): - """Create an im2col TMA descriptor from a tensor object. + """Create an im2col TMA descriptor from a validated view. Im2col layout is used for convolution-style data access patterns. Parameters ---------- - tensor : object - Any object supporting DLPack or ``__cuda_array_interface__``, - or a :obj:`~cuda.core.StridedMemoryView`. Must refer to - device-accessible memory with a 16-byte-aligned pointer. + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. pixel_box_lower_corner : tuple of int Lower corner of the pixel bounding box for each spatial dimension (rank - 2 elements). 
Specified in row-major order @@ -647,12 +675,12 @@ cdef class TensorMapDescriptor: """ cdef TensorMapDescriptor desc = cls.__new__(cls) - view = _get_validated_view(tensor) - desc._source_ref = tensor + _validate_tensor_map_view(view) + desc._source_ref = view.exporting_obj desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() - _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col") + _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -746,7 +774,7 @@ cdef class TensorMapDescriptor: return desc @classmethod - def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + def _from_im2col_wide(cls, view, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -755,7 +783,7 @@ cdef class TensorMapDescriptor: swizzle=TensorMapSwizzle.SWIZZLE_128B, l2_promotion=TensorMapL2Promotion.NONE, oob_fill=TensorMapOOBFill.NONE): - """Create an im2col-wide TMA descriptor from a tensor object. + """Create an im2col-wide TMA descriptor from a validated view. Im2col-wide layout loads elements exclusively along the W (width) dimension. This variant is supported on compute capability 10.0+ @@ -763,10 +791,8 @@ cdef class TensorMapDescriptor: Parameters ---------- - tensor : object - Any object supporting DLPack or ``__cuda_array_interface__``, - or a :obj:`~cuda.core.StridedMemoryView`. Must refer to - device-accessible memory with a 16-byte-aligned pointer. + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. pixel_box_lower_corner_width : int Lower corner of the pixel bounding box along the W dimension. 
pixel_box_upper_corner_width : int @@ -803,16 +829,16 @@ cdef class TensorMapDescriptor: """ IF CUDA_CORE_BUILD_MAJOR < 13: raise RuntimeError( - "TensorMapDescriptor.from_im2col_wide requires a CUDA 13+ build") + "TensorMapDescriptor._from_im2col_wide requires a CUDA 13+ build") ELSE: cdef TensorMapDescriptor desc = cls.__new__(cls) - view = _get_validated_view(tensor) - desc._source_ref = tensor + _validate_tensor_map_view(view) + desc._source_ref = view.exporting_obj desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() - _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col_wide") + _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col_wide") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -917,7 +943,7 @@ cdef class TensorMapDescriptor: # Update the source reference only after the driver call succeeds, # so we don't drop the old tensor (risking a dangling pointer in the # CUtensorMap struct) if the call fails. 
- self._source_ref = tensor + self._source_ref = view.exporting_obj self._view_ref = view def __repr__(self): diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index b50067c9a3..099140354c 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -61,6 +61,11 @@ def __init__(self, device_type, device_id): def __dlpack_device__(self): return (self._device_type, self._device_id) +def _as_view(obj): + if isinstance(obj, StridedMemoryView): + return obj + return StridedMemoryView.from_any_interface(obj, stream_ptr=-1) + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -106,19 +111,25 @@ def test_cannot_instantiate_directly(self): def test_from_tiled_1d(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) # 1024 float32 elements - desc = TensorMapDescriptor.from_tiled( - buf, + desc = _as_view(buf).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) assert desc is not None assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)" + def test_device_property(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + desc = _as_view(buf).as_tensor_map( + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc.device.device_id == dev.device_id + def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -137,8 +148,7 @@ def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(8, 8, 8), 
data_type=TensorMapDataType.FLOAT32, ) @@ -150,8 +160,7 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): n_bytes = 2 * 4 * 4 * 4 * 8 * 4 # float32 buf = dev.allocate(n_bytes) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(1, 2, 2, 2, 8), ) assert desc is not None @@ -159,8 +168,7 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): def test_from_tiled_with_element_strides_buffer(self, dev, skip_if_no_tma): # Use a Buffer input (DLPack path) and explicit element_strides. buf = dev.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf, + desc = _as_view(buf).as_tensor_map( box_dim=(64,), element_strides=(2,), data_type=TensorMapDataType.FLOAT32, @@ -171,8 +179,7 @@ def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): # Use a CAI-style tensor wrapper and explicit element_strides. buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), element_strides=(2, 1), data_type=TensorMapDataType.FLOAT32, @@ -182,8 +189,7 @@ def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, swizzle=TensorMapSwizzle.SWIZZLE_128B, @@ -193,8 +199,7 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, l2_promotion=TensorMapL2Promotion.L2_128B, @@ -204,8 +209,7 @@ def 
test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, @@ -220,8 +224,7 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma): buf = dev.allocate(64) tensor = _DeviceArray(buf, ()) # 0-dim tensor with pytest.raises(ValueError, match="rank must be between 1 and 5"): - TensorMapDescriptor.from_tiled( - tensor, + _as_view(tensor).as_tensor_map( box_dim=(), data_type=TensorMapDataType.FLOAT32, ) @@ -234,16 +237,14 @@ def test_invalid_rank_six(self, dev, skip_if_no_tma): buf = dev.allocate(n_elements * 4) arr = _DeviceArray(buf, shape) with pytest.raises(ValueError, match="rank must be between 1 and 5"): - TensorMapDescriptor.from_tiled( - arr, + _as_view(arr).as_tensor_map( box_dim=(2,) * 6, ) def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="box_dim must have 1 elements"): - TensorMapDescriptor.from_tiled( - buf, + _as_view(buf).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -251,8 +252,7 @@ def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): def test_box_dim_out_of_range(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"): - TensorMapDescriptor.from_tiled( - buf, + _as_view(buf).as_tensor_map( box_dim=(512,), data_type=TensorMapDataType.FLOAT32, ) @@ -260,8 +260,7 @@ def test_box_dim_out_of_range(self, dev, skip_if_no_tma): def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="element_strides must have 1 elements"): - TensorMapDescriptor.from_tiled( - buf, + 
_as_view(buf).as_tensor_map( box_dim=(64,), element_strides=(1, 1), data_type=TensorMapDataType.FLOAT32, @@ -269,9 +268,8 @@ def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): def test_invalid_data_type(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) - with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"): - TensorMapDescriptor.from_tiled( - buf, + with pytest.raises(TypeError, match="data_type must be"): + _as_view(buf).as_tensor_map( box_dim=(64,), data_type=42, ) @@ -315,8 +313,7 @@ class TestTensorMapReplaceAddress: def test_replace_address(self, dev, skip_if_no_tma): buf1 = dev.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf1, + desc = _as_view(buf1).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -327,8 +324,7 @@ def test_replace_address(self, dev, skip_if_no_tma): def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): buf1 = dev.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf1, + desc = _as_view(buf1).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -450,8 +446,8 @@ def test_from_im2col_3d(self, dev, skip_if_no_tma): # 3D tensor: batch=1, height=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) - desc = TensorMapDescriptor.from_im2col( - tensor, + desc = TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0,), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -463,8 +459,8 @@ def test_from_im2col_3d(self, dev, skip_if_no_tma): def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"): - TensorMapDescriptor.from_im2col( - buf, + TensorMapDescriptor._from_im2col( + _as_view(buf), pixel_box_lower_corner=(), pixel_box_upper_corner=(), channels_per_pixel=64, @@ -476,8 +472,8 @@ def 
test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) # 3D: n_spatial = 1 with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): - TensorMapDescriptor.from_im2col( - tensor, + TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -492,8 +488,8 @@ def test_from_im2col_4d(self, dev, skip_if_no_tma): shape = (1, 8, 8, 64) buf = dev.allocate(1 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col( - tensor, + desc = TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4, 4), channels_per_pixel=64, @@ -508,8 +504,8 @@ def test_from_im2col_5d(self, dev, skip_if_no_tma): shape = (1, 4, 8, 8, 64) buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col( - tensor, + desc = TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0, 0, 0), pixel_box_upper_corner=(2, 4, 4), channels_per_pixel=64, @@ -535,8 +531,8 @@ def skip_if_no_im2col_wide(self, dev): buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) try: - TensorMapDescriptor.from_im2col_wide( - tensor, + TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -556,8 +552,8 @@ def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) - desc = TensorMapDescriptor.from_im2col_wide( - tensor, + desc = TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -572,8 +568,8 @@ def test_from_im2col_wide_4d(self, dev, 
skip_if_no_im2col_wide): shape = (1, 8, 8, 64) buf = dev.allocate(1 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col_wide( - tensor, + desc = TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -587,8 +583,8 @@ def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): shape = (1, 4, 8, 8, 64) buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col_wide( - tensor, + desc = TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -599,8 +595,8 @@ def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"): - TensorMapDescriptor.from_im2col_wide( - buf, + TensorMapDescriptor._from_im2col_wide( + _as_view(buf), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, From e67e9d3e3dcc83c38ee98a4df43bd46d3d3224a8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 08:56:26 -0400 Subject: [PATCH 20/25] Consolidate the TMA examples around the libcudacxx wrappers. Keep the example surface smaller and closer to CUDA C++ by showing barrier/TMA helpers and replace_address() in one place instead of duplicating raw PTX snippets. 
Made-with: Cursor --- cuda_core/examples/tma_replace_address.py | 189 ---------------------- cuda_core/examples/tma_tensor_map.py | 167 +++++++++---------- 2 files changed, 79 insertions(+), 277 deletions(-) delete mode 100644 cuda_core/examples/tma_replace_address.py diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py deleted file mode 100644 index fd19aeca31..0000000000 --- a/cuda_core/examples/tma_replace_address.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -# ################################################################################ -# -# This example demonstrates how to use replace_address() to repoint a TMA -# (Tensor Memory Accelerator) descriptor at a different tensor without -# rebuilding the descriptor from scratch. -# -# The workflow is: -# -# 1. Create a TMA tiled descriptor and launch a kernel to verify it works -# 2. Allocate a second tensor with different content -# 3. Call replace_address() to repoint the same descriptor at the new tensor -# 4. Re-launch the kernel and verify it reads from the new tensor -# -# This is useful when the tensor layout (shape, dtype, tile size) stays the -# same but the underlying data buffer changes, e.g. double-buffering or -# iterating over a sequence of same-shaped tensors. 
-# -# Requirements: -# - Hopper or later GPU (compute capability >= 9.0) -# - CuPy -# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) -# -# ################################################################################ - -import sys - -import cupy as cp -import numpy as np - -from cuda.core import ( - Device, - LaunchConfig, - Program, - ProgramOptions, - StridedMemoryView, - launch, -) - -# --------------------------------------------------------------------------- -# Check for Hopper+ GPU -# --------------------------------------------------------------------------- -dev = Device() -arch = dev.compute_capability -if arch < (9, 0): - print( - "TMA requires compute capability >= 9.0 (Hopper or later)", - file=sys.stderr, - ) - sys.exit(0) -dev.set_current() - -arch_str = "".join(f"{i}" for i in arch) - -# --------------------------------------------------------------------------- -# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then -# copies the tile to an output buffer so we can verify correctness. -# -# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be -# compiled with NVRTC without pulling in the full driver-API header. -# -# Key points: -# - The tensor map is passed by value with __grid_constant__ so the TMA -# hardware can read it from grid-constant memory. -# - Thread 0 in each block issues the TMA load and manages the mbarrier. -# - All threads wait on the mbarrier, then copy from shared to global. -# --------------------------------------------------------------------------- -TILE_SIZE = 128 # elements per tile (must match the kernel constant) - -code = r""" -// Minimal definition of the 128-byte opaque tensor map struct. 
-struct __align__(64) TensorMap { unsigned long long opaque[16]; }; - -static constexpr int TILE_SIZE = 128; - -extern "C" -__global__ void tma_copy( - const __grid_constant__ TensorMap tensor_map, - float* output, - int N) -{ - __shared__ __align__(128) float smem[TILE_SIZE]; - __shared__ __align__(8) unsigned long long mbar; - - const int tid = threadIdx.x; - const int tile_start = blockIdx.x * TILE_SIZE; - - // ---- Thread 0: set up mbarrier and issue the TMA load ---- - if (tid == 0) - { - // Initialise a single-phase mbarrier (1 arriving thread). - asm volatile( - "mbarrier.init.shared.b64 [%0], 1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' - // from the tensor described by 'tensor_map' into shared memory. - asm volatile( - "cp.async.bulk.tensor.1d.shared::cluster.global.tile" - ".mbarrier::complete_tx::bytes" - " [%0], [%1, {%2}], [%3];" - :: "r"((unsigned)__cvta_generic_to_shared(smem)), - "l"(&tensor_map), - "r"(tile_start), - "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Tell the mbarrier how many bytes the TMA will deliver. 
- asm volatile( - "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), - "r"((unsigned)(TILE_SIZE * sizeof(float)))); - } - - __syncthreads(); - - // ---- Wait for the TMA load to complete ---- - if (tid == 0) - { - asm volatile( - "{ .reg .pred P; \n" - "WAIT: \n" - " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" - " @!P bra WAIT; \n" - "} \n" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); - } - - __syncthreads(); - - // ---- Copy the tile from shared memory to the output buffer ---- - if (tid < TILE_SIZE) - { - const int idx = tile_start + tid; - if (idx < N) - output[idx] = smem[tid]; - } -} -""" - -# --------------------------------------------------------------------------- -# Compile the kernel -# --------------------------------------------------------------------------- -prog = Program( - code, - code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), -) -mod = prog.compile("cubin") -ker = mod.get_kernel("tma_copy") - -# --------------------------------------------------------------------------- -# 1) Prepare input data and verify the initial TMA copy -# --------------------------------------------------------------------------- -N = 1024 -a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] -output = cp.zeros(N, dtype=cp.float32) -dev.sync() # cupy uses its own stream - -tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) - -n_tiles = N // TILE_SIZE -config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) -launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output, a), "TMA copy produced incorrect results" -print(f"TMA copy verified: {N} elements across {n_tiles} tiles") - -# --------------------------------------------------------------------------- -# 2) Demonstrate replace_address() -# Create a second tensor with different content, point the *same* 
-# descriptor at it, and re-launch without rebuilding the descriptor. -# --------------------------------------------------------------------------- -b = cp.full(N, fill_value=42.0, dtype=cp.float32) -dev.sync() - -tensor_map.replace_address(b) - -output2 = cp.zeros(N, dtype=cp.float32) -dev.sync() - -launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output2, b), "replace_address produced incorrect results" -print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 2a5ce9ad86..2a622a552d 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -4,15 +4,16 @@ # ################################################################################ # -# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors -# with cuda.core on Hopper+ GPUs (compute capability >= 9.0). +# This example demonstrates how to use TMA (Tensor Memory Accelerator) +# descriptors with cuda.core on Hopper+ GPUs (compute capability >= 9.0). # -# TMA enables efficient bulk data movement between global and shared memory using -# hardware-managed tensor map descriptors. This example shows: +# TMA enables efficient bulk data movement between global and shared memory +# using hardware-managed tensor map descriptors. This example shows: # # 1. Creating a TMA tiled descriptor from a CuPy device array # 2. Passing the descriptor to a kernel via launch() -# 3. Using TMA to load tiles into shared memory (via inline PTX) +# 3. Using libcudacxx TMA/barrier wrappers instead of raw PTX +# 4. 
Reusing the same descriptor with replace_address() # # Requirements: # - Hopper or later GPU (compute capability >= 9.0) @@ -35,39 +36,31 @@ launch, ) -# --------------------------------------------------------------------------- -# Check for Hopper+ GPU -# --------------------------------------------------------------------------- -dev = Device() -arch = dev.compute_capability -if arch < (9, 0): - print( - "TMA requires compute capability >= 9.0 (Hopper or later)", - file=sys.stderr, - ) - sys.exit(0) -dev.set_current() - # --------------------------------------------------------------------------- # CUDA kernel that uses TMA to load a 1-D tile into shared memory, then # copies the tile to an output buffer so we can verify correctness. # # The CUtensorMap struct (128 bytes) is defined inline so the kernel can be -# compiled with NVRTC without pulling in the full driver-API header. +# compiled with NVRTC without pulling in the full driver-API header. The +# kernel uses libcudacxx's `cuda::barrier` and TMA wrapper helpers rather +# than embedding raw PTX strings. # # Key points: # - The tensor map is passed by value with __grid_constant__ so the TMA # hardware can read it from grid-constant memory. -# - Thread 0 in each block issues the TMA load and manages the mbarrier. -# - All threads wait on the mbarrier, then copy from shared to global. +# - Thread 0 in each block issues the TMA load and waits on the barrier. +# - All threads synchronize before copying from shared to global memory. # --------------------------------------------------------------------------- TILE_SIZE = 128 # elements per tile (must match the kernel constant) code = r""" +#include + // Minimal definition of the 128-byte opaque tensor map struct. 
struct __align__(64) TensorMap { unsigned long long opaque[16]; }; static constexpr int TILE_SIZE = 128; +using TmaBarrier = cuda::barrier; extern "C" __global__ void tma_copy( @@ -76,54 +69,28 @@ int N) { __shared__ __align__(128) float smem[TILE_SIZE]; - __shared__ __align__(8) unsigned long long mbar; + __shared__ TmaBarrier bar; const int tid = threadIdx.x; const int tile_start = blockIdx.x * TILE_SIZE; - // ---- Thread 0: set up mbarrier and issue the TMA load ---- if (tid == 0) { - // Initialise a single-phase mbarrier (1 arriving thread). - asm volatile( - "mbarrier.init.shared.b64 [%0], 1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' - // from the tensor described by 'tensor_map' into shared memory. - asm volatile( - "cp.async.bulk.tensor.1d.shared::cluster.global.tile" - ".mbarrier::complete_tx::bytes" - " [%0], [%1, {%2}], [%3];" - :: "r"((unsigned)__cvta_generic_to_shared(smem)), - "l"(&tensor_map), - "r"(tile_start), - "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Tell the mbarrier how many bytes the TMA will deliver. 
- asm volatile( - "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), - "r"((unsigned)(TILE_SIZE * sizeof(float)))); + init(&bar, 1); } - __syncthreads(); - // ---- Wait for the TMA load to complete ---- if (tid == 0) { - asm volatile( - "{ .reg .pred P; \n" - "WAIT: \n" - " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" - " @!P bra WAIT; \n" - "} \n" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + cuda::device::experimental::cp_async_bulk_tensor_1d_global_to_shared( + smem, + reinterpret_cast(&tensor_map), + tile_start, + bar); + bar.wait(cuda::device::barrier_arrive_tx(bar, 1, TILE_SIZE * sizeof(float))); } - __syncthreads(); - // ---- Copy the tile from shared memory to the output buffer ---- if (tid < TILE_SIZE) { const int idx = tile_start + tid; @@ -133,40 +100,64 @@ } """ -# --------------------------------------------------------------------------- -# Compile the kernel -# --------------------------------------------------------------------------- -prog = Program( - code, - code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), -) -mod = prog.compile("cubin") -ker = mod.get_kernel("tma_copy") +def main(): + # ----------------------------------------------------------------------- + # Check for Hopper+ GPU + # ----------------------------------------------------------------------- + dev = Device() + arch = dev.compute_capability + if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) + dev.set_current() + + # ----------------------------------------------------------------------- + # Compile the kernel + # ----------------------------------------------------------------------- + prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), + ) + mod = prog.compile("cubin") + ker = mod.get_kernel("tma_copy") -# 
--------------------------------------------------------------------------- -# 1) Prepare input data on the device -# --------------------------------------------------------------------------- -N = 1024 -a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] -output = cp.zeros(N, dtype=cp.float32) -dev.sync() # cupy uses its own stream + # ----------------------------------------------------------------------- + # 1) Prepare input data and verify the initial TMA copy + # ----------------------------------------------------------------------- + n = 1024 + src = cp.arange(n, dtype=cp.float32) + output = cp.zeros(n, dtype=cp.float32) + dev.sync() # CuPy uses its own stream -# --------------------------------------------------------------------------- -# 2) Create a TMA tiled descriptor from a StridedMemoryView. -# The dtype (float32) is inferred automatically from the CuPy array. -# --------------------------------------------------------------------------- -tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) + tensor_map = StridedMemoryView.from_any_interface(src, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) + + n_tiles = n // TILE_SIZE + config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) + launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(n)) + dev.sync() + + assert cp.array_equal(output, src), "TMA copy produced incorrect results" + print(f"TMA copy verified: {n} elements across {n_tiles} tiles") + + # ----------------------------------------------------------------------- + # 2) Demonstrate replace_address() without rebuilding the descriptor + # ----------------------------------------------------------------------- + replacement = cp.full(n, fill_value=42.0, dtype=cp.float32) + dev.sync() + + tensor_map.replace_address(replacement) + + output2 = cp.zeros(n, dtype=cp.float32) + launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(n)) + dev.sync() + 
+ assert cp.array_equal(output2, replacement), "replace_address produced incorrect results" + print("replace_address verified: descriptor reused with new source tensor") -# --------------------------------------------------------------------------- -# 3) Launch the kernel -# The TensorMapDescriptor is passed directly as a kernel argument — the -# 128-byte struct is copied into kernel parameter space automatically. -# --------------------------------------------------------------------------- -n_tiles = N // TILE_SIZE -config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) -launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) -dev.sync() -assert cp.array_equal(output, a), "TMA copy produced incorrect results" -print(f"TMA copy verified: {N} elements across {n_tiles} tiles") +if __name__ == "__main__": + main() From 9ff8d0fc002cc16f5a0b1583a0f1d62ad969d947 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 08:58:03 -0400 Subject: [PATCH 21/25] Teach the TMA example where to find libcudacxx headers. Use the toolkit include and optional cccl include roots when compiling the wrapper-based example so NVRTC can resolve cuda/barrier outside the test harness. 
Made-with: Cursor --- cuda_core/examples/tma_tensor_map.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 2a622a552d..b914651089 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -22,6 +22,7 @@ # # ################################################################################ +import os import sys import cupy as cp @@ -100,6 +101,25 @@ } """ + +def _get_cccl_include_paths(): + cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) + if cuda_path is None: + print("This example requires CUDA_PATH or CUDA_HOME to point to a CUDA toolkit.", file=sys.stderr) + sys.exit(1) + + cuda_include = os.path.join(cuda_path, "include") + if not os.path.isdir(cuda_include): + print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr) + sys.exit(1) + + include_path = [cuda_include] + cccl_include = os.path.join(cuda_include, "cccl") + if os.path.isdir(cccl_include): + include_path.insert(0, cccl_include) + return include_path + + def main(): # ----------------------------------------------------------------------- # Check for Hopper+ GPU @@ -113,6 +133,7 @@ def main(): ) sys.exit(0) dev.set_current() + include_path = _get_cccl_include_paths() # ----------------------------------------------------------------------- # Compile the kernel @@ -120,7 +141,7 @@ def main(): prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}", include_path=include_path), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") From 719f0f357c853b939f54a2152f0ba204f3d91947 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:18:28 -0400 Subject: [PATCH 22/25] Bundle tiled TensorMap options and type retained views. 
Centralize the tiled descriptor arguments in an options object, keep dtype-like inputs on the public path while using raw driver values internally, and declare StridedMemoryView in a pxd so retained views stay typed without extra helper indirection. Made-with: Cursor --- cuda_core/cuda/core/_memoryview.pxd | 24 +++ cuda_core/cuda/core/_memoryview.pyx | 38 +--- cuda_core/cuda/core/_tensor_map.pxd | 3 +- cuda_core/cuda/core/_tensor_map.pyx | 271 +++++++++++++++++++++------- cuda_core/tests/test_tensor_map.py | 33 ++++ 5 files changed, 276 insertions(+), 93 deletions(-) create mode 100644 cuda_core/cuda/core/_memoryview.pxd diff --git a/cuda_core/cuda/core/_memoryview.pxd b/cuda_core/cuda/core/_memoryview.pxd new file mode 100644 index 0000000000..c12aa38ad8 --- /dev/null +++ b/cuda_core/cuda/core/_memoryview.pxd @@ -0,0 +1,24 @@ +from libc.stdint cimport intptr_t + +from cuda.core._dlpack cimport DLTensor +from cuda.core._layout cimport _StridedLayout + + +cdef class StridedMemoryView: + cdef readonly: + intptr_t ptr + int device_id + bint is_device_accessible + bint readonly + object exporting_obj + + cdef: + object metadata + DLTensor* dl_tensor + _StridedLayout _layout + object _buffer + object _dtype + + cdef inline _StridedLayout get_layout(self) + cdef inline object get_buffer(self) + cdef inline object get_dtype(self) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index e34aca424d..7dc32b7ec7 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -107,35 +107,6 @@ cdef class StridedMemoryView: it will be the Buffer instance passed to the method. """ - cdef readonly: - intptr_t ptr - int device_id - bint is_device_accessible - bint readonly - object exporting_obj - - cdef: - # If using dlpack, this is a strong reference to the result of - # obj.__dlpack__() so we can lazily create shape and strides from - # it later. 
If using CAI, this is a reference to the source - # `__cuda_array_interface__` object. - object metadata - - # The tensor object if has obj has __dlpack__, otherwise must be NULL - DLTensor *dl_tensor - - # Memoized properties - # Either lazily inferred from dl_tensor/metadata, - # or explicitly provided if created with from_buffer(). - _StridedLayout _layout - # Either exporting_obj if it is a Buffer, otherwise a Buffer instance - # with owner set to the exporting object. - object _buffer - # Either lazily inferred from dl_tensor/metadata, - # or explicitly provided if created with from_buffer(). - # In the latter case, it can be None. - object _dtype - def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None: cdef str clsname = self.__class__.__name__ if obj is not None: @@ -318,8 +289,9 @@ cdef class StridedMemoryView: def as_tensor_map( self, - box_dim, + box_dim=None, *, + options=None, element_strides=None, data_type=None, interleave=None, @@ -330,11 +302,15 @@ cdef class StridedMemoryView: """Create a tiled :obj:`TensorMapDescriptor` from this view. This is the public entry point for creating tiled tensor map - descriptors in ``cuda.core``. + descriptors in ``cuda.core``. Pass either ``box_dim`` and the + individual keyword arguments directly, or provide bundled tiled + options via ``options=``. 
""" from cuda.core._tensor_map import TensorMapDescriptor kwargs = {} + if options is not None: + kwargs["options"] = options if element_strides is not None: kwargs["element_strides"] = element_strides if data_type is not None: diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 4c60b7fc70..25aef56626 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -4,6 +4,7 @@ from cuda.bindings cimport cydriver from libc.stdint cimport intptr_t +from cuda.core._memoryview cimport StridedMemoryView cdef class TensorMapDescriptor: @@ -11,7 +12,7 @@ cdef class TensorMapDescriptor: cdef int _device_id cdef intptr_t _context cdef object _source_ref - cdef object _view_ref + cdef StridedMemoryView _view_ref cdef object _repr_info cdef int _check_context_compat(self) except -1 diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 7ed69bf722..c101ab1963 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -9,10 +9,12 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._dlpack cimport kDLInt, kDLUInt, kDLFloat, kDLBfloat, _kDLCUDA import enum +from dataclasses import dataclass import numpy from cuda.core._memoryview import StridedMemoryView +from cuda.core._utils.cuda_utils import check_or_create_options cdef extern from "_cpp/tensor_map_cccl.h": int cuda_core_cccl_make_tma_descriptor_tiled( @@ -124,38 +126,166 @@ ELSE: W128 = 1 +_TMA_DT_UINT8 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8) +_TMA_DT_UINT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16) +_TMA_DT_UINT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32) +_TMA_DT_INT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32) +_TMA_DT_UINT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64) +_TMA_DT_INT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64) +_TMA_DT_FLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16) +_TMA_DT_FLOAT32 = 
int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32) +_TMA_DT_FLOAT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64) +_TMA_DT_BFLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16) +_TMA_DT_FLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ) +_TMA_DT_TFLOAT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32) +_TMA_DT_TFLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ) + + +def _normalize_tensor_map_data_type(data_type): + if data_type is None or isinstance(data_type, TensorMapDataType): + return data_type + try: + return numpy.dtype(data_type) + except TypeError as e: + raise TypeError( + "data_type must be a TensorMapDataType or a numpy/ml_dtypes dtype, " + f"got {type(data_type)}") from e + + +def _normalize_tensor_map_sequence(name, values): + try: + values = tuple(values) + except TypeError as e: + raise TypeError(f"{name} must be a tuple of ints, got {type(values)}") from e + for i, value in enumerate(values): + if not isinstance(value, int): + raise TypeError(f"{name}[{i}] must be an int, got {type(value)}") + return values + + +def _require_tensor_map_enum(name, value, enum_type): + if not isinstance(value, enum_type): + raise TypeError(f"{name} must be a {enum_type.__name__}, got {type(value)}") + return value + + +@dataclass +class TensorMapDescriptorOptions: + """Options for :meth:`cuda.core.StridedMemoryView.as_tensor_map`. + + Attributes + ---------- + box_dim : tuple[int, ...] + Tile size for each tensor dimension, expressed in elements. + element_strides : tuple[int, ...], optional + Per-dimension element traversal strides. + data_type : object, optional + Explicit dtype override. Prefer NumPy or ``ml_dtypes`` dtype objects; + :class:`TensorMapDataType` remains accepted for compatibility. + interleave : TensorMapInterleave, optional + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle, optional + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion, optional + L2 promotion mode. Default ``NONE``. 
+ oob_fill : TensorMapOOBFill, optional + Out-of-bounds fill mode. Default ``NONE``. + """ + + box_dim: tuple[int, ...] + element_strides: tuple[int, ...] | None = None + data_type: object = None + interleave: TensorMapInterleave = TensorMapInterleave.NONE + swizzle: TensorMapSwizzle = TensorMapSwizzle.NONE + l2_promotion: TensorMapL2Promotion = TensorMapL2Promotion.NONE + oob_fill: TensorMapOOBFill = TensorMapOOBFill.NONE + + def __post_init__(self): + self.box_dim = _normalize_tensor_map_sequence("box_dim", self.box_dim) + if self.element_strides is not None: + self.element_strides = _normalize_tensor_map_sequence("element_strides", self.element_strides) + self.data_type = _normalize_tensor_map_data_type(self.data_type) + self.interleave = _require_tensor_map_enum("interleave", self.interleave, TensorMapInterleave) + self.swizzle = _require_tensor_map_enum("swizzle", self.swizzle, TensorMapSwizzle) + self.l2_promotion = _require_tensor_map_enum("l2_promotion", self.l2_promotion, TensorMapL2Promotion) + self.oob_fill = _require_tensor_map_enum("oob_fill", self.oob_fill, TensorMapOOBFill) + + +def _coerce_tensor_map_descriptor_options( + box_dim, + options, + *, + element_strides, + data_type, + interleave, + swizzle, + l2_promotion, + oob_fill, +): + if options is not None: + if ( + box_dim is not None + or element_strides is not None + or data_type is not None + or interleave != TensorMapInterleave.NONE + or swizzle != TensorMapSwizzle.NONE + or l2_promotion != TensorMapL2Promotion.NONE + or oob_fill != TensorMapOOBFill.NONE + ): + raise TypeError( + "Specify either options or the individual tensor map arguments, not both") + return check_or_create_options( + TensorMapDescriptorOptions, + options, + "Tensor map descriptor options", + ) + + if box_dim is None: + raise TypeError("box_dim is required unless options is provided") + + return TensorMapDescriptorOptions( + box_dim=box_dim, + element_strides=element_strides, + data_type=data_type, + 
interleave=interleave, + swizzle=swizzle, + l2_promotion=l2_promotion, + oob_fill=oob_fill, + ) + + # Mapping from numpy dtype to TMA data type _NUMPY_DTYPE_TO_TMA = { - numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, - numpy.dtype(numpy.uint16): TensorMapDataType.UINT16, - numpy.dtype(numpy.uint32): TensorMapDataType.UINT32, - numpy.dtype(numpy.int32): TensorMapDataType.INT32, - numpy.dtype(numpy.uint64): TensorMapDataType.UINT64, - numpy.dtype(numpy.int64): TensorMapDataType.INT64, - numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16, - numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32, - numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64, + numpy.dtype(numpy.uint8): _TMA_DT_UINT8, + numpy.dtype(numpy.uint16): _TMA_DT_UINT16, + numpy.dtype(numpy.uint32): _TMA_DT_UINT32, + numpy.dtype(numpy.int32): _TMA_DT_INT32, + numpy.dtype(numpy.uint64): _TMA_DT_UINT64, + numpy.dtype(numpy.int64): _TMA_DT_INT64, + numpy.dtype(numpy.float16): _TMA_DT_FLOAT16, + numpy.dtype(numpy.float32): _TMA_DT_FLOAT32, + numpy.dtype(numpy.float64): _TMA_DT_FLOAT64, } if ml_bfloat16 is not None: - _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16 + _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = _TMA_DT_BFLOAT16 # Mapping from TMA data type to element size in bytes _TMA_DATA_TYPE_SIZE = { - TensorMapDataType.UINT8: 1, - TensorMapDataType.UINT16: 2, - TensorMapDataType.UINT32: 4, - TensorMapDataType.INT32: 4, - TensorMapDataType.UINT64: 8, - TensorMapDataType.INT64: 8, - TensorMapDataType.FLOAT16: 2, - TensorMapDataType.FLOAT32: 4, - TensorMapDataType.FLOAT64: 8, - TensorMapDataType.BFLOAT16: 2, - TensorMapDataType.FLOAT32_FTZ: 4, - TensorMapDataType.TFLOAT32: 4, - TensorMapDataType.TFLOAT32_FTZ: 4, + _TMA_DT_UINT8: 1, + _TMA_DT_UINT16: 2, + _TMA_DT_UINT32: 4, + _TMA_DT_INT32: 4, + _TMA_DT_UINT64: 8, + _TMA_DT_INT64: 8, + _TMA_DT_FLOAT16: 2, + _TMA_DT_FLOAT32: 4, + _TMA_DT_FLOAT64: 8, + _TMA_DT_BFLOAT16: 2, + _TMA_DT_FLOAT32_FTZ: 4, + _TMA_DT_TFLOAT32: 4, 
+ _TMA_DT_TFLOAT32_FTZ: 4, } @@ -164,13 +294,8 @@ def _resolve_data_type(view, data_type): if data_type is not None: if isinstance(data_type, TensorMapDataType): - return data_type - try: - dt = numpy.dtype(data_type) - except TypeError as e: - raise TypeError( - "data_type must be a TensorMapDataType or a numpy/ml_dtypes dtype, " - f"got {type(data_type)}") from e + return int(data_type) + dt = _normalize_tensor_map_data_type(data_type) tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) if tma_dt is None: raise ValueError( @@ -195,57 +320,57 @@ def _resolve_data_type(view, data_type): cdef inline bint _tma_dtype_to_dlpack( - object tma_dt, + int tma_dt, uint8_t* out_code, uint8_t* out_bits, uint16_t* out_lanes, ) noexcept: - if tma_dt == TensorMapDataType.UINT8: + if tma_dt == _TMA_DT_UINT8: out_code[0] = kDLUInt out_bits[0] = 8 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.UINT16: + if tma_dt == _TMA_DT_UINT16: out_code[0] = kDLUInt out_bits[0] = 16 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.UINT32: + if tma_dt == _TMA_DT_UINT32: out_code[0] = kDLUInt out_bits[0] = 32 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.UINT64: + if tma_dt == _TMA_DT_UINT64: out_code[0] = kDLUInt out_bits[0] = 64 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.INT32: + if tma_dt == _TMA_DT_INT32: out_code[0] = kDLInt out_bits[0] = 32 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.INT64: + if tma_dt == _TMA_DT_INT64: out_code[0] = kDLInt out_bits[0] = 64 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.FLOAT16: + if tma_dt == _TMA_DT_FLOAT16: out_code[0] = kDLFloat out_bits[0] = 16 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.FLOAT32: + if tma_dt == _TMA_DT_FLOAT32: out_code[0] = kDLFloat out_bits[0] = 32 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.FLOAT64: + if tma_dt == _TMA_DT_FLOAT64: out_code[0] = kDLFloat out_bits[0] = 64 out_lanes[0] = 1 return True - if tma_dt == 
TensorMapDataType.BFLOAT16: + if tma_dt == _TMA_DT_BFLOAT16: out_code[0] = kDLBfloat out_bits[0] = 16 out_lanes[0] = 1 @@ -287,8 +412,6 @@ def _require_view_device(view, expected_device_id, operation): if device_type == _kDLCUDA and device_id != expected_device_id: raise ValueError( f"{operation} expects tensor on device {expected_device_id}, got {device_id}") - - cdef inline intptr_t _get_current_context_ptr() except? 0: cdef cydriver.CUcontext ctx with nogil: @@ -397,7 +520,8 @@ cdef class TensorMapDescriptor: return Device(self._device_id) @classmethod - def _from_tiled(cls, view, box_dim, *, + def _from_tiled(cls, view, box_dim=None, *, + options=None, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, @@ -410,16 +534,21 @@ cdef class TensorMapDescriptor: ---------- view : StridedMemoryView A device-accessible view with a 16-byte-aligned pointer. - box_dim : tuple of int + box_dim : tuple of int, optional The size of each tile dimension (in elements). Must have the same rank as the tensor and each value must be in [1, 256]. Specified in the same (row-major) order as the tensor shape. + Required unless ``options`` is provided. + options : TensorMapDescriptorOptions or mapping, optional + Bundled tiled-descriptor options. When provided, do not also pass + ``box_dim`` or the individual option kwargs. element_strides : tuple of int, optional Per-dimension element traversal strides. Default is all 1s. Specified in the same (row-major) order as the tensor shape. - data_type : TensorMapDataType, optional - Explicit data type override. If ``None``, inferred from the - tensor's dtype. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. interleave : TensorMapInterleave Interleave layout. Default ``NONE``. 
swizzle : TensorMapSwizzle @@ -441,6 +570,24 @@ cdef class TensorMapDescriptor: """ cdef TensorMapDescriptor desc = cls.__new__(cls) + opts = _coerce_tensor_map_descriptor_options( + box_dim, + options, + element_strides=element_strides, + data_type=data_type, + interleave=interleave, + swizzle=swizzle, + l2_promotion=l2_promotion, + oob_fill=oob_fill, + ) + box_dim = opts.box_dim + element_strides = opts.element_strides + data_type = opts.data_type + interleave = opts.interleave + swizzle = opts.swizzle + l2_promotion = opts.l2_promotion + oob_fill = opts.oob_fill + _validate_tensor_map_view(view) # Keep both the original tensor object and the validated view alive. # For DLPack exporters, the view may hold the owning capsule whose @@ -452,7 +599,7 @@ cdef class TensorMapDescriptor: _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_tiled") tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) + cdef int c_data_type_int = tma_dt cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int cdef intptr_t global_address = view.ptr @@ -551,7 +698,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "tiled", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } return desc @@ -614,7 +761,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "tiled", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } @@ -651,9 +798,10 @@ cdef class TensorMapDescriptor: Number of pixels per column. element_strides : tuple of int, optional Per-dimension element traversal strides. Default is all 1s. - data_type : TensorMapDataType, optional - Explicit data type override. If ``None``, inferred from the - tensor's dtype. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. 
Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. interleave : TensorMapInterleave Interleave layout. Default ``NONE``. swizzle : TensorMapSwizzle @@ -683,7 +831,7 @@ cdef class TensorMapDescriptor: _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col") tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) + cdef int c_data_type_int = tma_dt cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int cdef intptr_t global_address = view.ptr @@ -767,7 +915,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "im2col", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } @@ -803,9 +951,10 @@ cdef class TensorMapDescriptor: Number of pixels per column. element_strides : tuple of int, optional Per-dimension element traversal strides. Default is all 1s. - data_type : TensorMapDataType, optional - Explicit data type override. If ``None``, inferred from the - tensor's dtype. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. interleave : TensorMapInterleave Interleave layout. Default ``NONE``. 
mode : TensorMapIm2ColWideMode @@ -841,7 +990,7 @@ cdef class TensorMapDescriptor: _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col_wide") tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) + cdef int c_data_type_int = tma_dt cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int cdef intptr_t global_address = view.ptr @@ -909,7 +1058,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "im2col_wide", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 099140354c..3fb5f9808a 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -15,6 +15,7 @@ from cuda.core._dlpack import DLDeviceType from cuda.core._tensor_map import ( TensorMapDataType, + TensorMapDescriptorOptions, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -145,6 +146,38 @@ def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): ) assert desc is not None + def test_strided_memory_view_as_tensor_map_options(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + desc = view.as_tensor_map( + options=TensorMapDescriptorOptions( + box_dim=(32, 32), + data_type=np.float32, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + ) + ) + assert desc is not None + + def test_strided_memory_view_as_tensor_map_options_dict(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + desc = _as_view(buf).as_tensor_map( + options={ + "box_dim": (64,), + "data_type": np.float32, + "element_strides": (1,), + } + ) + assert desc is not None + + def test_strided_memory_view_as_tensor_map_rejects_options_with_kwargs(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(TypeError, match="Specify 
either options or the individual tensor map arguments"): + _as_view(buf).as_tensor_map( + box_dim=(64,), + options=TensorMapDescriptorOptions(box_dim=(64,)), + ) + def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) From ad1c8009bfc3085ae6949eef9ee831a91d74fb37 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:42:18 -0400 Subject: [PATCH 23/25] Keep the rebased TensorMap validation helper consistent. Remove the stale Cython-only `_require_view_device` definition left behind while porting the TensorMap fixes onto the PR head branch so the extension builds against the newer managed-memory-aware helper. Made-with: Cursor --- cuda_core/cuda/core/_tensor_map.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index c101ab1963..e1e5daa9fd 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -427,18 +427,6 @@ cdef inline int _get_current_device_id() except -1: HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) return dev - -cdef inline int _require_view_device( - view, - int device_id, - object caller, -) except -1: - if view.device_id != device_id: - raise ValueError( - f"{caller} expects tensor on device {device_id}, got {view.device_id}") - return 0 - - def _compute_byte_strides(shape, strides, elem_size): """Compute byte strides from element strides or C-contiguous fallback. From a1203ac9d1ff30492c90fd0cc6b05da7064702f9 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:44:34 -0400 Subject: [PATCH 24/25] Apply the pre-commit fixes for the rebased TensorMap branch. 
Add the missing SPDX header on the new `_memoryview.pxd` file and keep the test module formatted the way `ruff format` expects so pre-commit.ci can clear on the live PR branch. Made-with: Cursor --- cuda_core/cuda/core/_memoryview.pxd | 4 ++++ cuda_core/tests/test_tensor_map.py | 1 + 2 files changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/_memoryview.pxd b/cuda_core/cuda/core/_memoryview.pxd index c12aa38ad8..5b50ae6dc7 100644 --- a/cuda_core/cuda/core/_memoryview.pxd +++ b/cuda_core/cuda/core/_memoryview.pxd @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + from libc.stdint cimport intptr_t from cuda.core._dlpack cimport DLTensor diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 3fb5f9808a..967b96b701 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -62,6 +62,7 @@ def __init__(self, device_type, device_id): def __dlpack_device__(self): return (self._device_type, self._device_id) + def _as_view(obj): if isinstance(obj, StridedMemoryView): return obj From 3c9e32daa649816ccba84d4d9335ba4d8cbb2b2c Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:16:40 -0400 Subject: [PATCH 25/25] Keep the TensorMap multi-GPU tests on the view-based API. Replace the last stale `TensorMapDescriptor.from_tiled()` call sites with `StridedMemoryView.as_tensor_map()` so the multi-device CI coverage exercises the constructor path that actually exists on this branch. 
Made-with: Cursor --- cuda_core/tests/test_tensor_map.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 967b96b701..9ca8790d2b 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -376,8 +376,7 @@ def test_replace_address_rejects_tensor_from_other_device(self, dev, skip_if_no_ dev0.set_current() buf0 = dev0.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf0, + desc = _as_view(buf0).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -399,8 +398,7 @@ def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cud skip_if_managed_memory_unsupported(dev1) dev1.set_current() - desc = TensorMapDescriptor.from_tiled( - dev1.allocate(1024 * 4), + desc = _as_view(dev1.allocate(1024 * 4)).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -427,10 +425,9 @@ def test_from_tiled_rejects_tensor_from_other_device(self, init_cuda): with pytest.raises( ValueError, - match=r"TensorMapDescriptor\.from_tiled expects tensor on device 0, got 1", + match=r"TensorMapDescriptor\._from_tiled expects tensor on device 0, got 1", ): - TensorMapDescriptor.from_tiled( - buf1, + _as_view(buf1).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -448,8 +445,7 @@ def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda): mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id)) managed_buf = mr.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - managed_buf, + desc = _as_view(managed_buf).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, )