From e3e1899206b77ef700f66d4fe89cde23f715b63b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 23 Feb 2026 14:02:32 -0800 Subject: [PATCH 01/25] initial commit --- cuda_core/cuda/core/__init__.py | 8 + cuda_core/cuda/core/_kernel_arg_handler.pyx | 24 + cuda_core/cuda/core/_tensor_map.pxd | 12 + cuda_core/cuda/core/_tensor_map.pyx | 524 ++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 195 ++++++++ cuda_core/tests/test_tensor_map.py | 279 +++++++++++ 6 files changed, 1042 insertions(+) create mode 100644 cuda_core/cuda/core/_tensor_map.pxd create mode 100644 cuda_core/cuda/core/_tensor_map.pyx create mode 100644 cuda_core/examples/tma_tensor_map.py create mode 100644 cuda_core/tests/test_tensor_map.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index dfba887144..84e4f3d356 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,3 +68,11 @@ Stream, StreamOptions, ) +from cuda.core._tensor_map import ( + TensorMapDataType, + TensorMapDescriptor, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 882ca5eaab..e88def13cd 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -6,6 +6,7 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdint cimport (intptr_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t,) +from libc.string cimport memcpy from libcpp cimport bool as cpp_bool from libcpp.complex cimport complex as cpp_complex from libcpp cimport nullptr @@ -16,6 +17,8 @@ import ctypes import numpy from 
cuda.core._memory import Buffer +from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py +from cuda.core._tensor_map cimport TensorMapDescriptor from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver @@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64 cdef object numpy_complex128 = numpy.complex128 +cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py + + # limitation due to cython/cython#534 ctypedef void* voidptr @@ -124,6 +130,18 @@ cdef inline int prepare_arg( return 0 +cdef inline int prepare_tensor_map_arg( + vector.vector[void*]& data, + vector.vector[void*]& data_addresses, + TensorMapDescriptor arg, + const size_t idx) except -1: + cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + data_addresses[idx] = ptr + data[idx] = ptr + return 0 + + cdef inline int prepare_ctypes_arg( vector.vector[void*]& data, vector.vector[void*]& data_addresses, @@ -273,6 +291,9 @@ cdef class ParamHolder: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) continue + elif arg_type is tensor_map_descriptor_type: + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue elif arg_type is bool: prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i) continue @@ -322,6 +343,9 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue + elif isinstance(arg, tensor_map_descriptor_type): + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd new file mode 100644 index 0000000000..3cfd571d89 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -0,0 +1,12 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class TensorMapDescriptor: + cdef cydriver.CUtensorMap _tensor_map + cdef object _source_ref + + cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx new file mode 100644 index 0000000000..9375acf944 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t, uint32_t, uint64_t +from cuda.bindings cimport cydriver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +import enum + +import numpy + +from cuda.core._memoryview import StridedMemoryView + + +try: + from ml_dtypes import bfloat16 as ml_bfloat16 +except ImportError: + ml_bfloat16 = None + + +class TensorMapDataType(enum.IntEnum): + """Data types for tensor map descriptors. + + These correspond to the ``CUtensorMapDataType`` driver enum values. + """ + UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8 + UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16 + UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32 + INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32 + UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64 + INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64 + FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16 + FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32 + FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64 + BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 + FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ + TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 + TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ + + +class TensorMapInterleave(enum.IntEnum): + """Interleave layout for tensor map descriptors. 
+ + These correspond to the ``CUtensorMapInterleave`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE + INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B + INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B + + +class TensorMapSwizzle(enum.IntEnum): + """Swizzle mode for tensor map descriptors. + + These correspond to the ``CUtensorMapSwizzle`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE + SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B + SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B + SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B + + +class TensorMapL2Promotion(enum.IntEnum): + """L2 promotion mode for tensor map descriptors. + + These correspond to the ``CUtensorMapL2promotion`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE + L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B + L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B + L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B + + +class TensorMapOOBFill(enum.IntEnum): + """Out-of-bounds fill mode for tensor map descriptors. + + These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values. 
+ """ + NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + + +# Mapping from numpy dtype to TMA data type +_NUMPY_DTYPE_TO_TMA = { + numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, + numpy.dtype(numpy.uint16): TensorMapDataType.UINT16, + numpy.dtype(numpy.uint32): TensorMapDataType.UINT32, + numpy.dtype(numpy.int32): TensorMapDataType.INT32, + numpy.dtype(numpy.uint64): TensorMapDataType.UINT64, + numpy.dtype(numpy.int64): TensorMapDataType.INT64, + numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16, + numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32, + numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64, +} + +if ml_bfloat16 is not None: + _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16 + + +# Mapping from TMA data type to element size in bytes +_TMA_DATA_TYPE_SIZE = { + TensorMapDataType.UINT8: 1, + TensorMapDataType.UINT16: 2, + TensorMapDataType.UINT32: 4, + TensorMapDataType.INT32: 4, + TensorMapDataType.UINT64: 8, + TensorMapDataType.INT64: 8, + TensorMapDataType.FLOAT16: 2, + TensorMapDataType.FLOAT32: 4, + TensorMapDataType.FLOAT64: 8, + TensorMapDataType.BFLOAT16: 2, + TensorMapDataType.FLOAT32_FTZ: 4, + TensorMapDataType.TFLOAT32: 4, + TensorMapDataType.TFLOAT32_FTZ: 4, +} + + +def _resolve_data_type(view, data_type): + """Resolve the TMA data type from an explicit value or the view's dtype.""" + + if data_type is not None: + if not isinstance(data_type, TensorMapDataType): + raise TypeError( + f"data_type must be a TensorMapDataType, got {type(data_type)}") + return data_type + + dt = view.dtype + if dt is None: + raise ValueError( + "Cannot infer TMA data type from the tensor; " + "please specify data_type explicitly") + + tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) + if tma_dt is None: + raise ValueError( + f"Unsupported dtype {dt} for TMA; " + f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}. 
" + "You may also specify data_type explicitly.") + + return tma_dt + + +cdef class TensorMapDescriptor: + """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. + + A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct + used by the hardware TMA unit for efficient bulk data movement between + global and shared memory. + + Instances are created via the class methods :meth:`from_tiled` and + :meth:`from_im2col`, and can be passed directly to + :func:`~cuda.core.launch` as a kernel argument. + """ + + def __init__(self): + raise RuntimeError( + "TensorMapDescriptor cannot be instantiated directly. " + "Use TensorMapDescriptor.from_tiled() or " + "TensorMapDescriptor.from_im2col().") + + cdef void* _get_data_ptr(self): + return &self._tensor_map + + @staticmethod + def from_tiled(tensor, box_dim, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create a tiled TMA descriptor from a tensor object. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + box_dim : tuple of int + The size of each tile dimension (in elements). Must have the + same rank as the tensor and each value must be in [1, 256]. + Specified in the same (row-major) order as the tensor shape. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + Specified in the same (row-major) order as the tensor shape. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. 
+ l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [1, 5], the pointer is not + 16-byte aligned, or dimension/stride constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + # Keep a strong reference to prevent GC + desc._source_ref = tensor + + # Resolve data type + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + # Get tensor metadata + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides # in elements, can be None for C-contiguous + + cdef int rank = len(shape) + if rank < 1 or rank > 5: + raise ValueError( + f"Tensor rank must be between 1 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") + + if len(box_dim) != rank: + raise ValueError( + f"box_dim must have {rank} elements (same as tensor rank), " + f"got {len(box_dim)}") + + for i, bd in enumerate(box_dim): + if bd < 1 or bd > 256: + raise ValueError( + f"box_dim[{i}] must be in [1, 256], got {bd}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + # Compute byte strides from element strides + cdef int elem_size = 
_TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + # C-contiguous: strides in bytes, row-major + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse dimensions for column-major cuTensorMap convention + # Python/DLPack: row-major (dim 0 = outermost) + # cuTensorMap: column-major (dim 0 = innermost) + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides # rank - 1 elements + cdef uint32_t[5] c_box_dim + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + # Reverse: Python dim i -> cuTensorMap dim (rank - 1 - i) + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_box_dim[i_c] = box_dim[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + # globalStrides: rank-1 elements (byte strides for dims 1..N-1 in col-major order) + # The innermost stride (dim 0) is implicit = element size + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_interleave_int = int(interleave) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeTiled( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_box_dim, + c_element_strides, + c_interleave, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + return desc + + @staticmethod + def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + channels_per_pixel, 
pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col TMA descriptor from a tensor object. + + Im2col layout is used for convolution-style data access patterns. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner : tuple of int + Lower corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + pixel_box_upper_corner : tuple of int + Upper corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. 
+ """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + cdef int n_spatial = rank - 2 + if len(pixel_box_lower_corner) != n_spatial: + raise ValueError( + f"pixel_box_lower_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_lower_corner)}") + if len(pixel_box_upper_corner) != n_spatial: + raise ValueError( + f"pixel_box_upper_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_upper_corner)}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + 
cdef int[3] c_pixel_box_lower # max 3 spatial dims (rank 5 - 2) + cdef int[3] c_pixel_box_upper + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + # Reverse spatial dimensions for lower/upper corners + for i_c in range(n_spatial): + c_pixel_box_lower[i_c] = pixel_box_lower_corner[n_spatial - 1 - i_c] + c_pixel_box_upper[i_c] = pixel_box_upper_corner[n_spatial - 1 - i_c] + + cdef uint32_t c_rank = rank + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2col( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_pixel_box_lower, + c_pixel_box_upper, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + return desc + + def replace_address(self, tensor): + """Replace the global memory address in this tensor map descriptor. + + This is useful when the tensor data has been reallocated but the + shape, strides, and other parameters remain the same. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. 
+ """ + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + cdef intptr_t global_address = view.ptr + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + self._source_ref = tensor + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( + &self._tensor_map, + global_address, + )) + + def __repr__(self): + return f"TensorMapDescriptor()" diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py new file mode 100644 index 0000000000..de51570ec4 --- /dev/null +++ b/cuda_core/examples/tma_tensor_map.py @@ -0,0 +1,195 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors +# with cuda.core on Hopper+ GPUs (compute capability >= 9.0). +# +# TMA enables efficient bulk data movement between global and shared memory using +# hardware-managed tensor map descriptors. This example shows: +# +# 1. Creating a TMA tiled descriptor from a CuPy device array +# 2. Passing the descriptor to a kernel via launch() +# 3. Using TMA to load tiles into shared memory (via inline PTX) +# 4. 
Updating the descriptor's source address with replace_address() +# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data on the device +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +# --------------------------------------------------------------------------- +# 2) Create a TMA tiled descriptor +# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# The dtype (float32) is inferred automatically from the CuPy array. 
+# --------------------------------------------------------------------------- +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +# --------------------------------------------------------------------------- +# 3) Launch the kernel +# The TensorMapDescriptor is passed directly as a kernel argument — the +# 128-byte struct is copied into kernel parameter space automatically. +# --------------------------------------------------------------------------- +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 4) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py new file mode 100644 index 0000000000..0aa5968a85 --- /dev/null +++ b/cuda_core/tests/test_tensor_map.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import numpy as np + +from cuda.core import ( + Device, + TensorMapDescriptor, + TensorMapDataType, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) + + +@pytest.fixture +def dev(init_cuda): + return Device() + + +@pytest.fixture +def skip_if_no_tma(dev): + if not dev.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + + +def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): + """Allocate a device buffer and return it with proper alignment.""" + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) + return buf + + +class TestTensorMapEnums: + """Test that enum wrappers expose the expected values.""" + + def test_data_type_values(self): + assert TensorMapDataType.UINT8 == 0 + assert TensorMapDataType.FLOAT32 == 7 + assert TensorMapDataType.FLOAT64 == 8 + assert TensorMapDataType.BFLOAT16 == 9 + + def test_interleave_values(self): + assert TensorMapInterleave.NONE == 0 + assert TensorMapInterleave.INTERLEAVE_16B == 1 + assert TensorMapInterleave.INTERLEAVE_32B == 2 + + def test_swizzle_values(self): + assert TensorMapSwizzle.NONE == 0 + assert TensorMapSwizzle.SWIZZLE_32B == 1 + assert TensorMapSwizzle.SWIZZLE_64B == 2 + assert TensorMapSwizzle.SWIZZLE_128B == 3 + + def test_l2_promotion_values(self): + assert TensorMapL2Promotion.NONE == 0 + assert TensorMapL2Promotion.L2_64B == 1 + assert TensorMapL2Promotion.L2_128B == 2 + assert TensorMapL2Promotion.L2_256B == 3 + + def test_oob_fill_values(self): + assert TensorMapOOBFill.NONE == 0 + assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + + +class TestTensorMapDescriptorCreation: + """Test TensorMapDescriptor factory methods.""" + + def test_cannot_instantiate_directly(self): + with pytest.raises(RuntimeError, match="cannot be instantiated directly"): + 
TensorMapDescriptor() + + def test_from_tiled_1d(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) # 1024 float32 elements + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + assert repr(desc) == "TensorMapDescriptor()" + + def test_from_tiled_2d(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_3d(self, dev, skip_if_no_tma): + buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(8, 8, 8), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + ) + assert desc is not None + + def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + l2_promotion=TensorMapL2Promotion.L2_128B, + ) + assert desc is not None + + def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, + ) + assert desc is not None + + +class TestTensorMapDescriptorValidation: + """Test validation in TensorMapDescriptor factory methods.""" + + def test_invalid_rank_zero(self, dev, skip_if_no_tma): + buf = dev.allocate(64) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(), + 
data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="box_dim must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_out_of_range(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(512,), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="element_strides must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(1, 1), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_invalid_data_type(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=42, + ) + + +class TestTensorMapDtypeMapping: + """Test automatic dtype inference from numpy dtypes.""" + + @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + (np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ]) + def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt + + def test_bfloat16_mapping(self): + try: + from ml_dtypes import bfloat16 + from cuda.core._tensor_map import 
_NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 + except ImportError: + pytest.skip("ml_dtypes not installed") + + +class TestTensorMapReplaceAddress: + """Test replace_address functionality.""" + + def test_replace_address(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + buf2 = dev.allocate(1024 * 4) + desc.replace_address(buf2) + # No exception means success + + def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + # Create a host-only array (not device-accessible) + host_arr = np.zeros(1024, dtype=np.float32) + with pytest.raises(ValueError, match="device-accessible"): + desc.replace_address(host_arr) + + +class TestTensorMapIm2col: + """Test im2col TMA descriptor creation.""" + + def test_from_im2col_3d(self, dev, skip_if_no_tma): + # 3D tensor: batch=1, height=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0,), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(), + pixel_box_upper_corner=(), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + + def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1 * 32 * 64 * 4) + with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): + 
TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 77a3c8ecab4734eacbf7f36af964b103ca2ecf69 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 13:53:49 -0800 Subject: [PATCH 02/25] tma wide --- cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 208 +++++++++++++++++++++++++++- cuda_core/pixi.toml | 13 +- cuda_core/tests/test_tensor_map.py | 133 +++++++++++++++++- 5 files changed, 341 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 3cfd571d89..fa2254ba70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,5 +8,6 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9375acf944..6b072f7404 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -80,6 +80,16 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA +class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. 
+ Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + + # Mapping from numpy dtype to TMA data type _NUMPY_DTYPE_TO_TMA = { numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, @@ -316,6 +326,13 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc @staticmethod @@ -483,6 +500,176 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "im2col", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc + + @staticmethod + def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + channels_per_pixel, pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + mode=TensorMapIm2ColWideMode.W, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col-wide TMA descriptor from a tensor object. + + Im2col-wide layout loads elements exclusively along the W (width) + dimension. This variant is supported on compute capability 10.0+ + (Blackwell and later). + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner_width : int + Lower corner of the pixel bounding box along the W dimension. + pixel_box_upper_corner_width : int + Upper corner of the pixel bounding box along the W dimension. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. 
+ data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + mode : TensorMapIm2ColWideMode + Im2col wide mode. Default ``W``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``SWIZZLE_128B``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = 
elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc def replace_address(self, tensor): @@ -512,13 +699,28 @@ cdef class TensorMapDescriptor: f"Global memory address must be 16-byte aligned, " f"got address 0x{global_address:x}") - self._source_ref = tensor - with nogil: 
HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( &self._tensor_map, global_address, )) + # Update the source reference only after the driver call succeeds, + # so we don't drop the old tensor (risking a dangling pointer in the + # CUtensorMap struct) if the call fails. + self._source_ref = tensor + def __repr__(self): - return f"TensorMapDescriptor()" + info = self._repr_info + if info is None: + return "TensorMapDescriptor()" + parts = [] + if "method" in info: + parts.append(info["method"]) + if "rank" in info: + parts.append(f"rank={info['rank']}") + if "data_type" in info: + parts.append(f"dtype={info['data_type'].name}") + if "swizzle" in info: + parts.append(f"swizzle={info['swizzle'].name}") + return f"TensorMapDescriptor({', '.join(parts)})" diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index 9dc6ac1ed9..e351fb63b6 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,16 +75,9 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -[feature.cu12.system-requirements] -cuda = "12" - -[feature.cu12.dependencies] -cuda-version = "12.*" - -# We keep both cu12 and cu13 because cuda.core works with either major version -# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) -# For cu12 testing, use conda-forge packages: temporarily change path to wildcard -# or skip cu12 locally: pixi run -e cu13 test +# NOTE: cu12 environment is intentionally omitted because the path dependency +# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, +# use conda-forge packages or CI workflows. 
[environments] default = { features = [ "cu13", diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 0aa5968a85..f57e73ded8 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -9,6 +9,7 @@ Device, TensorMapDescriptor, TensorMapDataType, + TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, TensorMapOOBFill, @@ -36,6 +37,23 @@ def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): return buf +class _DeviceArray: + """Wrap a Buffer with explicit shape via __cuda_array_interface__. + + dev.allocate() returns a 1D byte buffer. For multi-dimensional TMA tests + we need the tensor to report a proper shape/dtype so the TMA encoder sees + the correct rank, dimensions, and strides. + """ + def __init__(self, buf, shape, dtype=np.float32): + self._buf = buf # prevent GC + self.__cuda_array_interface__ = { + "shape": tuple(shape), + "typestr": np.dtype(dtype).str, + "data": (int(buf.handle), False), + "version": 3, + } + + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -66,6 +84,10 @@ def test_oob_fill_values(self): assert TensorMapOOBFill.NONE == 0 assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + def test_im2col_wide_mode_values(self): + assert TensorMapIm2ColWideMode.W == 0 + assert TensorMapIm2ColWideMode.W128 == 1 + class TestTensorMapDescriptorCreation: """Test TensorMapDescriptor factory methods.""" @@ -82,7 +104,7 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) assert desc is not None - assert repr(desc) == "TensorMapDescriptor()" + assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)" def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 @@ -102,6 +124,18 @@ def test_from_tiled_3d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_5d(self, dev, skip_if_no_tma): + # 5D: 
exercises all 5 c_global_dim / 4 c_global_strides slots + shape = (2, 4, 4, 4, 8) + n_bytes = 2 * 4 * 4 * 4 * 8 * 4 # float32 + buf = dev.allocate(n_bytes) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(1, 2, 2, 2, 8), + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) desc = TensorMapDescriptor.from_tiled( @@ -277,3 +311,100 @@ def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): pixels_per_column=4, data_type=TensorMapDataType.FLOAT32, ) + + def test_from_im2col_4d(self, dev, skip_if_no_tma): + # NHWC layout: N=1, H=8, W=8, C=64 — 2 spatial dims + # Exercises spatial corner reversal with n_spatial=2: + # Python [H_lower, W_lower] -> driver [W_lower, H_lower] + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4, 4), + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_5d(self, dev, skip_if_no_tma): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 — 3 spatial dims + # Exercises the full spatial corner reversal: + # Python [D, H, W] -> driver [W, H, D] + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0, 0), + pixel_box_upper_corner=(2, 4, 4), + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + +class TestTensorMapIm2colWide: + """Test im2col-wide TMA descriptor creation (compute capability 10.0+).""" + + @pytest.fixture + def skip_if_no_im2col_wide(self, dev): + cc = dev.compute_capability + if cc.major < 10: + pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + + def test_from_im2col_wide_3d(self, dev, 
skip_if_no_im2col_wide): + # 3D tensor: batch=1, width=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_wide_4d(self, dev, skip_if_no_im2col_wide): + # NHWC layout: N=1, H=8, W=8, C=64 + # Wide mode only uses scalar W corners, even with higher rank + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 + # Max rank boundary — verifies all 5 dim/stride slots are filled + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 19c4a0f6ce25b30faffec5dbb0b78dca2496af8f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:15:56 -0800 Subject: [PATCH 03/25] clean up --- cuda_core/cuda/core/_tensor_map.pxd | 2 +- cuda_core/cuda/core/_tensor_map.pyx | 185 ++++++++++----------------- 
cuda_core/examples/tma_tensor_map.py | 2 +- cuda_core/tests/test_tensor_map.py | 2 +- 4 files changed, 72 insertions(+), 119 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index fa2254ba70..b2b165b5aa 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6b072f7404..9edcccb928 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -150,6 +150,55 @@ def _resolve_data_type(view, data_type): return tma_dt +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + # stream_ptr=-1: no stream synchronization needed because descriptor + # creation only reads tensor metadata, it does not move data. + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + if view.ptr % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{view.ptr:x}") + + return view + + +def _compute_byte_strides(shape, strides, elem_size): + """Compute byte strides from element strides or C-contiguous fallback. + + Returns a tuple of byte strides in row-major order. 
+ """ + if strides is not None: + return tuple(s * elem_size for s in strides) + + # C-contiguous: compute byte strides from shape, innermost first + rank = len(shape) + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + return tuple(byte_strides) + + +def _validate_element_strides(element_strides, rank): + """Validate or default element_strides to all-ones.""" + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + return element_strides + return (1,) * rank + + cdef class TensorMapDescriptor: """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. @@ -171,8 +220,8 @@ cdef class TensorMapDescriptor: cdef void* _get_data_ptr(self): return &self._tensor_map - @staticmethod - def from_tiled(tensor, box_dim, *, + @classmethod + def from_tiled(cls, tensor, box_dim, *, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, @@ -216,40 +265,23 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [1, 5], the pointer is not 16-byte aligned, or dimension/stride constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + cdef TensorMapDescriptor desc = cls.__new__(cls) - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") - - # Keep a strong reference to prevent GC + view = _get_validated_view(tensor) desc._source_ref = tensor - # Resolve data type tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - # Get tensor metadata cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides # in elements, can be None for C-contiguous cdef int rank = len(shape) if rank < 1 or rank > 5: raise ValueError( f"Tensor rank must be between 1 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") - if len(box_dim) != rank: raise ValueError( f"box_dim must have {rank} elements (same as tensor rank), " @@ -260,25 +292,10 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) - # Compute byte strides from element strides cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - # C-contiguous: strides in bytes, row-major - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - 
stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse dimensions for column-major cuTensorMap convention # Python/DLPack: row-major (dim 0 = outermost) @@ -335,8 +352,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + @classmethod + def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -390,17 +407,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. """ - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -409,18 +418,12 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - cdef int n_spatial = rank - 2 if len(pixel_box_lower_corner) != n_spatial: raise ValueError( @@ -431,23 +434,10 @@ cdef class TensorMapDescriptor: f"pixel_box_upper_corner must have {n_spatial} elements " f"(rank - 2), got {len(pixel_box_upper_corner)}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - 
f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -509,8 +499,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + @classmethod + def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -565,17 +555,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -584,35 +566,16 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -685,19 +648,9 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + view = _get_validated_view(tensor) cdef intptr_t global_address = view.ptr - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") with nogil: HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index de51570ec4..e93c02dc08 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index f57e73ded8..790307857b 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import pytest From 35a04b97367b5cf39a493d2ab6c325e70d970cb8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:55:00 -0800 Subject: [PATCH 04/25] Add comments to prepare_tensor_map_arg explaining allocation and lifetime Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index e88def13cd..b55a4abc7a 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,8 +135,13 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + # Allocate a temporary buffer for the 128-byte CUtensorMap struct. + # We copy rather than pointing directly at arg._tensor_map for lifetime + # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, + # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
data_addresses[idx] = ptr data[idx] = ptr return 0 From bb19e4f5c03cb65fd0d53a1cd1a58a8159acaee8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:04:26 -0800 Subject: [PATCH 05/25] Address Copilot review feedback - Remove unused _alloc_device_tensor helper from tests - Add test for rank > 5 (6D tensor) to verify upper bound validation - Add NULL check for PyMem_Malloc in prepare_tensor_map_arg Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 2 ++ cuda_core/tests/test_tensor_map.py | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index b55a4abc7a..5619ba7492 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -139,6 +139,8 @@ cdef inline int prepare_tensor_map_arg( # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + if ptr is NULL: + raise MemoryError("Failed to allocate memory for CUtensorMap") memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 790307857b..96dd14a419 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -28,14 +28,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") -def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): - """Allocate a device buffer and return it with proper alignment.""" - n_elements = 1 - for s in shape: - n_elements *= s - buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) - return buf - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -179,6 +171,19 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) + def test_invalid_rank_six(self, dev, skip_if_no_tma): + shape = (2, 2, 2, 2, 2, 2) + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * 4) + arr = _DeviceArray(buf, shape) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + arr, + box_dim=(2,) * 6, + ) + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="box_dim must have 1 elements"): From 23a8900834a1b62a163664d3ab31fcbdf29bba02 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:20:05 -0800 Subject: [PATCH 06/25] Split TMA example into two focused files Move the replace_address() demonstration into its own self-contained example (tma_replace_address.py) so each file covers a single concept. 
Co-Authored-By: Claude Opus 4.6 --- cuda_core/examples/tma_replace_address.py | 189 ++++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 20 --- 2 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 cuda_core/examples/tma_replace_address.py diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py new file mode 100644 index 0000000000..7f11cf551c --- /dev/null +++ b/cuda_core/examples/tma_replace_address.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use replace_address() to repoint a TMA +# (Tensor Memory Accelerator) descriptor at a different tensor without +# rebuilding the descriptor from scratch. +# +# The workflow is: +# +# 1. Create a TMA tiled descriptor and launch a kernel to verify it works +# 2. Allocate a second tensor with different content +# 3. Call replace_address() to repoint the same descriptor at the new tensor +# 4. Re-launch the kernel and verify it reads from the new tensor +# +# This is useful when the tensor layout (shape, dtype, tile size) stays the +# same but the underlying data buffer changes, e.g. double-buffering or +# iterating over a sequence of same-shaped tensors. 
+# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data and verify the initial TMA copy +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 2) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch 
without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index e93c02dc08..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -13,7 +13,6 @@ # 1. Creating a TMA tiled descriptor from a CuPy device array # 2. Passing the descriptor to a kernel via launch() # 3. Using TMA to load tiles into shared memory (via inline PTX) -# 4. Updating the descriptor's source address with replace_address() # # Requirements: # - Hopper or later GPU (compute capability >= 9.0) @@ -174,22 +173,3 @@ assert cp.array_equal(output, a), "TMA copy produced incorrect results" print(f"TMA copy verified: {N} elements across {n_tiles} tiles") - -# --------------------------------------------------------------------------- -# 4) Demonstrate replace_address() -# Create a second tensor with different content, point the *same* -# descriptor at it, and re-launch without rebuilding the descriptor. 
-# --------------------------------------------------------------------------- -b = cp.full(N, fill_value=42.0, dtype=cp.float32) -dev.sync() - -tensor_map.replace_address(b) - -output2 = cp.zeros(N, dtype=cp.float32) -dev.sync() - -launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output2, b), "replace_address produced incorrect results" -print("replace_address verified: descriptor reused with new source tensor") From 0a1b7202b9bbaee509c8a6b44cc47edb9d51d03b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 14:14:38 -0800 Subject: [PATCH 07/25] pre-commit --- cuda_core/cuda/core/__init__.py | 1 + cuda_core/examples/tma_replace_address.py | 1 - cuda_core/examples/tma_tensor_map.py | 1 - cuda_core/tests/test_tensor_map.py | 37 ++++++++++++----------- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..84e4f3d356 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,6 +60,7 @@ StridedMemoryView, args_viewable_as_strided_memory, ) +<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..21fb91c254 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,7 +30,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..c85c5d96bd 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,7 +25,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git 
a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 96dd14a419..1d6e8195b6 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,14 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import pytest - import numpy as np - +import pytest from cuda.core import ( Device, - TensorMapDescriptor, TensorMapDataType, + TensorMapDescriptor, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -28,7 +26,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -36,6 +33,7 @@ class _DeviceArray: we need the tensor to report a proper shape/dtype so the TMA encoder sees the correct rank, dimensions, and strides. """ + def __init__(self, buf, shape, dtype=np.float32): self._buf = buf # prevent GC self.__cuda_array_interface__ = { @@ -225,25 +223,30 @@ def test_invalid_data_type(self, dev, skip_if_no_tma): class TestTensorMapDtypeMapping: """Test automatic dtype inference from numpy dtypes.""" - @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ - (np.uint8, TensorMapDataType.UINT8), - (np.uint16, TensorMapDataType.UINT16), - (np.uint32, TensorMapDataType.UINT32), - (np.int32, TensorMapDataType.INT32), - (np.uint64, TensorMapDataType.UINT64), - (np.int64, TensorMapDataType.INT64), - (np.float16, TensorMapDataType.FLOAT16), - (np.float32, TensorMapDataType.FLOAT32), - (np.float64, TensorMapDataType.FLOAT64), - ]) + @pytest.mark.parametrize( + "np_dtype,expected_tma_dt", + [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + 
(np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ], + ) def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt def test_bfloat16_mapping(self): try: - from ml_dtypes import bfloat16 from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + from ml_dtypes import bfloat16 + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 44fbdcf80f7c917bbd9ef790da0c7001cd2b747f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:07:54 -0800 Subject: [PATCH 08/25] adding stride meta data to gpu allocated memory --- cuda_core/tests/test_tensor_map.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 1d6e8195b6..3e030dd29d 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -98,8 +98,9 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -107,8 +108,9 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + tensor = _DeviceArray(buf, (16, 16, 16)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(8, 8, 8), data_type=TensorMapDataType.FLOAT32, ) @@ -128,8 +130,9 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = 
TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, swizzle=TensorMapSwizzle.SWIZZLE_128B, @@ -138,8 +141,9 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, l2_promotion=TensorMapL2Promotion.L2_128B, @@ -148,8 +152,9 @@ def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, @@ -162,9 +167,10 @@ class TestTensorMapDescriptorValidation: def test_invalid_rank_zero(self, dev, skip_if_no_tma): buf = dev.allocate(64) + tensor = _DeviceArray(buf, ()) # 0-dim tensor with pytest.raises(ValueError, match="rank must be between 1 and 5"): TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(), data_type=TensorMapDataType.FLOAT32, ) @@ -286,8 +292,9 @@ class TestTensorMapIm2col: def test_from_im2col_3d(self, dev, skip_if_no_tma): # 3D tensor: batch=1, height=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col( - buf, + tensor, pixel_box_lower_corner=(0,), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -310,9 +317,10 @@ def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) # 3D: n_spatial = 1 with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): TensorMapDescriptor.from_im2col( - buf, + tensor, 
pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4,), channels_per_pixel=64, From bdf39a24efcb528a2610967ea5f6459c0fee2472 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:59:31 -0800 Subject: [PATCH 09/25] im2col fixes --- cuda_core/tests/test_tensor_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 3e030dd29d..8b5e27ce6f 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -373,8 +373,9 @@ def skip_if_no_im2col_wide(self, dev): def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col_wide( - buf, + tensor, pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, From 96a3e84275f8d88e6476b1e71ef04bf6fedba6c8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 3 Mar 2026 18:41:07 -0500 Subject: [PATCH 10/25] Reuse CCCL TMA descriptor construction for tiled TensorMap and keep validated views alive to avoid DLPack-backed pointer lifetime hazards. Add explicit tiled element-stride coverage and acknowledge the DLPack include-layout compatibility follow-up in NVIDIA/cccl#7871. 
Made-with: Cursor --- .gitignore | 1 + cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_cpp/tensor_map.cpp | 149 +++++++++ cuda_core/cuda/core/_cpp/tensor_map_cccl.h | 43 +++ cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 358 ++++++++++++++++----- cuda_core/examples/tma_replace_address.py | 1 + cuda_core/examples/tma_tensor_map.py | 1 + cuda_core/tests/test_tensor_map.py | 27 +- 9 files changed, 497 insertions(+), 85 deletions(-) create mode 100644 cuda_core/cuda/core/_cpp/tensor_map.cpp create mode 100644 cuda_core/cuda/core/_cpp/tensor_map_cccl.h diff --git a/.gitignore b/.gitignore index 7d9bcd38c3..18c919ca5e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp +!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_cpp/tensor_map.cpp b/cuda_core/cuda/core/_cpp/tensor_map.cpp new file mode 100644 index 0000000000..af09aa1b2f --- /dev/null +++ b/cuda_core/cuda/core/_cpp/tensor_map.cpp @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "tensor_map_cccl.h" + +#include + +#include +#include + +#if defined(__has_include) +# if __has_include() +# include +# define CUDA_CORE_HAS_CUDA_TMA 1 +# else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# endif +# if __has_include() +# include +# define CUDA_CORE_HAS_DLPACK_H 1 +# else +# define CUDA_CORE_HAS_DLPACK_H 0 +# endif +#else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# define CUDA_CORE_HAS_DLPACK_H 0 +#endif + +static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept +{ + if (!err || cap == 0) + return; + if (!msg) + { + err[0] = '\0'; + return; + } + size_t n = ::strlen(msg); + if (n >= cap) + n = cap - 1; + ::memcpy(err, msg, n); + err[n] = '\0'; +} + +int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept +{ +#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H) + (void)out_tensor_map; + (void)data; + (void)device_type; + (void)device_id; + (void)ndim; + (void)shape; + (void)strides; + (void)dtype_code; + (void)dtype_bits; + (void)dtype_lanes; + (void)box_sizes; + (void)elem_strides; + (void)interleave_layout; + (void)swizzle; + (void)l2_fetch_size; + (void)oob_fill; + cuda_core_write_err(err, err_cap, "CCCL and/or not available at build time"); + return 1; +#else + try + { + if (!out_tensor_map) + { + cuda_core_write_err(err, err_cap, "out_tensor_map is NULL"); + return 1; + } + if (!data) + { + cuda_core_write_err(err, err_cap, "tensor data pointer is NULL"); + return 1; + } + if (!shape || !box_sizes || ndim <= 0) + { + cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes"); + return 1; + } + + DLTensor t{}; 
+ t.data = data;
+ t.device = {static_cast<DLDeviceType>(device_type), device_id};
+ t.ndim = ndim;
+ t.dtype.code = dtype_code;
+ t.dtype.bits = dtype_bits;
+ t.dtype.lanes = dtype_lanes;
+ // CCCL promises not to mutate the arrays, but DLPack uses non-const pointers.
+ t.shape = const_cast<int64_t*>(shape);
+ t.strides = const_cast<int64_t*>(strides);
+ t.byte_offset = 0;
+int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, // length ndim + const int64_t* strides, // length ndim, or NULL for contiguous + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, // length ndim + const int* elem_strides, // length ndim, or NULL for all-ones overload + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CUDA_CORE_TENSOR_MAP_CCCL_H_ diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index b2b165b5aa..07e8adebc9 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _view_ref cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9edcccb928..6054add9b6 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -2,9 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t, uint32_t, uint64_t +from libc.stdint cimport intptr_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t +from libc.stddef cimport size_t from cuda.bindings cimport cydriver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._dlpack cimport kDLInt, kDLUInt, kDLFloat, kDLBfloat, _kDLCUDA import enum @@ -12,6 +14,27 @@ import numpy from cuda.core._memoryview import StridedMemoryView +cdef extern from "_cpp/tensor_map_cccl.h": + int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t 
dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) nogil + try: from ml_dtypes import bfloat16 as ml_bfloat16 @@ -80,14 +103,25 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA -class TensorMapIm2ColWideMode(enum.IntEnum): - """Im2col wide mode for tensor map descriptors. +IF CUDA_CORE_BUILD_MAJOR >= 13: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. - These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. - Supported on compute capability 10.0+. - """ - W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W - W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. + Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 +ELSE: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + This enum is always defined for API stability, but the + :meth:`TensorMapDescriptor.from_im2col_wide` factory requires a CUDA 13+ + build and will raise otherwise. 
+ """ + W = 0 + W128 = 1 # Mapping from numpy dtype to TMA data type @@ -150,6 +184,65 @@ def _resolve_data_type(view, data_type): return tma_dt +cdef inline bint _tma_dtype_to_dlpack( + object tma_dt, + uint8_t* out_code, + uint8_t* out_bits, + uint16_t* out_lanes, +) noexcept: + if tma_dt == TensorMapDataType.UINT8: + out_code[0] = kDLUInt + out_bits[0] = 8 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT16: + out_code[0] = kDLUInt + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT32: + out_code[0] = kDLUInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT64: + out_code[0] = kDLUInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT32: + out_code[0] = kDLInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT64: + out_code[0] = kDLInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT16: + out_code[0] = kDLFloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT32: + out_code[0] = kDLFloat + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT64: + out_code[0] = kDLFloat + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.BFLOAT16: + out_code[0] = kDLBfloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + return False + + def _get_validated_view(tensor): """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" if isinstance(tensor, StridedMemoryView): @@ -268,7 +361,11 @@ cdef class TensorMapDescriptor: cdef TensorMapDescriptor desc = cls.__new__(cls) view = _get_validated_view(tensor) + # Keep both the original tensor object and the validated view alive. + # For DLPack exporters, the view may hold the owning capsule whose + # deleter can free the backing allocation when released. 
desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -292,8 +389,92 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") + cdef bint elem_strides_provided = element_strides is not None element_strides = _validate_element_strides(element_strides, rank) + # Reuse CCCL/libcu++'s DLPack -> CUtensorMap conversion when possible. + # This avoids maintaining a second, independent validation/encoding implementation. + cdef uint8_t dl_code + cdef uint8_t dl_bits + cdef uint16_t dl_lanes + cdef int64_t c_shape[5] + cdef int64_t c_strides[5] + cdef int c_box_sizes[5] + cdef int c_elem_strides[5] + cdef const int64_t* c_strides_ptr + cdef const int* c_elem_strides_ptr + cdef char errbuf[512] + cdef int i_cccl + cdef int device_type + cdef int c_device_id + cdef int c_cccl_interleave_int + cdef int c_cccl_swizzle_int + cdef int c_cccl_l2_promotion_int + cdef int c_cccl_oob_fill_int + cdef int rc + if _tma_dtype_to_dlpack(tma_dt, &dl_code, &dl_bits, &dl_lanes): + c_strides_ptr = NULL + c_elem_strides_ptr = NULL + errbuf[0] = 0 + + for i_cccl in range(rank): + c_shape[i_cccl] = shape[i_cccl] + c_box_sizes[i_cccl] = box_dim[i_cccl] + if elem_strides_provided: + c_elem_strides[i_cccl] = element_strides[i_cccl] + + if view.strides is not None: + for i_cccl in range(rank): + c_strides[i_cccl] = view.strides[i_cccl] + c_strides_ptr = &c_strides[0] + + if elem_strides_provided: + c_elem_strides_ptr = &c_elem_strides[0] + + device_type = _kDLCUDA + c_device_id = view.device_id + c_cccl_interleave_int = int(interleave) + c_cccl_swizzle_int = int(swizzle) + c_cccl_l2_promotion_int = int(l2_promotion) + c_cccl_oob_fill_int = int(oob_fill) + + with nogil: + rc = cuda_core_cccl_make_tma_descriptor_tiled( + &desc._tensor_map, + global_address, + device_type, + c_device_id, + rank, + &c_shape[0], + c_strides_ptr, + dl_code, + dl_bits, + dl_lanes, + &c_box_sizes[0], 
+ c_elem_strides_ptr, + c_cccl_interleave_int, + c_cccl_swizzle_int, + c_cccl_l2_promotion_int, + c_cccl_oob_fill_int, + &errbuf[0], + sizeof(errbuf), + ) + + if rc == 0: + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc + + msg = errbuf[:].split(b"\0", 1)[0].decode("utf-8", errors="replace") + # If CCCL isn't available at build time, fall back to the direct + # driver API path to preserve functionality on older toolchains. + if "not available at build time" not in msg: + raise ValueError(f"Failed to build TMA descriptor via CCCL: {msg}") + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] byte_strides = _compute_byte_strides(shape, view.strides, elem_size) @@ -411,6 +592,7 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -447,6 +629,10 @@ cdef class TensorMapDescriptor: cdef int[3] c_pixel_box_upper cdef int i_c + for i_c in range(3): + c_pixel_box_lower[i_c] = 0 + c_pixel_box_upper[i_c] = 0 + for i_c in range(rank): c_global_dim[i_c] = shape[rank - 1 - i_c] c_element_strides[i_c] = element_strides[rank - 1 - i_c] @@ -555,85 +741,90 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = cls.__new__(cls) - - view = _get_validated_view(tensor) - desc._source_ref = tensor - - tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) - cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - - cdef intptr_t global_address = view.ptr - shape = view.shape - - cdef int rank = len(shape) - if rank < 3 or rank > 5: - raise ValueError( - f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - - element_strides = _validate_element_strides(element_strides, rank) - - cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - byte_strides = _compute_byte_strides(shape, view.strides, elem_size) - - # Reverse all dimension arrays for column-major convention - cdef uint64_t[5] c_global_dim - cdef uint64_t[4] c_global_strides - cdef uint32_t[5] c_element_strides - cdef int i_c - - for i_c in range(rank): - c_global_dim[i_c] = shape[rank - 1 - i_c] - c_element_strides[i_c] = element_strides[rank - 1 - i_c] + IF CUDA_CORE_BUILD_MAJOR < 13: + raise RuntimeError( + "TensorMapDescriptor.from_im2col_wide requires a CUDA 13+ build") + ELSE: + cdef TensorMapDescriptor desc = cls.__new__(cls) - for i_c in range(rank - 1): - c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + view = _get_validated_view(tensor) + desc._source_ref = tensor + desc._view_ref = view - cdef uint32_t c_rank = rank - cdef int c_lower_w = pixel_box_lower_corner_width - cdef int c_upper_w = pixel_box_upper_corner_width - cdef uint32_t c_channels = channels_per_pixel - cdef uint32_t c_pixels = pixels_per_column - cdef int c_interleave_int = int(interleave) - cdef int c_mode_int = int(mode) - cdef int c_swizzle_int = int(swizzle) - cdef int c_l2_promotion_int = int(l2_promotion) - cdef int c_oob_fill_int = int(oob_fill) - cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int - cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int - cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int - cdef 
cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int - cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - with nogil: - HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( - &desc._tensor_map, - c_data_type, - c_rank, - global_address, - c_global_dim, - c_global_strides, - c_lower_w, - c_upper_w, - c_channels, - c_pixels, - c_element_strides, - c_interleave, - c_mode, - c_swizzle, - c_l2_promotion, - c_oob_fill, - )) + cdef intptr_t global_address = view.ptr + shape = view.shape - desc._repr_info = { - "method": "im2col_wide", - "rank": rank, - "data_type": tma_dt, - "swizzle": swizzle, - } - - return desc + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + element_strides = _validate_element_strides(element_strides, rank) + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef 
cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc def replace_address(self, tensor): """Replace the global memory address in this tensor map descriptor. @@ -662,6 +853,7 @@ cdef class TensorMapDescriptor: # so we don't drop the old tensor (risking a dangling pointer in the # CUtensorMap struct) if the call fails. 
self._source_ref = tensor + self._view_ref = view def __repr__(self): info = self._repr_info diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 21fb91c254..7f11cf551c 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,6 +30,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index c85c5d96bd..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,6 +25,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 8b5e27ce6f..4db456d863 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -3,6 +3,7 @@ import numpy as np import pytest + from cuda.core import ( Device, TensorMapDataType, @@ -128,6 +129,29 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_with_element_strides_buffer(self, dev, skip_if_no_tma): + # Use a Buffer input (DLPack path) and explicit element_strides. + buf = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(2,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): + # Use a CAI-style tensor wrapper and explicit element_strides. 
+ buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(32, 32), + element_strides=(2, 1), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) @@ -250,9 +274,10 @@ def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): def test_bfloat16_mapping(self): try: - from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA from ml_dtypes import bfloat16 + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 1a6b41608658499486fa5705eaea93551425442b Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 08:00:36 -0500 Subject: [PATCH 11/25] Skip im2col-wide TensorMap tests when runtime support is unavailable. Probe support in the fixture and skip when cuda.core is built without CUDA 13 im2col-wide support or when the driver/GPU reports CUDA_ERROR_INVALID_VALUE, so unsupported RTXPRO6000 lanes don't block unrelated changes. 
Made-with: Cursor --- cuda_core/tests/test_tensor_map.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 4db456d863..2d6dc138c0 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -395,6 +395,31 @@ def skip_if_no_im2col_wide(self, dev): if cc.major < 10: pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + # Some environments in CI exercise this test module with a cuda.core + # build that does not include im2col-wide symbols (CUDA < 13 build), + # or with driver/GPU combinations that reject im2col-wide descriptor + # encoding for otherwise valid inputs. Probe once per test invocation + # and skip only for those known unsupported cases. + buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) + try: + TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + except RuntimeError as e: + if "requires a CUDA 13+ build" in str(e): + pytest.skip("Im2col-wide requires cuda.core built with CUDA 13+") + raise + except Exception as e: + if "CUDA_ERROR_INVALID_VALUE" in str(e): + pytest.skip("Im2col-wide unsupported on this driver/GPU combination") + raise + def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) From 892ee605e500b2e9225f7077bd66e61e1fc4eb97 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:41:46 -0500 Subject: [PATCH 12/25] Align TensorMap API surface with review feedback and enforce context safety. 
Expose only TensorMapDescriptor in cuda.core, add StridedMemoryView.as_tensor_map(), remove redundant tensor-map fallback packing, and track/check descriptor context/device compatibility before replacement and kernel launch argument packing. Made-with: Cursor --- .gitignore | 1 - cuda_core/cuda/core/__init__.py | 10 +---- cuda_core/cuda/core/_kernel_arg_handler.pyx | 4 +- cuda_core/cuda/core/_memoryview.pyx | 33 ++++++++++++++ cuda_core/cuda/core/_tensor_map.pxd | 4 ++ cuda_core/cuda/core/_tensor_map.pyx | 48 +++++++++++++++++++++ cuda_core/examples/tma_replace_address.py | 4 +- cuda_core/examples/tma_tensor_map.py | 11 ++--- cuda_core/tests/test_tensor_map.py | 15 ++++++- 9 files changed, 107 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 18c919ca5e..7d9bcd38c3 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp -!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..c32ff79ecc 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,12 +68,4 @@ Stream, StreamOptions, ) -from cuda.core._tensor_map import ( - TensorMapDataType, - TensorMapDescriptor, - TensorMapIm2ColWideMode, - TensorMapInterleave, - TensorMapL2Promotion, - TensorMapOOBFill, - TensorMapSwizzle, -) +from cuda.core._tensor_map import TensorMapDescriptor diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 5619ba7492..28a981fed2 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,6 +135,7 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + arg._check_context_compat() # Allocate a temporary buffer for 
the 128-byte CUtensorMap struct. # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. @@ -350,9 +351,6 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue - elif isinstance(arg, tensor_map_descriptor_type): - prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) - continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 0e1df726c0..78002b3ff1 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -316,6 +316,39 @@ cdef class StridedMemoryView: view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly) return view + def as_tensor_map( + self, + box_dim, + *, + element_strides=None, + data_type=None, + interleave=None, + swizzle=None, + l2_promotion=None, + oob_fill=None, + ): + """Create a tiled :obj:`TensorMapDescriptor` from this view. + + This is a convenience wrapper around + :meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`. 
+ """ + from cuda.core._tensor_map import TensorMapDescriptor + + kwargs = {} + if element_strides is not None: + kwargs["element_strides"] = element_strides + if data_type is not None: + kwargs["data_type"] = data_type + if interleave is not None: + kwargs["interleave"] = interleave + if swizzle is not None: + kwargs["swizzle"] = swizzle + if l2_promotion is not None: + kwargs["l2_promotion"] = l2_promotion + if oob_fill is not None: + kwargs["oob_fill"] = oob_fill + return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs) + def copy_from( self, other : StridedMemoryView, stream : Stream, allocator = None, diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 07e8adebc9..4c60b7fc70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -3,12 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from libc.stdint cimport intptr_t cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map + cdef int _device_id + cdef intptr_t _context cdef object _source_ref cdef object _view_ref cdef object _repr_info + cdef int _check_context_compat(self) except -1 cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6054add9b6..9d83a709cc 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -263,6 +263,22 @@ def _get_validated_view(tensor): return view +cdef inline intptr_t _get_current_context_ptr() except? 
0: + cdef cydriver.CUcontext ctx + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + if ctx == NULL: + raise RuntimeError("TensorMapDescriptor requires an active CUDA context") + return ctx + + +cdef inline int _get_current_device_id() except -1: + cdef cydriver.CUdevice dev + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + return dev + + def _compute_byte_strides(shape, strides, elem_size): """Compute byte strides from element strides or C-contiguous fallback. @@ -313,6 +329,28 @@ cdef class TensorMapDescriptor: cdef void* _get_data_ptr(self): return &self._tensor_map + cdef int _check_context_compat(self) except -1: + cdef cydriver.CUcontext current_ctx + cdef cydriver.CUdevice current_dev + cdef int current_dev_id + if self._context == 0 and self._device_id < 0: + return 0 + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(¤t_ctx)) + if current_ctx == NULL: + raise RuntimeError("TensorMapDescriptor requires an active CUDA context") + if self._context != 0 and current_ctx != self._context: + raise RuntimeError( + "TensorMapDescriptor was created in a different CUDA context") + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevice(¤t_dev)) + current_dev_id = current_dev + if self._device_id >= 0 and current_dev_id != self._device_id: + raise RuntimeError( + f"TensorMapDescriptor belongs to device {self._device_id}, " + f"but current device is {current_dev_id}") + return 0 + @classmethod def from_tiled(cls, tensor, box_dim, *, element_strides=None, @@ -366,6 +404,8 @@ cdef class TensorMapDescriptor: # deleter can free the backing allocation when released. 
desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -593,6 +633,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -750,6 +792,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -839,7 +883,11 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" + self._check_context_compat() view = _get_validated_view(tensor) + if view.device_id != self._device_id: + raise ValueError( + f"replace_address expects tensor on device {self._device_id}, got {view.device_id}") cdef intptr_t global_address = view.ptr diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..a734301fd6 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -36,7 +36,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -159,7 +159,7 @@ output = cp.zeros(N, dtype=cp.float32) dev.sync() # cupy uses its own stream -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) n_tiles = N // TILE_SIZE config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..2a5ce9ad86 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -31,7 +31,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -48,8 +48,6 @@ sys.exit(0) dev.set_current() -arch_str = "".join(f"{i}" for i in arch) - # --------------------------------------------------------------------------- # CUDA kernel that uses TMA to load a 1-D tile into shared memory, then # copies the tile to an output buffer so we can verify correctness. 
@@ -141,7 +139,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") @@ -155,11 +153,10 @@ dev.sync() # cupy uses its own stream # --------------------------------------------------------------------------- -# 2) Create a TMA tiled descriptor -# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# 2) Create a TMA tiled descriptor from a StridedMemoryView. # The dtype (float32) is inferred automatically from the CuPy array. # --------------------------------------------------------------------------- -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) # --------------------------------------------------------------------------- # 3) Launch the kernel diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 2d6dc138c0..dee1f1e2e1 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -6,8 +6,11 @@ from cuda.core import ( Device, - TensorMapDataType, + StridedMemoryView, TensorMapDescriptor, +) +from cuda.core._tensor_map import ( + TensorMapDataType, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -107,6 +110,16 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + desc = view.as_tensor_map( + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) 
From 5a0e141ebf5b4d7a65818c20d4adc4648d37571f Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:54:32 -0500 Subject: [PATCH 13/25] Restore cu12 feature definitions in cuda_core pixi manifest. Bring back the cu12 feature blocks so pixi can parse the manifest and local test commands no longer fail early with a missing feature error. Made-with: Cursor --- cuda_core/pixi.toml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index e351fb63b6..9dc6ac1ed9 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,9 +75,16 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -# NOTE: cu12 environment is intentionally omitted because the path dependency -# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, -# use conda-forge packages or CI workflows. +[feature.cu12.system-requirements] +cuda = "12" + +[feature.cu12.dependencies] +cuda-version = "12.*" + +# We keep both cu12 and cu13 because cuda.core works with either major version +# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) +# For cu12 testing, use conda-forge packages: temporarily change path to wildcard +# or skip cu12 locally: pixi run -e cu13 test [environments] default = { features = [ "cu13", From eef1c7a1293b618d90c222bed5d7345e3d3f8fb5 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 10 Mar 2026 10:39:26 -0700 Subject: [PATCH 14/25] Handle TensorMap device validation by DLPack type Reject CUDA device-local tensors from a different GPU while still allowing CUDA host and managed memory. Add regression tests for descriptor creation, replace_address, and the shared validation helper. 
--- cuda_core/cuda/core/_tensor_map.pyx | 27 ++++-- cuda_core/tests/test_tensor_map.py | 123 ++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9d83a709cc..3b4d644e7b 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -263,6 +263,19 @@ def _get_validated_view(tensor): return view +def _require_view_device(view, expected_device_id, operation): + """Ensure device-local tensors match the current CUDA device. + + DLPack reports host/managed CUDA memory as ``kDLCUDAHost`` / + ``kDLCUDAManaged`` with ``device_id=0`` regardless of the current device, + so only true ``kDLCUDA`` tensors are rejected by device-id mismatch. + """ + device_type, device_id = view.__dlpack_device__() + if device_type == _kDLCUDA and device_id != expected_device_id: + raise ValueError( + f"{operation} expects tensor on device {expected_device_id}, got {device_id}") + + cdef inline intptr_t _get_current_context_ptr() except? 
0: cdef cydriver.CUcontext ctx with nogil: @@ -406,6 +419,7 @@ cdef class TensorMapDescriptor: desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() + _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_tiled") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -447,6 +461,8 @@ cdef class TensorMapDescriptor: cdef int i_cccl cdef int device_type cdef int c_device_id + cdef int dl_device_type + cdef int dl_device_id cdef int c_cccl_interleave_int cdef int c_cccl_swizzle_int cdef int c_cccl_l2_promotion_int @@ -471,8 +487,9 @@ cdef class TensorMapDescriptor: if elem_strides_provided: c_elem_strides_ptr = &c_elem_strides[0] - device_type = _kDLCUDA - c_device_id = view.device_id + dl_device_type, dl_device_id = view.__dlpack_device__() + device_type = dl_device_type + c_device_id = dl_device_id c_cccl_interleave_int = int(interleave) c_cccl_swizzle_int = int(swizzle) c_cccl_l2_promotion_int = int(l2_promotion) @@ -635,6 +652,7 @@ cdef class TensorMapDescriptor: desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() + _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -794,6 +812,7 @@ cdef class TensorMapDescriptor: desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() + _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col_wide") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -885,9 +904,7 @@ cdef class TensorMapDescriptor: """ self._check_context_compat() view = _get_validated_view(tensor) - if view.device_id != self._device_id: - raise ValueError( - f"replace_address expects tensor on device {self._device_id}, got {view.device_id}") + _require_view_device(view, self._device_id, 
"replace_address") cdef intptr_t global_address = view.ptr diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index dee1f1e2e1..a370558019 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -4,11 +4,15 @@ import numpy as np import pytest +from conftest import create_managed_memory_resource_or_skip, skip_if_managed_memory_unsupported from cuda.core import ( Device, + ManagedMemoryResourceOptions, StridedMemoryView, TensorMapDescriptor, + system, ) +from cuda.core._dlpack import DLDeviceType from cuda.core._tensor_map import ( TensorMapDataType, TensorMapIm2ColWideMode, @@ -16,6 +20,7 @@ TensorMapL2Promotion, TensorMapOOBFill, TensorMapSwizzle, + _require_view_device, ) @@ -48,6 +53,15 @@ def __init__(self, buf, shape, dtype=np.float32): } +class _MockTensorMapView: + def __init__(self, device_type, device_id): + self._device_type = device_type + self._device_id = device_id + + def __dlpack_device__(self): + return (self._device_type, self._device_id) + + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -323,6 +337,115 @@ def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): with pytest.raises(ValueError, match="device-accessible"): desc.replace_address(host_arr) + def test_replace_address_rejects_tensor_from_other_device(self, dev, skip_if_no_tma): + if system.get_num_devices() < 2: + pytest.skip("requires multi-GPU") + + dev0 = dev + dev1 = Device(1) + + dev0.set_current() + buf0 = dev0.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf0, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + dev1.set_current() + buf1 = dev1.allocate(1024 * 4) + dev0.set_current() + + with pytest.raises(ValueError, match=r"replace_address expects tensor on device 0, got 1"): + desc.replace_address(buf1) + + def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cuda): + if system.get_num_devices() < 
2: + pytest.skip("requires multi-GPU") + + dev1 = Device(1) + if not dev1.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + skip_if_managed_memory_unsupported(dev1) + + dev1.set_current() + desc = TensorMapDescriptor.from_tiled( + dev1.allocate(1024 * 4), + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + mr = create_managed_memory_resource_or_skip( + ManagedMemoryResourceOptions(preferred_location=dev1.device_id) + ) + managed_buf = mr.allocate(1024 * 4) + + desc.replace_address(managed_buf) + + +class TestTensorMapMultiDeviceValidation: + """Test multi-device validation for descriptor creation.""" + + def test_from_tiled_rejects_tensor_from_other_device(self, init_cuda): + if system.get_num_devices() < 2: + pytest.skip("requires multi-GPU") + + dev0 = Device(0) + dev1 = Device(1) + + dev1.set_current() + buf1 = dev1.allocate(1024 * 4) + dev0.set_current() + + with pytest.raises( + ValueError, + match=r"TensorMapDescriptor\.from_tiled expects tensor on device 0, got 1", + ): + TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda): + if system.get_num_devices() < 2: + pytest.skip("requires multi-GPU") + + dev1 = Device(1) + if not dev1.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + skip_if_managed_memory_unsupported(dev1) + + dev1.set_current() + mr = create_managed_memory_resource_or_skip( + ManagedMemoryResourceOptions(preferred_location=dev1.device_id) + ) + managed_buf = mr.allocate(1024 * 4) + + desc = TensorMapDescriptor.from_tiled( + managed_buf, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + +class TestTensorMapDeviceValidation: + """Test device validation behavior for tensor-map-compatible views.""" + + def 
test_require_view_device_accepts_same_cuda_device(self): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDA, 1), 1, "op") + + def test_require_view_device_rejects_different_cuda_device(self): + with pytest.raises(ValueError, match=r"op expects tensor on device 0, got 1"): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDA, 1), 0, "op") + + def test_require_view_device_allows_cuda_host_memory(self): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAHost, 0), 1, "op") + + def test_require_view_device_allows_cuda_managed_memory(self): + _require_view_device(_MockTensorMapView(DLDeviceType.kDLCUDAManaged, 0), 1, "op") + class TestTensorMapIm2col: """Test im2col TMA descriptor creation.""" From d6c311aa9b62deb00370b102f9d7ce3909e5e1df Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 11 Mar 2026 09:26:26 -0700 Subject: [PATCH 15/25] formatting change --- cuda_core/tests/test_tensor_map.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index a370558019..b50067c9a3 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -375,9 +375,7 @@ def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cud data_type=TensorMapDataType.FLOAT32, ) - mr = create_managed_memory_resource_or_skip( - ManagedMemoryResourceOptions(preferred_location=dev1.device_id) - ) + mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id)) managed_buf = mr.allocate(1024 * 4) desc.replace_address(managed_buf) @@ -417,9 +415,7 @@ def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda): skip_if_managed_memory_unsupported(dev1) dev1.set_current() - mr = create_managed_memory_resource_or_skip( - ManagedMemoryResourceOptions(preferred_location=dev1.device_id) - ) + mr = 
create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id)) managed_buf = mr.allocate(1024 * 4) desc = TensorMapDescriptor.from_tiled( From 9673bcf9c777031ed575e74b73cd4146ab3acd1c Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 13 Mar 2026 14:20:02 -0700 Subject: [PATCH 16/25] Update cuda_core/cuda/core/_cpp/tensor_map_cccl.h Co-authored-by: Leo Fang --- cuda_core/cuda/core/_cpp/tensor_map_cccl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/tensor_map_cccl.h b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h index 71be425182..37f0e0fd8d 100644 --- a/cuda_core/cuda/core/_cpp/tensor_map_cccl.h +++ b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h @@ -5,11 +5,13 @@ #ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_ #define CUDA_CORE_TENSOR_MAP_CCCL_H_ -#include -#include - #ifdef __cplusplus +#include +#include extern "C" { +#else +#include +#include #endif // Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from ). 
From ae86192e05236fb77f039cd6665415ca01135ac1 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 13 Mar 2026 18:16:46 -0700 Subject: [PATCH 17/25] Update cuda_core/examples/tma_replace_address.py Co-authored-by: Leo Fang --- cuda_core/examples/tma_replace_address.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index a734301fd6..fd19aeca31 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -146,7 +146,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") From 232b621556492a6e2c2b8b8d28a07aa1798444a4 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 13 Mar 2026 18:17:02 -0700 Subject: [PATCH 18/25] Update cuda_core/cuda/core/__init__.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index c32ff79ecc..139078e86e 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,4 +68,4 @@ Stream, StreamOptions, ) -from cuda.core._tensor_map import TensorMapDescriptor +from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions From 358d9753685637faa0190e8baf4b6e33d0fa476a Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 08:43:11 -0400 Subject: [PATCH 19/25] Align TensorMap creation and launch behavior with the latest review guidance. Keep the public TMA entry point on StridedMemoryView and remove avoidable launch/build overhead so the reviewed API stays smaller without regressing local CUDA builds. 
Made-with: Cursor --- cuda_core/cuda/core/_cpp/tensor_map.cpp | 7 +- cuda_core/cuda/core/_kernel_arg_handler.pyx | 22 +--- cuda_core/cuda/core/_memoryview.pyx | 6 +- cuda_core/cuda/core/_tensor_map.pyx | 120 ++++++++++++-------- cuda_core/tests/test_tensor_map.py | 106 +++++++++-------- 5 files changed, 139 insertions(+), 122 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/tensor_map.cpp b/cuda_core/cuda/core/_cpp/tensor_map.cpp index af09aa1b2f..df3f7654e5 100644 --- a/cuda_core/cuda/core/_cpp/tensor_map.cpp +++ b/cuda_core/cuda/core/_cpp/tensor_map.cpp @@ -10,13 +10,18 @@ #include #if defined(__has_include) +// Older CTK releases do not ship . When it is unavailable we keep +// the CCCL helper compiled out and fall back to the direct driver path. # if __has_include() # include # define CUDA_CORE_HAS_CUDA_TMA 1 # else # define CUDA_CORE_HAS_CUDA_TMA 0 # endif -# if __has_include() +# if __has_include("dlpack.h") +# include "dlpack.h" +# define CUDA_CORE_HAS_DLPACK_H 1 +# elif __has_include() # include # define CUDA_CORE_HAS_DLPACK_H 1 # else diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 28a981fed2..35eea2de47 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -6,7 +6,6 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdint cimport (intptr_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t,) -from libc.string cimport memcpy from libcpp cimport bool as cpp_bool from libcpp.complex cimport complex as cpp_complex from libcpp cimport nullptr @@ -135,18 +134,9 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: - arg._check_context_compat() - # Allocate a temporary buffer for the 128-byte CUtensorMap struct. 
- # We copy rather than pointing directly at arg._tensor_map for lifetime - # safety: ParamHolder owns and frees its argument buffers independently. - cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) - if ptr is NULL: - raise MemoryError("Failed to allocate memory for CUtensorMap") - memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) - # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, - # data_addresses[idx] is the pointer passed to cuLaunchKernel. - data_addresses[idx] = ptr - data[idx] = ptr + # cuLaunchKernel copies argument bytes during launch, so a TensorMap + # descriptor can point directly at its internal CUtensorMap storage. + data_addresses[idx] = arg._get_data_ptr() return 0 @@ -299,9 +289,6 @@ cdef class ParamHolder: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) continue - elif arg_type is tensor_map_descriptor_type: - prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) - continue elif arg_type is bool: prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i) continue @@ -319,6 +306,9 @@ cdef class ParamHolder: elif arg_type is complex: prepare_arg[cpp_double_complex](self.data, self.data_addresses, arg, i) continue + elif arg_type is tensor_map_descriptor_type: + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i) if not_prepared: diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 78002b3ff1..e34aca424d 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -329,8 +329,8 @@ cdef class StridedMemoryView: ): """Create a tiled :obj:`TensorMapDescriptor` from this view. - This is a convenience wrapper around - :meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`. + This is the public entry point for creating tiled tensor map + descriptors in ``cuda.core``. 
""" from cuda.core._tensor_map import TensorMapDescriptor @@ -347,7 +347,7 @@ cdef class StridedMemoryView: kwargs["l2_promotion"] = l2_promotion if oob_fill is not None: kwargs["oob_fill"] = oob_fill - return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs) + return TensorMapDescriptor._from_tiled(self, box_dim, **kwargs) def copy_from( self, other : StridedMemoryView, stream : Stream, diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 3b4d644e7b..7ed69bf722 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -117,7 +117,7 @@ ELSE: """Im2col wide mode for tensor map descriptors. This enum is always defined for API stability, but the - :meth:`TensorMapDescriptor.from_im2col_wide` factory requires a CUDA 13+ + :meth:`TensorMapDescriptor._from_im2col_wide` factory requires a CUDA 13+ build and will raise otherwise. """ W = 0 @@ -163,10 +163,20 @@ def _resolve_data_type(view, data_type): """Resolve the TMA data type from an explicit value or the view's dtype.""" if data_type is not None: - if not isinstance(data_type, TensorMapDataType): + if isinstance(data_type, TensorMapDataType): + return data_type + try: + dt = numpy.dtype(data_type) + except TypeError as e: raise TypeError( - f"data_type must be a TensorMapDataType, got {type(data_type)}") - return data_type + "data_type must be a TensorMapDataType or a numpy/ml_dtypes dtype, " + f"got {type(data_type)}") from e + tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) + if tma_dt is None: + raise ValueError( + f"Unsupported dtype {dt} for TMA; " + f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}.") + return tma_dt dt = view.dtype if dt is None: @@ -243,15 +253,7 @@ cdef inline bint _tma_dtype_to_dlpack( return False -def _get_validated_view(tensor): - """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - # stream_ptr=-1: no stream 
synchronization needed because descriptor - # creation only reads tensor metadata, it does not move data. - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - +cdef inline int _validate_tensor_map_view(view) except -1: if not view.is_device_accessible: raise ValueError("The tensor must be device-accessible") @@ -259,7 +261,18 @@ def _get_validated_view(tensor): raise ValueError( f"Global memory address must be 16-byte aligned, " f"got address 0x{view.ptr:x}") + return 0 + +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + # stream_ptr=-1: no stream synchronization needed because descriptor + # creation only reads tensor metadata, it does not move data. + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + _validate_tensor_map_view(view) return view @@ -292,6 +305,17 @@ cdef inline int _get_current_device_id() except -1: return dev +cdef inline int _require_view_device( + view, + int device_id, + object caller, +) except -1: + if view.device_id != device_id: + raise ValueError( + f"{caller} expects tensor on device {device_id}, got {view.device_id}") + return 0 + + def _compute_byte_strides(shape, strides, elem_size): """Compute byte strides from element strides or C-contiguous fallback. @@ -328,16 +352,17 @@ cdef class TensorMapDescriptor: used by the hardware TMA unit for efficient bulk data movement between global and shared memory. - Instances are created via the class methods :meth:`from_tiled` and - :meth:`from_im2col`, and can be passed directly to - :func:`~cuda.core.launch` as a kernel argument. + Public tiled descriptors are created via + :meth:`cuda.core.StridedMemoryView.as_tensor_map`. Specialized + ``_from_*`` helpers remain private while this API surface settles, and + descriptors can be passed directly to :func:`~cuda.core.launch` as a + kernel argument. 
""" def __init__(self): raise RuntimeError( "TensorMapDescriptor cannot be instantiated directly. " - "Use TensorMapDescriptor.from_tiled() or " - "TensorMapDescriptor.from_im2col().") + "Use StridedMemoryView.as_tensor_map() instead.") cdef void* _get_data_ptr(self): return &self._tensor_map @@ -364,22 +389,27 @@ cdef class TensorMapDescriptor: f"but current device is {current_dev_id}") return 0 + @property + def device(self): + """Return the :obj:`~cuda.core.Device` associated with this descriptor.""" + if self._device_id >= 0: + from cuda.core._device import Device + return Device(self._device_id) + @classmethod - def from_tiled(cls, tensor, box_dim, *, + def _from_tiled(cls, view, box_dim, *, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, swizzle=TensorMapSwizzle.NONE, l2_promotion=TensorMapL2Promotion.NONE, oob_fill=TensorMapOOBFill.NONE): - """Create a tiled TMA descriptor from a tensor object. + """Create a tiled TMA descriptor from a validated view. Parameters ---------- - tensor : object - Any object supporting DLPack or ``__cuda_array_interface__``, - or a :obj:`~cuda.core.StridedMemoryView`. Must refer to - device-accessible memory with a 16-byte-aligned pointer. + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. box_dim : tuple of int The size of each tile dimension (in elements). Must have the same rank as the tensor and each value must be in [1, 256]. @@ -411,15 +441,15 @@ cdef class TensorMapDescriptor: """ cdef TensorMapDescriptor desc = cls.__new__(cls) - view = _get_validated_view(tensor) + _validate_tensor_map_view(view) # Keep both the original tensor object and the validated view alive. # For DLPack exporters, the view may hold the owning capsule whose # deleter can free the backing allocation when released. 
- desc._source_ref = tensor + desc._source_ref = view.exporting_obj desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() - _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_tiled") + _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_tiled") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -591,7 +621,7 @@ cdef class TensorMapDescriptor: return desc @classmethod - def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner, + def _from_im2col(cls, view, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -599,16 +629,14 @@ cdef class TensorMapDescriptor: swizzle=TensorMapSwizzle.NONE, l2_promotion=TensorMapL2Promotion.NONE, oob_fill=TensorMapOOBFill.NONE): - """Create an im2col TMA descriptor from a tensor object. + """Create an im2col TMA descriptor from a validated view. Im2col layout is used for convolution-style data access patterns. Parameters ---------- - tensor : object - Any object supporting DLPack or ``__cuda_array_interface__``, - or a :obj:`~cuda.core.StridedMemoryView`. Must refer to - device-accessible memory with a 16-byte-aligned pointer. + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. pixel_box_lower_corner : tuple of int Lower corner of the pixel bounding box for each spatial dimension (rank - 2 elements). 
Specified in row-major order @@ -647,12 +675,12 @@ cdef class TensorMapDescriptor: """ cdef TensorMapDescriptor desc = cls.__new__(cls) - view = _get_validated_view(tensor) - desc._source_ref = tensor + _validate_tensor_map_view(view) + desc._source_ref = view.exporting_obj desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() - _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col") + _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -746,7 +774,7 @@ cdef class TensorMapDescriptor: return desc @classmethod - def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + def _from_im2col_wide(cls, view, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -755,7 +783,7 @@ cdef class TensorMapDescriptor: swizzle=TensorMapSwizzle.SWIZZLE_128B, l2_promotion=TensorMapL2Promotion.NONE, oob_fill=TensorMapOOBFill.NONE): - """Create an im2col-wide TMA descriptor from a tensor object. + """Create an im2col-wide TMA descriptor from a validated view. Im2col-wide layout loads elements exclusively along the W (width) dimension. This variant is supported on compute capability 10.0+ @@ -763,10 +791,8 @@ cdef class TensorMapDescriptor: Parameters ---------- - tensor : object - Any object supporting DLPack or ``__cuda_array_interface__``, - or a :obj:`~cuda.core.StridedMemoryView`. Must refer to - device-accessible memory with a 16-byte-aligned pointer. + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. pixel_box_lower_corner_width : int Lower corner of the pixel bounding box along the W dimension. 
pixel_box_upper_corner_width : int @@ -803,16 +829,16 @@ cdef class TensorMapDescriptor: """ IF CUDA_CORE_BUILD_MAJOR < 13: raise RuntimeError( - "TensorMapDescriptor.from_im2col_wide requires a CUDA 13+ build") + "TensorMapDescriptor._from_im2col_wide requires a CUDA 13+ build") ELSE: cdef TensorMapDescriptor desc = cls.__new__(cls) - view = _get_validated_view(tensor) - desc._source_ref = tensor + _validate_tensor_map_view(view) + desc._source_ref = view.exporting_obj desc._view_ref = view desc._context = _get_current_context_ptr() desc._device_id = _get_current_device_id() - _require_view_device(view, desc._device_id, "TensorMapDescriptor.from_im2col_wide") + _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col_wide") tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -917,7 +943,7 @@ cdef class TensorMapDescriptor: # Update the source reference only after the driver call succeeds, # so we don't drop the old tensor (risking a dangling pointer in the # CUtensorMap struct) if the call fails. 
- self._source_ref = tensor + self._source_ref = view.exporting_obj self._view_ref = view def __repr__(self): diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index b50067c9a3..099140354c 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -61,6 +61,11 @@ def __init__(self, device_type, device_id): def __dlpack_device__(self): return (self._device_type, self._device_id) +def _as_view(obj): + if isinstance(obj, StridedMemoryView): + return obj + return StridedMemoryView.from_any_interface(obj, stream_ptr=-1) + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -106,19 +111,25 @@ def test_cannot_instantiate_directly(self): def test_from_tiled_1d(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) # 1024 float32 elements - desc = TensorMapDescriptor.from_tiled( - buf, + desc = _as_view(buf).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) assert desc is not None assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)" + def test_device_property(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + desc = _as_view(buf).as_tensor_map( + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc.device.device_id == dev.device_id + def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -137,8 +148,7 @@ def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(8, 8, 8), 
data_type=TensorMapDataType.FLOAT32, ) @@ -150,8 +160,7 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): n_bytes = 2 * 4 * 4 * 4 * 8 * 4 # float32 buf = dev.allocate(n_bytes) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(1, 2, 2, 2, 8), ) assert desc is not None @@ -159,8 +168,7 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): def test_from_tiled_with_element_strides_buffer(self, dev, skip_if_no_tma): # Use a Buffer input (DLPack path) and explicit element_strides. buf = dev.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf, + desc = _as_view(buf).as_tensor_map( box_dim=(64,), element_strides=(2,), data_type=TensorMapDataType.FLOAT32, @@ -171,8 +179,7 @@ def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): # Use a CAI-style tensor wrapper and explicit element_strides. buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), element_strides=(2, 1), data_type=TensorMapDataType.FLOAT32, @@ -182,8 +189,7 @@ def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, swizzle=TensorMapSwizzle.SWIZZLE_128B, @@ -193,8 +199,7 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, l2_promotion=TensorMapL2Promotion.L2_128B, @@ -204,8 +209,7 @@ def 
test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) - desc = TensorMapDescriptor.from_tiled( - tensor, + desc = _as_view(tensor).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, @@ -220,8 +224,7 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma): buf = dev.allocate(64) tensor = _DeviceArray(buf, ()) # 0-dim tensor with pytest.raises(ValueError, match="rank must be between 1 and 5"): - TensorMapDescriptor.from_tiled( - tensor, + _as_view(tensor).as_tensor_map( box_dim=(), data_type=TensorMapDataType.FLOAT32, ) @@ -234,16 +237,14 @@ def test_invalid_rank_six(self, dev, skip_if_no_tma): buf = dev.allocate(n_elements * 4) arr = _DeviceArray(buf, shape) with pytest.raises(ValueError, match="rank must be between 1 and 5"): - TensorMapDescriptor.from_tiled( - arr, + _as_view(arr).as_tensor_map( box_dim=(2,) * 6, ) def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="box_dim must have 1 elements"): - TensorMapDescriptor.from_tiled( - buf, + _as_view(buf).as_tensor_map( box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -251,8 +252,7 @@ def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): def test_box_dim_out_of_range(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"): - TensorMapDescriptor.from_tiled( - buf, + _as_view(buf).as_tensor_map( box_dim=(512,), data_type=TensorMapDataType.FLOAT32, ) @@ -260,8 +260,7 @@ def test_box_dim_out_of_range(self, dev, skip_if_no_tma): def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="element_strides must have 1 elements"): - TensorMapDescriptor.from_tiled( - buf, + 
_as_view(buf).as_tensor_map( box_dim=(64,), element_strides=(1, 1), data_type=TensorMapDataType.FLOAT32, @@ -269,9 +268,8 @@ def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): def test_invalid_data_type(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) - with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"): - TensorMapDescriptor.from_tiled( - buf, + with pytest.raises(TypeError, match="data_type must be"): + _as_view(buf).as_tensor_map( box_dim=(64,), data_type=42, ) @@ -315,8 +313,7 @@ class TestTensorMapReplaceAddress: def test_replace_address(self, dev, skip_if_no_tma): buf1 = dev.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf1, + desc = _as_view(buf1).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -327,8 +324,7 @@ def test_replace_address(self, dev, skip_if_no_tma): def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): buf1 = dev.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf1, + desc = _as_view(buf1).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -450,8 +446,8 @@ def test_from_im2col_3d(self, dev, skip_if_no_tma): # 3D tensor: batch=1, height=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) - desc = TensorMapDescriptor.from_im2col( - tensor, + desc = TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0,), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -463,8 +459,8 @@ def test_from_im2col_3d(self, dev, skip_if_no_tma): def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"): - TensorMapDescriptor.from_im2col( - buf, + TensorMapDescriptor._from_im2col( + _as_view(buf), pixel_box_lower_corner=(), pixel_box_upper_corner=(), channels_per_pixel=64, @@ -476,8 +472,8 @@ def 
test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) # 3D: n_spatial = 1 with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): - TensorMapDescriptor.from_im2col( - tensor, + TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -492,8 +488,8 @@ def test_from_im2col_4d(self, dev, skip_if_no_tma): shape = (1, 8, 8, 64) buf = dev.allocate(1 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col( - tensor, + desc = TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4, 4), channels_per_pixel=64, @@ -508,8 +504,8 @@ def test_from_im2col_5d(self, dev, skip_if_no_tma): shape = (1, 4, 8, 8, 64) buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col( - tensor, + desc = TensorMapDescriptor._from_im2col( + _as_view(tensor), pixel_box_lower_corner=(0, 0, 0), pixel_box_upper_corner=(2, 4, 4), channels_per_pixel=64, @@ -535,8 +531,8 @@ def skip_if_no_im2col_wide(self, dev): buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) try: - TensorMapDescriptor.from_im2col_wide( - tensor, + TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -556,8 +552,8 @@ def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) tensor = _DeviceArray(buf, (1, 32, 64)) - desc = TensorMapDescriptor.from_im2col_wide( - tensor, + desc = TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -572,8 +568,8 @@ def test_from_im2col_wide_4d(self, dev, 
skip_if_no_im2col_wide): shape = (1, 8, 8, 64) buf = dev.allocate(1 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col_wide( - tensor, + desc = TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -587,8 +583,8 @@ def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): shape = (1, 4, 8, 8, 64) buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) tensor = _DeviceArray(buf, shape) - desc = TensorMapDescriptor.from_im2col_wide( - tensor, + desc = TensorMapDescriptor._from_im2col_wide( + _as_view(tensor), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, @@ -599,8 +595,8 @@ def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"): - TensorMapDescriptor.from_im2col_wide( - buf, + TensorMapDescriptor._from_im2col_wide( + _as_view(buf), pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, From e67e9d3e3dcc83c38ee98a4df43bd46d3d3224a8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 08:56:26 -0400 Subject: [PATCH 20/25] Consolidate the TMA examples around the libcudacxx wrappers. Keep the example surface smaller and closer to CUDA C++ by showing barrier/TMA helpers and replace_address() in one place instead of duplicating raw PTX snippets. 
Made-with: Cursor --- cuda_core/examples/tma_replace_address.py | 189 ---------------------- cuda_core/examples/tma_tensor_map.py | 167 +++++++++---------- 2 files changed, 79 insertions(+), 277 deletions(-) delete mode 100644 cuda_core/examples/tma_replace_address.py diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py deleted file mode 100644 index fd19aeca31..0000000000 --- a/cuda_core/examples/tma_replace_address.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -# ################################################################################ -# -# This example demonstrates how to use replace_address() to repoint a TMA -# (Tensor Memory Accelerator) descriptor at a different tensor without -# rebuilding the descriptor from scratch. -# -# The workflow is: -# -# 1. Create a TMA tiled descriptor and launch a kernel to verify it works -# 2. Allocate a second tensor with different content -# 3. Call replace_address() to repoint the same descriptor at the new tensor -# 4. Re-launch the kernel and verify it reads from the new tensor -# -# This is useful when the tensor layout (shape, dtype, tile size) stays the -# same but the underlying data buffer changes, e.g. double-buffering or -# iterating over a sequence of same-shaped tensors. 
-# -# Requirements: -# - Hopper or later GPU (compute capability >= 9.0) -# - CuPy -# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) -# -# ################################################################################ - -import sys - -import cupy as cp -import numpy as np - -from cuda.core import ( - Device, - LaunchConfig, - Program, - ProgramOptions, - StridedMemoryView, - launch, -) - -# --------------------------------------------------------------------------- -# Check for Hopper+ GPU -# --------------------------------------------------------------------------- -dev = Device() -arch = dev.compute_capability -if arch < (9, 0): - print( - "TMA requires compute capability >= 9.0 (Hopper or later)", - file=sys.stderr, - ) - sys.exit(0) -dev.set_current() - -arch_str = "".join(f"{i}" for i in arch) - -# --------------------------------------------------------------------------- -# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then -# copies the tile to an output buffer so we can verify correctness. -# -# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be -# compiled with NVRTC without pulling in the full driver-API header. -# -# Key points: -# - The tensor map is passed by value with __grid_constant__ so the TMA -# hardware can read it from grid-constant memory. -# - Thread 0 in each block issues the TMA load and manages the mbarrier. -# - All threads wait on the mbarrier, then copy from shared to global. -# --------------------------------------------------------------------------- -TILE_SIZE = 128 # elements per tile (must match the kernel constant) - -code = r""" -// Minimal definition of the 128-byte opaque tensor map struct. 
-struct __align__(64) TensorMap { unsigned long long opaque[16]; }; - -static constexpr int TILE_SIZE = 128; - -extern "C" -__global__ void tma_copy( - const __grid_constant__ TensorMap tensor_map, - float* output, - int N) -{ - __shared__ __align__(128) float smem[TILE_SIZE]; - __shared__ __align__(8) unsigned long long mbar; - - const int tid = threadIdx.x; - const int tile_start = blockIdx.x * TILE_SIZE; - - // ---- Thread 0: set up mbarrier and issue the TMA load ---- - if (tid == 0) - { - // Initialise a single-phase mbarrier (1 arriving thread). - asm volatile( - "mbarrier.init.shared.b64 [%0], 1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' - // from the tensor described by 'tensor_map' into shared memory. - asm volatile( - "cp.async.bulk.tensor.1d.shared::cluster.global.tile" - ".mbarrier::complete_tx::bytes" - " [%0], [%1, {%2}], [%3];" - :: "r"((unsigned)__cvta_generic_to_shared(smem)), - "l"(&tensor_map), - "r"(tile_start), - "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Tell the mbarrier how many bytes the TMA will deliver. 
- asm volatile( - "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), - "r"((unsigned)(TILE_SIZE * sizeof(float)))); - } - - __syncthreads(); - - // ---- Wait for the TMA load to complete ---- - if (tid == 0) - { - asm volatile( - "{ .reg .pred P; \n" - "WAIT: \n" - " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" - " @!P bra WAIT; \n" - "} \n" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); - } - - __syncthreads(); - - // ---- Copy the tile from shared memory to the output buffer ---- - if (tid < TILE_SIZE) - { - const int idx = tile_start + tid; - if (idx < N) - output[idx] = smem[tid]; - } -} -""" - -# --------------------------------------------------------------------------- -# Compile the kernel -# --------------------------------------------------------------------------- -prog = Program( - code, - code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), -) -mod = prog.compile("cubin") -ker = mod.get_kernel("tma_copy") - -# --------------------------------------------------------------------------- -# 1) Prepare input data and verify the initial TMA copy -# --------------------------------------------------------------------------- -N = 1024 -a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] -output = cp.zeros(N, dtype=cp.float32) -dev.sync() # cupy uses its own stream - -tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) - -n_tiles = N // TILE_SIZE -config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) -launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output, a), "TMA copy produced incorrect results" -print(f"TMA copy verified: {N} elements across {n_tiles} tiles") - -# --------------------------------------------------------------------------- -# 2) Demonstrate replace_address() -# Create a second tensor with different content, point the *same* 
-# descriptor at it, and re-launch without rebuilding the descriptor. -# --------------------------------------------------------------------------- -b = cp.full(N, fill_value=42.0, dtype=cp.float32) -dev.sync() - -tensor_map.replace_address(b) - -output2 = cp.zeros(N, dtype=cp.float32) -dev.sync() - -launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output2, b), "replace_address produced incorrect results" -print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 2a5ce9ad86..2a622a552d 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -4,15 +4,16 @@ # ################################################################################ # -# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors -# with cuda.core on Hopper+ GPUs (compute capability >= 9.0). +# This example demonstrates how to use TMA (Tensor Memory Accelerator) +# descriptors with cuda.core on Hopper+ GPUs (compute capability >= 9.0). # -# TMA enables efficient bulk data movement between global and shared memory using -# hardware-managed tensor map descriptors. This example shows: +# TMA enables efficient bulk data movement between global and shared memory +# using hardware-managed tensor map descriptors. This example shows: # # 1. Creating a TMA tiled descriptor from a CuPy device array # 2. Passing the descriptor to a kernel via launch() -# 3. Using TMA to load tiles into shared memory (via inline PTX) +# 3. Using libcudacxx TMA/barrier wrappers instead of raw PTX +# 4. 
Reusing the same descriptor with replace_address() # # Requirements: # - Hopper or later GPU (compute capability >= 9.0) @@ -35,39 +36,31 @@ launch, ) -# --------------------------------------------------------------------------- -# Check for Hopper+ GPU -# --------------------------------------------------------------------------- -dev = Device() -arch = dev.compute_capability -if arch < (9, 0): - print( - "TMA requires compute capability >= 9.0 (Hopper or later)", - file=sys.stderr, - ) - sys.exit(0) -dev.set_current() - # --------------------------------------------------------------------------- # CUDA kernel that uses TMA to load a 1-D tile into shared memory, then # copies the tile to an output buffer so we can verify correctness. # # The CUtensorMap struct (128 bytes) is defined inline so the kernel can be -# compiled with NVRTC without pulling in the full driver-API header. +# compiled with NVRTC without pulling in the full driver-API header. The +# kernel uses libcudacxx's `cuda::barrier` and TMA wrapper helpers rather +# than embedding raw PTX strings. # # Key points: # - The tensor map is passed by value with __grid_constant__ so the TMA # hardware can read it from grid-constant memory. -# - Thread 0 in each block issues the TMA load and manages the mbarrier. -# - All threads wait on the mbarrier, then copy from shared to global. +# - Thread 0 in each block issues the TMA load and waits on the barrier. +# - All threads synchronize before copying from shared to global memory. # --------------------------------------------------------------------------- TILE_SIZE = 128 # elements per tile (must match the kernel constant) code = r""" +#include + // Minimal definition of the 128-byte opaque tensor map struct. 
struct __align__(64) TensorMap { unsigned long long opaque[16]; }; static constexpr int TILE_SIZE = 128; +using TmaBarrier = cuda::barrier; extern "C" __global__ void tma_copy( @@ -76,54 +69,28 @@ int N) { __shared__ __align__(128) float smem[TILE_SIZE]; - __shared__ __align__(8) unsigned long long mbar; + __shared__ TmaBarrier bar; const int tid = threadIdx.x; const int tile_start = blockIdx.x * TILE_SIZE; - // ---- Thread 0: set up mbarrier and issue the TMA load ---- if (tid == 0) { - // Initialise a single-phase mbarrier (1 arriving thread). - asm volatile( - "mbarrier.init.shared.b64 [%0], 1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' - // from the tensor described by 'tensor_map' into shared memory. - asm volatile( - "cp.async.bulk.tensor.1d.shared::cluster.global.tile" - ".mbarrier::complete_tx::bytes" - " [%0], [%1, {%2}], [%3];" - :: "r"((unsigned)__cvta_generic_to_shared(smem)), - "l"(&tensor_map), - "r"(tile_start), - "r"((unsigned)__cvta_generic_to_shared(&mbar))); - - // Tell the mbarrier how many bytes the TMA will deliver. 
- asm volatile( - "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), - "r"((unsigned)(TILE_SIZE * sizeof(float)))); + init(&bar, 1); } - __syncthreads(); - // ---- Wait for the TMA load to complete ---- if (tid == 0) { - asm volatile( - "{ .reg .pred P; \n" - "WAIT: \n" - " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" - " @!P bra WAIT; \n" - "} \n" - :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + cuda::device::experimental::cp_async_bulk_tensor_1d_global_to_shared( + smem, + reinterpret_cast(&tensor_map), + tile_start, + bar); + bar.wait(cuda::device::barrier_arrive_tx(bar, 1, TILE_SIZE * sizeof(float))); } - __syncthreads(); - // ---- Copy the tile from shared memory to the output buffer ---- if (tid < TILE_SIZE) { const int idx = tile_start + tid; @@ -133,40 +100,64 @@ } """ -# --------------------------------------------------------------------------- -# Compile the kernel -# --------------------------------------------------------------------------- -prog = Program( - code, - code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), -) -mod = prog.compile("cubin") -ker = mod.get_kernel("tma_copy") +def main(): + # ----------------------------------------------------------------------- + # Check for Hopper+ GPU + # ----------------------------------------------------------------------- + dev = Device() + arch = dev.compute_capability + if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) + dev.set_current() + + # ----------------------------------------------------------------------- + # Compile the kernel + # ----------------------------------------------------------------------- + prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), + ) + mod = prog.compile("cubin") + ker = mod.get_kernel("tma_copy") -# 
--------------------------------------------------------------------------- -# 1) Prepare input data on the device -# --------------------------------------------------------------------------- -N = 1024 -a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] -output = cp.zeros(N, dtype=cp.float32) -dev.sync() # cupy uses its own stream + # ----------------------------------------------------------------------- + # 1) Prepare input data and verify the initial TMA copy + # ----------------------------------------------------------------------- + n = 1024 + src = cp.arange(n, dtype=cp.float32) + output = cp.zeros(n, dtype=cp.float32) + dev.sync() # CuPy uses its own stream -# --------------------------------------------------------------------------- -# 2) Create a TMA tiled descriptor from a StridedMemoryView. -# The dtype (float32) is inferred automatically from the CuPy array. -# --------------------------------------------------------------------------- -tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) + tensor_map = StridedMemoryView.from_any_interface(src, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) + + n_tiles = n // TILE_SIZE + config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) + launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(n)) + dev.sync() + + assert cp.array_equal(output, src), "TMA copy produced incorrect results" + print(f"TMA copy verified: {n} elements across {n_tiles} tiles") + + # ----------------------------------------------------------------------- + # 2) Demonstrate replace_address() without rebuilding the descriptor + # ----------------------------------------------------------------------- + replacement = cp.full(n, fill_value=42.0, dtype=cp.float32) + dev.sync() + + tensor_map.replace_address(replacement) + + output2 = cp.zeros(n, dtype=cp.float32) + launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(n)) + dev.sync() + 
+ assert cp.array_equal(output2, replacement), "replace_address produced incorrect results" + print("replace_address verified: descriptor reused with new source tensor") -# --------------------------------------------------------------------------- -# 3) Launch the kernel -# The TensorMapDescriptor is passed directly as a kernel argument — the -# 128-byte struct is copied into kernel parameter space automatically. -# --------------------------------------------------------------------------- -n_tiles = N // TILE_SIZE -config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) -launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) -dev.sync() -assert cp.array_equal(output, a), "TMA copy produced incorrect results" -print(f"TMA copy verified: {N} elements across {n_tiles} tiles") +if __name__ == "__main__": + main() From 9ff8d0fc002cc16f5a0b1583a0f1d62ad969d947 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 08:58:03 -0400 Subject: [PATCH 21/25] Teach the TMA example where to find libcudacxx headers. Use the toolkit include and optional cccl include roots when compiling the wrapper-based example so NVRTC can resolve cuda/barrier outside the test harness. 
Made-with: Cursor --- cuda_core/examples/tma_tensor_map.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 2a622a552d..b914651089 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -22,6 +22,7 @@ # # ################################################################################ +import os import sys import cupy as cp @@ -100,6 +101,25 @@ } """ + +def _get_cccl_include_paths(): + cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) + if cuda_path is None: + print("This example requires CUDA_PATH or CUDA_HOME to point to a CUDA toolkit.", file=sys.stderr) + sys.exit(1) + + cuda_include = os.path.join(cuda_path, "include") + if not os.path.isdir(cuda_include): + print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr) + sys.exit(1) + + include_path = [cuda_include] + cccl_include = os.path.join(cuda_include, "cccl") + if os.path.isdir(cccl_include): + include_path.insert(0, cccl_include) + return include_path + + def main(): # ----------------------------------------------------------------------- # Check for Hopper+ GPU @@ -113,6 +133,7 @@ def main(): ) sys.exit(0) dev.set_current() + include_path = _get_cccl_include_paths() # ----------------------------------------------------------------------- # Compile the kernel @@ -120,7 +141,7 @@ def main(): prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}", include_path=include_path), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") From 719f0f357c853b939f54a2152f0ba204f3d91947 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:18:28 -0400 Subject: [PATCH 22/25] Bundle tiled TensorMap options and type retained views. 
Centralize the tiled descriptor arguments in an options object, keep dtype-like inputs on the public path while using raw driver values internally, and declare StridedMemoryView in a pxd so retained views stay typed without extra helper indirection. Made-with: Cursor --- cuda_core/cuda/core/_memoryview.pxd | 24 +++ cuda_core/cuda/core/_memoryview.pyx | 38 +--- cuda_core/cuda/core/_tensor_map.pxd | 3 +- cuda_core/cuda/core/_tensor_map.pyx | 271 +++++++++++++++++++++------- cuda_core/tests/test_tensor_map.py | 33 ++++ 5 files changed, 276 insertions(+), 93 deletions(-) create mode 100644 cuda_core/cuda/core/_memoryview.pxd diff --git a/cuda_core/cuda/core/_memoryview.pxd b/cuda_core/cuda/core/_memoryview.pxd new file mode 100644 index 0000000000..c12aa38ad8 --- /dev/null +++ b/cuda_core/cuda/core/_memoryview.pxd @@ -0,0 +1,24 @@ +from libc.stdint cimport intptr_t + +from cuda.core._dlpack cimport DLTensor +from cuda.core._layout cimport _StridedLayout + + +cdef class StridedMemoryView: + cdef readonly: + intptr_t ptr + int device_id + bint is_device_accessible + bint readonly + object exporting_obj + + cdef: + object metadata + DLTensor* dl_tensor + _StridedLayout _layout + object _buffer + object _dtype + + cdef inline _StridedLayout get_layout(self) + cdef inline object get_buffer(self) + cdef inline object get_dtype(self) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index e34aca424d..7dc32b7ec7 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -107,35 +107,6 @@ cdef class StridedMemoryView: it will be the Buffer instance passed to the method. """ - cdef readonly: - intptr_t ptr - int device_id - bint is_device_accessible - bint readonly - object exporting_obj - - cdef: - # If using dlpack, this is a strong reference to the result of - # obj.__dlpack__() so we can lazily create shape and strides from - # it later. 
If using CAI, this is a reference to the source - # `__cuda_array_interface__` object. - object metadata - - # The tensor object if has obj has __dlpack__, otherwise must be NULL - DLTensor *dl_tensor - - # Memoized properties - # Either lazily inferred from dl_tensor/metadata, - # or explicitly provided if created with from_buffer(). - _StridedLayout _layout - # Either exporting_obj if it is a Buffer, otherwise a Buffer instance - # with owner set to the exporting object. - object _buffer - # Either lazily inferred from dl_tensor/metadata, - # or explicitly provided if created with from_buffer(). - # In the latter case, it can be None. - object _dtype - def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None: cdef str clsname = self.__class__.__name__ if obj is not None: @@ -318,8 +289,9 @@ cdef class StridedMemoryView: def as_tensor_map( self, - box_dim, + box_dim=None, *, + options=None, element_strides=None, data_type=None, interleave=None, @@ -330,11 +302,15 @@ cdef class StridedMemoryView: """Create a tiled :obj:`TensorMapDescriptor` from this view. This is the public entry point for creating tiled tensor map - descriptors in ``cuda.core``. + descriptors in ``cuda.core``. Pass either ``box_dim`` and the + individual keyword arguments directly, or provide bundled tiled + options via ``options=``. 
""" from cuda.core._tensor_map import TensorMapDescriptor kwargs = {} + if options is not None: + kwargs["options"] = options if element_strides is not None: kwargs["element_strides"] = element_strides if data_type is not None: diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 4c60b7fc70..25aef56626 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -4,6 +4,7 @@ from cuda.bindings cimport cydriver from libc.stdint cimport intptr_t +from cuda.core._memoryview cimport StridedMemoryView cdef class TensorMapDescriptor: @@ -11,7 +12,7 @@ cdef class TensorMapDescriptor: cdef int _device_id cdef intptr_t _context cdef object _source_ref - cdef object _view_ref + cdef StridedMemoryView _view_ref cdef object _repr_info cdef int _check_context_compat(self) except -1 diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 7ed69bf722..c101ab1963 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -9,10 +9,12 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._dlpack cimport kDLInt, kDLUInt, kDLFloat, kDLBfloat, _kDLCUDA import enum +from dataclasses import dataclass import numpy from cuda.core._memoryview import StridedMemoryView +from cuda.core._utils.cuda_utils import check_or_create_options cdef extern from "_cpp/tensor_map_cccl.h": int cuda_core_cccl_make_tma_descriptor_tiled( @@ -124,38 +126,166 @@ ELSE: W128 = 1 +_TMA_DT_UINT8 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8) +_TMA_DT_UINT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16) +_TMA_DT_UINT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32) +_TMA_DT_INT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32) +_TMA_DT_UINT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64) +_TMA_DT_INT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64) +_TMA_DT_FLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16) +_TMA_DT_FLOAT32 = 
int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32) +_TMA_DT_FLOAT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64) +_TMA_DT_BFLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16) +_TMA_DT_FLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ) +_TMA_DT_TFLOAT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32) +_TMA_DT_TFLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ) + + +def _normalize_tensor_map_data_type(data_type): + if data_type is None or isinstance(data_type, TensorMapDataType): + return data_type + try: + return numpy.dtype(data_type) + except TypeError as e: + raise TypeError( + "data_type must be a TensorMapDataType or a numpy/ml_dtypes dtype, " + f"got {type(data_type)}") from e + + +def _normalize_tensor_map_sequence(name, values): + try: + values = tuple(values) + except TypeError as e: + raise TypeError(f"{name} must be a tuple of ints, got {type(values)}") from e + for i, value in enumerate(values): + if not isinstance(value, int): + raise TypeError(f"{name}[{i}] must be an int, got {type(value)}") + return values + + +def _require_tensor_map_enum(name, value, enum_type): + if not isinstance(value, enum_type): + raise TypeError(f"{name} must be a {enum_type.__name__}, got {type(value)}") + return value + + +@dataclass +class TensorMapDescriptorOptions: + """Options for :meth:`cuda.core.StridedMemoryView.as_tensor_map`. + + Attributes + ---------- + box_dim : tuple[int, ...] + Tile size for each tensor dimension, expressed in elements. + element_strides : tuple[int, ...], optional + Per-dimension element traversal strides. + data_type : object, optional + Explicit dtype override. Prefer NumPy or ``ml_dtypes`` dtype objects; + :class:`TensorMapDataType` remains accepted for compatibility. + interleave : TensorMapInterleave, optional + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle, optional + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion, optional + L2 promotion mode. Default ``NONE``. 
+ oob_fill : TensorMapOOBFill, optional + Out-of-bounds fill mode. Default ``NONE``. + """ + + box_dim: tuple[int, ...] + element_strides: tuple[int, ...] | None = None + data_type: object = None + interleave: TensorMapInterleave = TensorMapInterleave.NONE + swizzle: TensorMapSwizzle = TensorMapSwizzle.NONE + l2_promotion: TensorMapL2Promotion = TensorMapL2Promotion.NONE + oob_fill: TensorMapOOBFill = TensorMapOOBFill.NONE + + def __post_init__(self): + self.box_dim = _normalize_tensor_map_sequence("box_dim", self.box_dim) + if self.element_strides is not None: + self.element_strides = _normalize_tensor_map_sequence("element_strides", self.element_strides) + self.data_type = _normalize_tensor_map_data_type(self.data_type) + self.interleave = _require_tensor_map_enum("interleave", self.interleave, TensorMapInterleave) + self.swizzle = _require_tensor_map_enum("swizzle", self.swizzle, TensorMapSwizzle) + self.l2_promotion = _require_tensor_map_enum("l2_promotion", self.l2_promotion, TensorMapL2Promotion) + self.oob_fill = _require_tensor_map_enum("oob_fill", self.oob_fill, TensorMapOOBFill) + + +def _coerce_tensor_map_descriptor_options( + box_dim, + options, + *, + element_strides, + data_type, + interleave, + swizzle, + l2_promotion, + oob_fill, +): + if options is not None: + if ( + box_dim is not None + or element_strides is not None + or data_type is not None + or interleave != TensorMapInterleave.NONE + or swizzle != TensorMapSwizzle.NONE + or l2_promotion != TensorMapL2Promotion.NONE + or oob_fill != TensorMapOOBFill.NONE + ): + raise TypeError( + "Specify either options or the individual tensor map arguments, not both") + return check_or_create_options( + TensorMapDescriptorOptions, + options, + "Tensor map descriptor options", + ) + + if box_dim is None: + raise TypeError("box_dim is required unless options is provided") + + return TensorMapDescriptorOptions( + box_dim=box_dim, + element_strides=element_strides, + data_type=data_type, + 
interleave=interleave, + swizzle=swizzle, + l2_promotion=l2_promotion, + oob_fill=oob_fill, + ) + + # Mapping from numpy dtype to TMA data type _NUMPY_DTYPE_TO_TMA = { - numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, - numpy.dtype(numpy.uint16): TensorMapDataType.UINT16, - numpy.dtype(numpy.uint32): TensorMapDataType.UINT32, - numpy.dtype(numpy.int32): TensorMapDataType.INT32, - numpy.dtype(numpy.uint64): TensorMapDataType.UINT64, - numpy.dtype(numpy.int64): TensorMapDataType.INT64, - numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16, - numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32, - numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64, + numpy.dtype(numpy.uint8): _TMA_DT_UINT8, + numpy.dtype(numpy.uint16): _TMA_DT_UINT16, + numpy.dtype(numpy.uint32): _TMA_DT_UINT32, + numpy.dtype(numpy.int32): _TMA_DT_INT32, + numpy.dtype(numpy.uint64): _TMA_DT_UINT64, + numpy.dtype(numpy.int64): _TMA_DT_INT64, + numpy.dtype(numpy.float16): _TMA_DT_FLOAT16, + numpy.dtype(numpy.float32): _TMA_DT_FLOAT32, + numpy.dtype(numpy.float64): _TMA_DT_FLOAT64, } if ml_bfloat16 is not None: - _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16 + _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = _TMA_DT_BFLOAT16 # Mapping from TMA data type to element size in bytes _TMA_DATA_TYPE_SIZE = { - TensorMapDataType.UINT8: 1, - TensorMapDataType.UINT16: 2, - TensorMapDataType.UINT32: 4, - TensorMapDataType.INT32: 4, - TensorMapDataType.UINT64: 8, - TensorMapDataType.INT64: 8, - TensorMapDataType.FLOAT16: 2, - TensorMapDataType.FLOAT32: 4, - TensorMapDataType.FLOAT64: 8, - TensorMapDataType.BFLOAT16: 2, - TensorMapDataType.FLOAT32_FTZ: 4, - TensorMapDataType.TFLOAT32: 4, - TensorMapDataType.TFLOAT32_FTZ: 4, + _TMA_DT_UINT8: 1, + _TMA_DT_UINT16: 2, + _TMA_DT_UINT32: 4, + _TMA_DT_INT32: 4, + _TMA_DT_UINT64: 8, + _TMA_DT_INT64: 8, + _TMA_DT_FLOAT16: 2, + _TMA_DT_FLOAT32: 4, + _TMA_DT_FLOAT64: 8, + _TMA_DT_BFLOAT16: 2, + _TMA_DT_FLOAT32_FTZ: 4, + _TMA_DT_TFLOAT32: 4, 
+ _TMA_DT_TFLOAT32_FTZ: 4, } @@ -164,13 +294,8 @@ def _resolve_data_type(view, data_type): if data_type is not None: if isinstance(data_type, TensorMapDataType): - return data_type - try: - dt = numpy.dtype(data_type) - except TypeError as e: - raise TypeError( - "data_type must be a TensorMapDataType or a numpy/ml_dtypes dtype, " - f"got {type(data_type)}") from e + return int(data_type) + dt = _normalize_tensor_map_data_type(data_type) tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) if tma_dt is None: raise ValueError( @@ -195,57 +320,57 @@ def _resolve_data_type(view, data_type): cdef inline bint _tma_dtype_to_dlpack( - object tma_dt, + int tma_dt, uint8_t* out_code, uint8_t* out_bits, uint16_t* out_lanes, ) noexcept: - if tma_dt == TensorMapDataType.UINT8: + if tma_dt == _TMA_DT_UINT8: out_code[0] = kDLUInt out_bits[0] = 8 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.UINT16: + if tma_dt == _TMA_DT_UINT16: out_code[0] = kDLUInt out_bits[0] = 16 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.UINT32: + if tma_dt == _TMA_DT_UINT32: out_code[0] = kDLUInt out_bits[0] = 32 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.UINT64: + if tma_dt == _TMA_DT_UINT64: out_code[0] = kDLUInt out_bits[0] = 64 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.INT32: + if tma_dt == _TMA_DT_INT32: out_code[0] = kDLInt out_bits[0] = 32 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.INT64: + if tma_dt == _TMA_DT_INT64: out_code[0] = kDLInt out_bits[0] = 64 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.FLOAT16: + if tma_dt == _TMA_DT_FLOAT16: out_code[0] = kDLFloat out_bits[0] = 16 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.FLOAT32: + if tma_dt == _TMA_DT_FLOAT32: out_code[0] = kDLFloat out_bits[0] = 32 out_lanes[0] = 1 return True - if tma_dt == TensorMapDataType.FLOAT64: + if tma_dt == _TMA_DT_FLOAT64: out_code[0] = kDLFloat out_bits[0] = 64 out_lanes[0] = 1 return True - if tma_dt == 
TensorMapDataType.BFLOAT16: + if tma_dt == _TMA_DT_BFLOAT16: out_code[0] = kDLBfloat out_bits[0] = 16 out_lanes[0] = 1 @@ -287,8 +412,6 @@ def _require_view_device(view, expected_device_id, operation): if device_type == _kDLCUDA and device_id != expected_device_id: raise ValueError( f"{operation} expects tensor on device {expected_device_id}, got {device_id}") - - cdef inline intptr_t _get_current_context_ptr() except? 0: cdef cydriver.CUcontext ctx with nogil: @@ -397,7 +520,8 @@ cdef class TensorMapDescriptor: return Device(self._device_id) @classmethod - def _from_tiled(cls, view, box_dim, *, + def _from_tiled(cls, view, box_dim=None, *, + options=None, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, @@ -410,16 +534,21 @@ cdef class TensorMapDescriptor: ---------- view : StridedMemoryView A device-accessible view with a 16-byte-aligned pointer. - box_dim : tuple of int + box_dim : tuple of int, optional The size of each tile dimension (in elements). Must have the same rank as the tensor and each value must be in [1, 256]. Specified in the same (row-major) order as the tensor shape. + Required unless ``options`` is provided. + options : TensorMapDescriptorOptions or mapping, optional + Bundled tiled-descriptor options. When provided, do not also pass + ``box_dim`` or the individual option kwargs. element_strides : tuple of int, optional Per-dimension element traversal strides. Default is all 1s. Specified in the same (row-major) order as the tensor shape. - data_type : TensorMapDataType, optional - Explicit data type override. If ``None``, inferred from the - tensor's dtype. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. interleave : TensorMapInterleave Interleave layout. Default ``NONE``. 
swizzle : TensorMapSwizzle @@ -441,6 +570,24 @@ cdef class TensorMapDescriptor: """ cdef TensorMapDescriptor desc = cls.__new__(cls) + opts = _coerce_tensor_map_descriptor_options( + box_dim, + options, + element_strides=element_strides, + data_type=data_type, + interleave=interleave, + swizzle=swizzle, + l2_promotion=l2_promotion, + oob_fill=oob_fill, + ) + box_dim = opts.box_dim + element_strides = opts.element_strides + data_type = opts.data_type + interleave = opts.interleave + swizzle = opts.swizzle + l2_promotion = opts.l2_promotion + oob_fill = opts.oob_fill + _validate_tensor_map_view(view) # Keep both the original tensor object and the validated view alive. # For DLPack exporters, the view may hold the owning capsule whose @@ -452,7 +599,7 @@ cdef class TensorMapDescriptor: _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_tiled") tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) + cdef int c_data_type_int = tma_dt cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int cdef intptr_t global_address = view.ptr @@ -551,7 +698,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "tiled", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } return desc @@ -614,7 +761,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "tiled", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } @@ -651,9 +798,10 @@ cdef class TensorMapDescriptor: Number of pixels per column. element_strides : tuple of int, optional Per-dimension element traversal strides. Default is all 1s. - data_type : TensorMapDataType, optional - Explicit data type override. If ``None``, inferred from the - tensor's dtype. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. 
Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. interleave : TensorMapInterleave Interleave layout. Default ``NONE``. swizzle : TensorMapSwizzle @@ -683,7 +831,7 @@ cdef class TensorMapDescriptor: _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col") tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) + cdef int c_data_type_int = tma_dt cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int cdef intptr_t global_address = view.ptr @@ -767,7 +915,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "im2col", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } @@ -803,9 +951,10 @@ cdef class TensorMapDescriptor: Number of pixels per column. element_strides : tuple of int, optional Per-dimension element traversal strides. Default is all 1s. - data_type : TensorMapDataType, optional - Explicit data type override. If ``None``, inferred from the - tensor's dtype. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. interleave : TensorMapInterleave Interleave layout. Default ``NONE``. 
mode : TensorMapIm2ColWideMode @@ -841,7 +990,7 @@ cdef class TensorMapDescriptor: _require_view_device(view, desc._device_id, "TensorMapDescriptor._from_im2col_wide") tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) + cdef int c_data_type_int = tma_dt cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int cdef intptr_t global_address = view.ptr @@ -909,7 +1058,7 @@ cdef class TensorMapDescriptor: desc._repr_info = { "method": "im2col_wide", "rank": rank, - "data_type": tma_dt, + "data_type": TensorMapDataType(tma_dt), "swizzle": swizzle, } diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 099140354c..3fb5f9808a 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -15,6 +15,7 @@ from cuda.core._dlpack import DLDeviceType from cuda.core._tensor_map import ( TensorMapDataType, + TensorMapDescriptorOptions, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -145,6 +146,38 @@ def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): ) assert desc is not None + def test_strided_memory_view_as_tensor_map_options(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + desc = view.as_tensor_map( + options=TensorMapDescriptorOptions( + box_dim=(32, 32), + data_type=np.float32, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + ) + ) + assert desc is not None + + def test_strided_memory_view_as_tensor_map_options_dict(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + desc = _as_view(buf).as_tensor_map( + options={ + "box_dim": (64,), + "data_type": np.float32, + "element_strides": (1,), + } + ) + assert desc is not None + + def test_strided_memory_view_as_tensor_map_rejects_options_with_kwargs(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(TypeError, match="Specify 
either options or the individual tensor map arguments"): + _as_view(buf).as_tensor_map( + box_dim=(64,), + options=TensorMapDescriptorOptions(box_dim=(64,)), + ) + def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) From ad1c8009bfc3085ae6949eef9ee831a91d74fb37 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:42:18 -0400 Subject: [PATCH 23/25] Keep the rebased TensorMap validation helper consistent. Remove the stale Cython-only `_require_view_device` definition left behind while porting the TensorMap fixes onto the PR head branch so the extension builds against the newer managed-memory-aware helper. Made-with: Cursor --- cuda_core/cuda/core/_tensor_map.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index c101ab1963..e1e5daa9fd 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -427,18 +427,6 @@ cdef inline int _get_current_device_id() except -1: HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) return dev - -cdef inline int _require_view_device( - view, - int device_id, - object caller, -) except -1: - if view.device_id != device_id: - raise ValueError( - f"{caller} expects tensor on device {device_id}, got {view.device_id}") - return 0 - - def _compute_byte_strides(shape, strides, elem_size): """Compute byte strides from element strides or C-contiguous fallback. From a1203ac9d1ff30492c90fd0cc6b05da7064702f9 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:44:34 -0400 Subject: [PATCH 24/25] Apply the pre-commit fixes for the rebased TensorMap branch. 
Add the missing SPDX header on the new `_memoryview.pxd` file and keep the test module formatted the way `ruff format` expects so pre-commit.ci can clear on the live PR branch. Made-with: Cursor --- cuda_core/cuda/core/_memoryview.pxd | 4 ++++ cuda_core/tests/test_tensor_map.py | 1 + 2 files changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/_memoryview.pxd b/cuda_core/cuda/core/_memoryview.pxd index c12aa38ad8..5b50ae6dc7 100644 --- a/cuda_core/cuda/core/_memoryview.pxd +++ b/cuda_core/cuda/core/_memoryview.pxd @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + from libc.stdint cimport intptr_t from cuda.core._dlpack cimport DLTensor diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 3fb5f9808a..967b96b701 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -62,6 +62,7 @@ def __init__(self, device_type, device_id): def __dlpack_device__(self): return (self._device_type, self._device_id) + def _as_view(obj): if isinstance(obj, StridedMemoryView): return obj From 3c9e32daa649816ccba84d4d9335ba4d8cbb2b2c Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:16:40 -0400 Subject: [PATCH 25/25] Keep the TensorMap multi-GPU tests on the view-based API. Replace the last stale `TensorMapDescriptor.from_tiled()` call sites with `StridedMemoryView.as_tensor_map()` so the multi-device CI coverage exercises the constructor path that actually exists on this branch. 
Made-with: Cursor --- cuda_core/tests/test_tensor_map.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 967b96b701..9ca8790d2b 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -376,8 +376,7 @@ def test_replace_address_rejects_tensor_from_other_device(self, dev, skip_if_no_ dev0.set_current() buf0 = dev0.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - buf0, + desc = _as_view(buf0).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -399,8 +398,7 @@ def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cud skip_if_managed_memory_unsupported(dev1) dev1.set_current() - desc = TensorMapDescriptor.from_tiled( - dev1.allocate(1024 * 4), + desc = _as_view(dev1.allocate(1024 * 4)).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -427,10 +425,9 @@ def test_from_tiled_rejects_tensor_from_other_device(self, init_cuda): with pytest.raises( ValueError, - match=r"TensorMapDescriptor\.from_tiled expects tensor on device 0, got 1", + match=r"TensorMapDescriptor\._from_tiled expects tensor on device 0, got 1", ): - TensorMapDescriptor.from_tiled( - buf1, + _as_view(buf1).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, ) @@ -448,8 +445,7 @@ def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda): mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id)) managed_buf = mr.allocate(1024 * 4) - desc = TensorMapDescriptor.from_tiled( - managed_buf, + desc = _as_view(managed_buf).as_tensor_map( box_dim=(64,), data_type=TensorMapDataType.FLOAT32, )