Skip to content

Commit 96a3e84

Browse files
committed
Reuse CCCL TMA descriptor construction for tiled TensorMap and keep validated views alive to avoid DLPack-backed pointer lifetime hazards.
Add explicit tiled element-stride coverage and acknowledge the DLPack include-layout compatibility follow-up in NVIDIA/cccl#7871. Made-with: Cursor
1 parent bdf39a2 commit 96a3e84

9 files changed

Lines changed: 497 additions & 85 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ __pycache__/
1414
!*_impl.cpp
1515
!cuda_bindings/cuda/bindings/_lib/param_packer.cpp
1616
!cuda_bindings/cuda/bindings/_bindings/loader.cpp
17+
!cuda_core/cuda/core/_cpp/*.cpp
1718
cache_driver
1819
cache_runtime
1920
cache_nvrtc

cuda_core/cuda/core/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@
6060
StridedMemoryView,
6161
args_viewable_as_strided_memory,
6262
)
63-
<<<<<<< HEAD
6463
from cuda.core._module import Kernel, ObjectCode
6564
from cuda.core._program import Program, ProgramOptions
6665
from cuda.core._stream import (
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include "tensor_map_cccl.h"
6+
7+
#include <string.h>
8+
9+
#include <algorithm>
10+
#include <exception>
11+
12+
#if defined(__has_include)
13+
# if __has_include(<cuda/tma>)
14+
# include <cuda/tma>
15+
# define CUDA_CORE_HAS_CUDA_TMA 1
16+
# else
17+
# define CUDA_CORE_HAS_CUDA_TMA 0
18+
# endif
19+
# if __has_include(<dlpack/dlpack.h>)
20+
# include <dlpack/dlpack.h>
21+
# define CUDA_CORE_HAS_DLPACK_H 1
22+
# else
23+
# define CUDA_CORE_HAS_DLPACK_H 0
24+
# endif
25+
#else
26+
# define CUDA_CORE_HAS_CUDA_TMA 0
27+
# define CUDA_CORE_HAS_DLPACK_H 0
28+
#endif
29+
30+
// Copy `msg` into the caller-provided buffer (err, err_cap-style pair),
// truncating to fit and always NUL-terminating the result.
// A NULL/zero-capacity buffer makes this a no-op; a NULL `msg` clears the buffer.
static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept
{
    if (err == nullptr || cap == 0)
        return;

    size_t written = 0;
    if (msg != nullptr)
    {
        // Copy at most cap-1 characters so the terminator always fits.
        while (written + 1 < cap && msg[written] != '\0')
        {
            err[written] = msg[written];
            ++written;
        }
    }
    err[written] = '\0';
}
45+
46+
int cuda_core_cccl_make_tma_descriptor_tiled(
47+
void* out_tensor_map,
48+
void* data,
49+
int device_type,
50+
int device_id,
51+
int ndim,
52+
const int64_t* shape,
53+
const int64_t* strides,
54+
uint8_t dtype_code,
55+
uint8_t dtype_bits,
56+
uint16_t dtype_lanes,
57+
const int* box_sizes,
58+
const int* elem_strides,
59+
int interleave_layout,
60+
int swizzle,
61+
int l2_fetch_size,
62+
int oob_fill,
63+
char* err,
64+
size_t err_cap) noexcept
65+
{
66+
#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H)
67+
(void)out_tensor_map;
68+
(void)data;
69+
(void)device_type;
70+
(void)device_id;
71+
(void)ndim;
72+
(void)shape;
73+
(void)strides;
74+
(void)dtype_code;
75+
(void)dtype_bits;
76+
(void)dtype_lanes;
77+
(void)box_sizes;
78+
(void)elem_strides;
79+
(void)interleave_layout;
80+
(void)swizzle;
81+
(void)l2_fetch_size;
82+
(void)oob_fill;
83+
cuda_core_write_err(err, err_cap, "CCCL <cuda/tma> and/or <dlpack/dlpack.h> not available at build time");
84+
return 1;
85+
#else
86+
try
87+
{
88+
if (!out_tensor_map)
89+
{
90+
cuda_core_write_err(err, err_cap, "out_tensor_map is NULL");
91+
return 1;
92+
}
93+
if (!data)
94+
{
95+
cuda_core_write_err(err, err_cap, "tensor data pointer is NULL");
96+
return 1;
97+
}
98+
if (!shape || !box_sizes || ndim <= 0)
99+
{
100+
cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes");
101+
return 1;
102+
}
103+
104+
DLTensor t{};
105+
t.data = data;
106+
t.device = {static_cast<DLDeviceType>(device_type), device_id};
107+
t.ndim = ndim;
108+
t.dtype.code = dtype_code;
109+
t.dtype.bits = dtype_bits;
110+
t.dtype.lanes = dtype_lanes;
111+
// CCCL promises not to mutate the arrays, but DLPack uses non-const pointers.
112+
t.shape = const_cast<int64_t*>(shape);
113+
t.strides = const_cast<int64_t*>(strides);
114+
t.byte_offset = 0;
115+
116+
const auto layout = static_cast<cuda::tma_interleave_layout>(interleave_layout);
117+
const auto swz = static_cast<cuda::tma_swizzle>(swizzle);
118+
const auto l2 = static_cast<cuda::tma_l2_fetch_size>(l2_fetch_size);
119+
const auto oob = static_cast<cuda::tma_oob_fill>(oob_fill);
120+
121+
auto box = cuda::std::span<const int>(box_sizes, static_cast<size_t>(ndim));
122+
123+
CUtensorMap desc{};
124+
if (elem_strides)
125+
{
126+
auto es = cuda::std::span<const int>(elem_strides, static_cast<size_t>(ndim));
127+
desc = cuda::make_tma_descriptor(t, box, es, layout, swz, l2, oob);
128+
}
129+
else
130+
{
131+
desc = cuda::make_tma_descriptor(t, box, layout, swz, l2, oob);
132+
}
133+
134+
::memcpy(out_tensor_map, &desc, sizeof(CUtensorMap));
135+
cuda_core_write_err(err, err_cap, nullptr);
136+
return 0;
137+
}
138+
catch (const std::exception& e)
139+
{
140+
cuda_core_write_err(err, err_cap, e.what());
141+
return 1;
142+
}
143+
catch (...)
144+
{
145+
cuda_core_write_err(err, err_cap, "unknown error while building TMA descriptor");
146+
return 1;
147+
}
148+
#endif
149+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_
#define CUDA_CORE_TENSOR_MAP_CCCL_H_

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from <cuda/tma>).
//
// Returns 0 on success; on failure returns non-zero and writes a best-effort
// human-readable message into (err, err_cap) if provided.
//
// The tensor is described with DLPack-style fields: device_type/device_id
// mirror DLDevice, and dtype_code/dtype_bits/dtype_lanes mirror DLDataType.
// interleave_layout, swizzle, l2_fetch_size and oob_fill are passed through
// as the integer values of the corresponding cuda::tma_* enumerations —
// NOTE(review): callers must supply values matching the CCCL enum encodings.
int cuda_core_cccl_make_tma_descriptor_tiled(
    void* out_tensor_map,        // receives the descriptor; must be CUtensorMap-sized
    void* data,                  // base pointer of the tensor data
    int device_type,             // DLPack DLDeviceType value
    int device_id,
    int ndim,                    // tensor rank; must be > 0
    const int64_t* shape, // length ndim
    const int64_t* strides, // length ndim, or NULL for contiguous
    uint8_t dtype_code,          // DLPack DLDataType.code
    uint8_t dtype_bits,          // DLPack DLDataType.bits
    uint16_t dtype_lanes,        // DLPack DLDataType.lanes
    const int* box_sizes, // length ndim
    const int* elem_strides, // length ndim, or NULL for all-ones overload
    int interleave_layout,       // cuda::tma_interleave_layout value
    int swizzle,                 // cuda::tma_swizzle value
    int l2_fetch_size,           // cuda::tma_l2_fetch_size value
    int oob_fill,                // cuda::tma_oob_fill value
    char* err,                   // optional error buffer; may be NULL
    size_t err_cap) noexcept;    // capacity of err in bytes

#ifdef __cplusplus
} // extern "C"
#endif

#endif // CUDA_CORE_TENSOR_MAP_CCCL_H_

cuda_core/cuda/core/_tensor_map.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver
88
cdef class TensorMapDescriptor:
99
cdef cydriver.CUtensorMap _tensor_map
1010
cdef object _source_ref
11+
cdef object _view_ref
1112
cdef object _repr_info
1213

1314
cdef void* _get_data_ptr(self)

0 commit comments

Comments
 (0)