Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e3e1899
initial commit
rparolin Feb 23, 2026
77a3c8e
tma wide
rparolin Feb 24, 2026
19c4a0f
clean up
rparolin Feb 24, 2026
35a04b9
Add comments to prepare_tensor_map_arg explaining allocation and life…
rparolin Feb 24, 2026
bb19e4f
Address Copilot review feedback
rparolin Feb 25, 2026
23a8900
Split TMA example into two focused files
rparolin Feb 25, 2026
0a1b720
pre-commit
rparolin Feb 25, 2026
44fbdcf
adding stride meta data to gpu allocated memory
rparolin Feb 25, 2026
bdf39a2
im2col fixes
rparolin Feb 25, 2026
96a3e84
Reuse CCCL TMA descriptor construction for tiled TensorMap and keep v…
cpcloud Mar 3, 2026
1a6b416
Skip im2col-wide TensorMap tests when runtime support is unavailable.
cpcloud Mar 7, 2026
892ee60
Align TensorMap API surface with review feedback and enforce context …
cpcloud Mar 7, 2026
5a0e141
Restore cu12 feature definitions in cuda_core pixi manifest.
cpcloud Mar 7, 2026
eef1c7a
Handle TensorMap device validation by DLPack type
rparolin Mar 10, 2026
99ff204
Merge branch 'main' into rparolin/tma_feature
rparolin Mar 11, 2026
d6c311a
formatting change
rparolin Mar 11, 2026
9673bcf
Update cuda_core/cuda/core/_cpp/tensor_map_cccl.h
rparolin Mar 13, 2026
ae86192
Update cuda_core/examples/tma_replace_address.py
rparolin Mar 14, 2026
232b621
Update cuda_core/cuda/core/__init__.py
rparolin Mar 14, 2026
358d975
Align TensorMap creation and launch behavior with the latest review g…
cpcloud Mar 14, 2026
e67e9d3
Consolidate the TMA examples around the libcudacxx wrappers.
cpcloud Mar 14, 2026
9ff8d0f
Teach the TMA example where to find libcudacxx headers.
cpcloud Mar 14, 2026
719f0f3
Bundle tiled TensorMap options and type retained views.
cpcloud Mar 14, 2026
ad1c800
Keep the rebased TensorMap validation helper consistent.
cpcloud Mar 14, 2026
a1203ac
Apply the pre-commit fixes for the rebased TensorMap branch.
cpcloud Mar 14, 2026
3c9e32d
Keep the TensorMap multi-GPU tests on the view-based API.
cpcloud Mar 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@
Stream,
StreamOptions,
)
from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions
154 changes: 154 additions & 0 deletions cuda_core/cuda/core/_cpp/tensor_map.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#include "tensor_map_cccl.h"

#include <string.h>

#include <algorithm>
#include <exception>

#if defined(__has_include)
// Older CTK releases do not ship <cuda/tma>. When it is unavailable we keep
// the CCCL helper compiled out and fall back to the direct driver path.
# if __has_include(<cuda/tma>)
# include <cuda/tma>
# define CUDA_CORE_HAS_CUDA_TMA 1
# else
# define CUDA_CORE_HAS_CUDA_TMA 0
# endif
# if __has_include("dlpack.h")
# include "dlpack.h"
# define CUDA_CORE_HAS_DLPACK_H 1
# elif __has_include(<dlpack/dlpack.h>)
# include <dlpack/dlpack.h>
# define CUDA_CORE_HAS_DLPACK_H 1
# else
# define CUDA_CORE_HAS_DLPACK_H 0
# endif
#else
# define CUDA_CORE_HAS_CUDA_TMA 0
# define CUDA_CORE_HAS_DLPACK_H 0
#endif

static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept
{
if (!err || cap == 0)
return;
if (!msg)
{
err[0] = '\0';
return;
}
size_t n = ::strlen(msg);
if (n >= cap)
n = cap - 1;
::memcpy(err, msg, n);
err[n] = '\0';
}

int cuda_core_cccl_make_tma_descriptor_tiled(
void* out_tensor_map,
void* data,
int device_type,
int device_id,
int ndim,
const int64_t* shape,
const int64_t* strides,
uint8_t dtype_code,
uint8_t dtype_bits,
uint16_t dtype_lanes,
const int* box_sizes,
const int* elem_strides,
int interleave_layout,
int swizzle,
int l2_fetch_size,
int oob_fill,
char* err,
size_t err_cap) noexcept
{
#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H)
(void)out_tensor_map;
(void)data;
(void)device_type;
(void)device_id;
(void)ndim;
(void)shape;
(void)strides;
(void)dtype_code;
(void)dtype_bits;
(void)dtype_lanes;
(void)box_sizes;
(void)elem_strides;
(void)interleave_layout;
(void)swizzle;
(void)l2_fetch_size;
(void)oob_fill;
cuda_core_write_err(err, err_cap, "CCCL <cuda/tma> and/or <dlpack/dlpack.h> not available at build time");
return 1;
#else
try
{
if (!out_tensor_map)
{
cuda_core_write_err(err, err_cap, "out_tensor_map is NULL");
return 1;
}
if (!data)
{
cuda_core_write_err(err, err_cap, "tensor data pointer is NULL");
return 1;
}
if (!shape || !box_sizes || ndim <= 0)
{
cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes");
return 1;
}

DLTensor t{};
t.data = data;
t.device = {static_cast<DLDeviceType>(device_type), device_id};
t.ndim = ndim;
t.dtype.code = dtype_code;
t.dtype.bits = dtype_bits;
t.dtype.lanes = dtype_lanes;
// CCCL promises not to mutate the arrays, but DLPack uses non-const pointers.
t.shape = const_cast<int64_t*>(shape);
t.strides = const_cast<int64_t*>(strides);
t.byte_offset = 0;

const auto layout = static_cast<cuda::tma_interleave_layout>(interleave_layout);
const auto swz = static_cast<cuda::tma_swizzle>(swizzle);
const auto l2 = static_cast<cuda::tma_l2_fetch_size>(l2_fetch_size);
const auto oob = static_cast<cuda::tma_oob_fill>(oob_fill);

auto box = cuda::std::span<const int>(box_sizes, static_cast<size_t>(ndim));

CUtensorMap desc{};
if (elem_strides)
{
auto es = cuda::std::span<const int>(elem_strides, static_cast<size_t>(ndim));
desc = cuda::make_tma_descriptor(t, box, es, layout, swz, l2, oob);
}
else
{
desc = cuda::make_tma_descriptor(t, box, layout, swz, l2, oob);
}

::memcpy(out_tensor_map, &desc, sizeof(CUtensorMap));
cuda_core_write_err(err, err_cap, nullptr);
return 0;
}
catch (const std::exception& e)
{
cuda_core_write_err(err, err_cap, e.what());
return 1;
}
catch (...)
{
cuda_core_write_err(err, err_cap, "unknown error while building TMA descriptor");
return 1;
}
#endif
}
45 changes: 45 additions & 0 deletions cuda_core/cuda/core/_cpp/tensor_map_cccl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_
#define CUDA_CORE_TENSOR_MAP_CCCL_H_

#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
extern "C" {
#else
#include <stddef.h>
#include <stdint.h>
#endif

// Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from <cuda/tma>).
//
// Returns 0 on success; on failure returns non-zero and writes a best-effort
// human-readable message into (err, err_cap) if provided.
int cuda_core_cccl_make_tma_descriptor_tiled(
void* out_tensor_map,
void* data,
int device_type,
int device_id,
int ndim,
const int64_t* shape, // length ndim
const int64_t* strides, // length ndim, or NULL for contiguous
uint8_t dtype_code,
uint8_t dtype_bits,
uint16_t dtype_lanes,
const int* box_sizes, // length ndim
const int* elem_strides, // length ndim, or NULL for all-ones overload
int interleave_layout,
int swizzle,
int l2_fetch_size,
int oob_fill,
char* err,
size_t err_cap) noexcept;

#ifdef __cplusplus
} // extern "C"
#endif

#endif // CUDA_CORE_TENSOR_MAP_CCCL_H_
19 changes: 19 additions & 0 deletions cuda_core/cuda/core/_kernel_arg_handler.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import ctypes
import numpy

from cuda.core._memory import Buffer
from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py
from cuda.core._tensor_map cimport TensorMapDescriptor
from cuda.core._utils.cuda_utils import driver
from cuda.bindings cimport cydriver

Expand Down Expand Up @@ -97,6 +99,9 @@ cdef object numpy_complex64 = numpy.complex64
cdef object numpy_complex128 = numpy.complex128


cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py


# limitation due to cython/cython#534
ctypedef void* voidptr

Expand Down Expand Up @@ -124,6 +129,17 @@ cdef inline int prepare_arg(
return 0


cdef inline int prepare_tensor_map_arg(
vector.vector[void*]& data,
vector.vector[void*]& data_addresses,
TensorMapDescriptor arg,
const size_t idx) except -1:
# cuLaunchKernel copies argument bytes during launch, so a TensorMap
# descriptor can point directly at its internal CUtensorMap storage.
data_addresses[idx] = arg._get_data_ptr()
return 0


cdef inline int prepare_ctypes_arg(
vector.vector[void*]& data,
vector.vector[void*]& data_addresses,
Expand Down Expand Up @@ -290,6 +306,9 @@ cdef class ParamHolder:
elif arg_type is complex:
prepare_arg[cpp_double_complex](self.data, self.data_addresses, arg, i)
continue
elif arg_type is tensor_map_descriptor_type:
prepare_tensor_map_arg(self.data, self.data_addresses, <TensorMapDescriptor>arg, i)
continue

not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i)
if not_prepared:
Expand Down
28 changes: 28 additions & 0 deletions cuda_core/cuda/core/_memoryview.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from libc.stdint cimport intptr_t

from cuda.core._dlpack cimport DLTensor
from cuda.core._layout cimport _StridedLayout


cdef class StridedMemoryView:
cdef readonly:
intptr_t ptr
int device_id
bint is_device_accessible
bint readonly
object exporting_obj

cdef:
object metadata
DLTensor* dl_tensor
_StridedLayout _layout
object _buffer
object _dtype

cdef inline _StridedLayout get_layout(self)
cdef inline object get_buffer(self)
cdef inline object get_dtype(self)
67 changes: 38 additions & 29 deletions cuda_core/cuda/core/_memoryview.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -107,35 +107,6 @@ cdef class StridedMemoryView:
it will be the Buffer instance passed to the method.

"""
cdef readonly:
intptr_t ptr
int device_id
bint is_device_accessible
bint readonly
object exporting_obj

cdef:
# If using dlpack, this is a strong reference to the result of
# obj.__dlpack__() so we can lazily create shape and strides from
# it later. If using CAI, this is a reference to the source
# `__cuda_array_interface__` object.
object metadata

# The tensor object if has obj has __dlpack__, otherwise must be NULL
DLTensor *dl_tensor

# Memoized properties
# Either lazily inferred from dl_tensor/metadata,
# or explicitly provided if created with from_buffer().
_StridedLayout _layout
# Either exporting_obj if it is a Buffer, otherwise a Buffer instance
# with owner set to the exporting object.
object _buffer
# Either lazily inferred from dl_tensor/metadata,
# or explicitly provided if created with from_buffer().
# In the latter case, it can be None.
object _dtype

def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None:
cdef str clsname = self.__class__.__name__
if obj is not None:
Expand Down Expand Up @@ -316,6 +287,44 @@ cdef class StridedMemoryView:
view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly)
return view

def as_tensor_map(
self,
box_dim=None,
*,
options=None,
element_strides=None,
data_type=None,
interleave=None,
swizzle=None,
l2_promotion=None,
oob_fill=None,
):
"""Create a tiled :obj:`TensorMapDescriptor` from this view.

This is the public entry point for creating tiled tensor map
descriptors in ``cuda.core``. Pass either ``box_dim`` and the
individual keyword arguments directly, or provide bundled tiled
options via ``options=``.
"""
from cuda.core._tensor_map import TensorMapDescriptor

kwargs = {}
if options is not None:
kwargs["options"] = options
if element_strides is not None:
kwargs["element_strides"] = element_strides
if data_type is not None:
kwargs["data_type"] = data_type
if interleave is not None:
kwargs["interleave"] = interleave
if swizzle is not None:
kwargs["swizzle"] = swizzle
if l2_promotion is not None:
kwargs["l2_promotion"] = l2_promotion
if oob_fill is not None:
kwargs["oob_fill"] = oob_fill
return TensorMapDescriptor._from_tiled(self, box_dim, **kwargs)

def copy_from(
self, other : StridedMemoryView, stream : Stream,
allocator = None,
Expand Down
19 changes: 19 additions & 0 deletions cuda_core/cuda/core/_tensor_map.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.bindings cimport cydriver
from libc.stdint cimport intptr_t
from cuda.core._memoryview cimport StridedMemoryView


cdef class TensorMapDescriptor:
cdef cydriver.CUtensorMap _tensor_map
cdef int _device_id
cdef intptr_t _context
cdef object _source_ref
cdef StridedMemoryView _view_ref
cdef object _repr_info

cdef int _check_context_compat(self) except -1
cdef void* _get_data_ptr(self)
Loading
Loading