Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions test/samples/TPushTPop/a3/test3/kernel.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// RUN: ptoas --pto-arch a3 %s
module {
// Test entry point (marked pto.entry): runs the cube-side producer kernel,
// then the vector-side consumer kernel. Both are handed %gm_slot_buffer,
// which backs the cube->vector ("c2v") FIFO slot storage used to pass tiles
// between the two kernels.
func.func @matmul_tpush_tpop_loop4_validshape_print(
// %gm_a:           A-matrix data in global memory.
// %gm_b_all:       all four B-matrix tiles in global memory (64x16, see cube kernel).
// %gm_slot_buffer: global-memory backing store for the c2v FIFO slots.
%gm_a: !pto.ptr<f32>,
%gm_b_all: !pto.ptr<f32>,
%gm_slot_buffer: !pto.ptr<f32>)
attributes {pto.entry} {
// Producer first: computes and pushes tiles into the FIFO.
func.call @matmul_tpush_tpop_loop4_validshape_print_cube(%gm_a, %gm_b_all,
%gm_slot_buffer)
: (!pto.ptr<f32>, !pto.ptr<f32>, !pto.ptr<f32>) -> ()
// Consumer second: pops the pushed tiles and prints them.
func.call @matmul_tpush_tpop_loop4_validshape_print_vector(%gm_slot_buffer)
: (!pto.ptr<f32>) -> ()
return
}

// Cube-side producer kernel (pto.kernel_kind<cube>).
// Loads one 16x16 A tile, then loops 4 times: loads the i-th 16x16 slice of
// B from %gm_b_all, multiplies A*B into the accumulator tile, restricts the
// accumulator's valid shape to 8x16, and pushes it to the vector kernel via
// the c2v FIFO.
func.func private @matmul_tpush_tpop_loop4_validshape_print_cube(
%gm_a: !pto.ptr<f32>,
%gm_b_all: !pto.ptr<f32>,
%gm_slot_buffer: !pto.ptr<f32>)
attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index

// Import the consumer-side reserved buffer named "c2v_fifo" declared by the
// peer vector function, so both ends of the pipe refer to the same FIFO.
%c2v_import = pto.import_reserved_buffer {
name = "c2v_fifo",
peer_func = @matmul_tpush_tpop_loop4_validshape_print_vector
} -> i32
// Initialize the AIC (cube) end of the pipe; slots live in %gm_slot_buffer.
// v2c direction is unused here (consumer buf = 0).
pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024}
(gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
c2v_consumer_buf = %c2v_import : i32,
v2c_consumer_buf = %c0_i32 : i32)

// Staging tiles (loc=mat) for data loaded from GM, plus the left/right
// matmul operand tiles and the accumulator.
%mat_a_tile = pto.alloc_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%mat_b_tile = pto.alloc_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%left_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>
%right_tile = pto.alloc_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>
// Accumulator with dynamic valid shape (v_row=?, v_col=?), initially 16x16.
%acc_tile = pto.alloc_tile valid_row = %c16 valid_col = %c16
: !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=?, v_col=?, blayout=col_major, slayout=row_major, fractal=1024, pad=0>

// Views over GM: A is 16x16; B holds 4 stacked 16x16 tiles (64x16).
%gm_a_view = pto.make_tensor_view %gm_a, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
%gm_b_all_view = pto.make_tensor_view %gm_b_all, shape = [%c64, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
%gm_a_tile_view = pto.partition_view %gm_a_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>

// A is loop-invariant: load it once and stage it into the left operand.
pto.tload ins(%gm_a_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%mat_a_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
pto.tmov ins(%mat_a_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%left_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>)

scf.for %i = %c0 to %c4 step %c1 {
// Select the i-th 16-row slice of B (row offset = i * 16).
%row_offset = arith.muli %i, %c16 : index
%gm_b_iter = pto.partition_view %gm_b_all_view, offsets = [%row_offset, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>

pto.tload ins(%gm_b_iter : !pto.partition_tensor_view<16x16xf32>) outs(%mat_b_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
pto.tmov ins(%mat_b_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%right_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
pto.tmatmul ins(%left_tile, %right_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=?, v_col=?, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
// Restrict the accumulator's valid region to 8x16 before pushing.
// NOTE(review): if set_validshape mutates the tile handle in place, the
// first tmatmul sees a 16x16 valid shape but iterations 1..3 see 8x16 —
// confirm this iteration-to-iteration inconsistency is intended by the
// test, or hoist/reset the valid shape accordingly.
pto.set_validshape %acc_tile, %c8, %c16
: !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=?, v_col=?, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
Comment on lines +59 to +60
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The placement of pto.set_validshape inside the loop causes an inconsistency between iterations. %acc_tile is initialized with a 16x16 valid shape (line 42). In the first iteration, pto.tmatmul (line 58) operates on a 16x16 region. However, set_validshape updates the tile handle metadata in-place to 8x16, meaning all subsequent tmatmul calls in the loop will see and potentially respect the restricted 8x16 valid shape. If tmatmul is intended to always compute a full 16x16 result, consider moving set_validshape before the loop or resetting the shape after the push.

// Hand the (8x16-valid) accumulator to the vector kernel via the FIFO.
// NOTE(review): pushed with split = 0 while the consumer pops 4x16 tiles
// with split = 0 — verify the producer/consumer shape contract.
pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=?, v_col=?, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
}
return
}

// Vector-side consumer kernel (pto.kernel_kind<vector>).
// Declares the "c2v_fifo" buffer (imported by the cube peer), initializes the
// AIV end of the pipe, then loops 4 times: pop a 4x16 tile from the FIFO,
// copy it to a local tile, print it, and free the FIFO slot.
func.func private @matmul_tpush_tpop_loop4_validshape_print_vector(%gm_slot_buffer: !pto.ptr<f32>)
attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
// Reserve the consumer-side FIFO storage; the cube kernel imports this by
// name via pto.import_reserved_buffer.
%c2v_local = pto.reserve_buffer {
name = "c2v_fifo",
size = 8192,
location = #pto.address_space<vec>,
auto = true
} -> i32
// Initialize the AIV (vector) end of the pipe; must mirror the AIC side's
// dir_mask/slot_size configuration.
pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024}
(gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
c2v_consumer_buf = %c2v_local : i32,
v2c_consumer_buf = %c0_i32 : i32)

// Local staging tile that received FIFO data is copied into before printing.
// NOTE(review): this staging copy may be redundant — %fifo_tile already has
// the same loc/shape/type and could be printed directly; confirm whether
// tprint is allowed to read a FIFO-owned tile before tfree.
%vec_print = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=16, v_row=4, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// NOTE(review): the producer pushes 4 tiles with an 8x16 valid region while
// this loop pops 4 tiles of 4x16 with split = 0 and frees the slot every
// iteration — verify the loop count, split attribute, and per-pop tile
// shape against the producer, or data will be dropped/mismatched.
scf.for %i = %c0 to %c4 step %c1 {
// Blocking pop of the next tile pushed by the cube kernel.
%fifo_tile = pto.tpop_from_aic {split = 0}
-> !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=16, v_row=4, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tmov ins(%fifo_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=16, v_row=4, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%vec_print : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=16, v_row=4, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tprint ins(%vec_print : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=16, v_row=4, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
Comment on lines +87 to +88
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The pto.tmov operation is redundant. %fifo_tile is already a tile buffer in the vec location with the required shape and type. You can pass %fifo_tile directly to pto.tprint and remove this move, as well as the unnecessary %vec_print allocation at line 83.

      pto.tprint ins(%fifo_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=4, cols=16, v_row=4, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)

// Release the FIFO slot so the producer can reuse it.
pto.tfree_from_aic {split = 0}
}
Comment on lines +84 to +90
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

There is a significant data mismatch and potential logic error in the vector kernel loop:

  1. Loop Count Mismatch: The cube kernel pushes 4 tiles of shape 8x16 (total 32 rows). The vector kernel loop only runs 4 times (%c4) and pops 4x16 tiles (total 16 rows). To consume all pushed data, the vector loop should run 8 times (%c8).
  2. Shape and Split Mismatch: The cube pushes 8x16 tiles with {split = 0}, while the vector pops 4x16 tiles with {split = 0}. Typically, {split = 0} (no split) requires exact shape matching between the producer and consumer. If splitting is intended, the split attribute should likely be non-zero (e.g., split = 1 for height split) to allow popping sub-tiles from a larger pushed slot.
  3. Premature Slot Release: pto.tfree_from_aic is called in every iteration. If one push (8x16) is intended to be consumed by two pops (4x16), calling tfree after the first pop will release the FIFO slot and discard the remaining data.

return
}
}
Loading