diff --git a/test/samples/TPushTPop/test6/kernel.pto b/test/samples/TPushTPop/test6/kernel.pto
new file mode 100644
index 000000000..8c261bec1
--- /dev/null
+++ b/test/samples/TPushTPop/test6/kernel.pto
@@ -0,0 +1,98 @@
+// RUN: ptoas --pto-arch a5 %s
+//
+// Cube (AIC) kernel computes four 16x16 matmul result tiles and pushes each
+// through the reserved "c2v_fifo" buffer; the vector (AIV) kernel pops the
+// four tiles and prints them (tpush_to_aiv / tpop_from_aic in an scf.for loop).
+module {
+  // Entry point: runs the cube producer, then the vector consumer.
+  func.func @matmul_tpush_tpop_loop4_print(
+      %gm_a: !pto.ptr,
+      %gm_b_all: !pto.ptr)
+      attributes {pto.entry} {
+    func.call @matmul_tpush_tpop_loop4_print_cube(%gm_a, %gm_b_all)
+        : (!pto.ptr, !pto.ptr) -> ()
+    func.call @matmul_tpush_tpop_loop4_print_vector() : () -> ()
+    return
+  }
+
+  // Producer: loads A once; each of the 4 iterations loads the i-th 16x16
+  // slice of B, multiplies into %acc_tile, sets the accumulator's valid
+  // shape to 8x16, and pushes it into the c2v FIFO.
+  func.func private @matmul_tpush_tpop_loop4_print_cube(
+      %gm_a: !pto.ptr,
+      %gm_b_all: !pto.ptr)
+      attributes {pto.kernel_kind = #pto.kernel_kind} {
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+
+    // Import the FIFO buffer reserved by the peer vector kernel.
+    %c2v_import = pto.import_reserved_buffer {
+      name = "c2v_fifo",
+      peer_func = @matmul_tpush_tpop_loop4_print_vector
+    } -> i32
+    pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (c2v_consumer_buf = %c2v_import : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %mat_a_tile = pto.alloc_tile : !pto.tile_buf
+    %mat_b_tile = pto.alloc_tile : !pto.tile_buf
+    %left_tile = pto.alloc_tile : !pto.tile_buf
+    %right_tile = pto.alloc_tile : !pto.tile_buf
+    %c8 = arith.constant 8 : index
+    %acc_tile = pto.alloc_tile valid_row = %c16 valid_col = %c16
+      : !pto.tile_buf
+
+    %gm_a_view = pto.make_tensor_view %gm_a, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view
+    %gm_b_all_view = pto.make_tensor_view %gm_b_all, shape = [%c64, %c16], strides = [%c16, %c1] : !pto.tensor_view
+    %gm_a_tile_view = pto.partition_view %gm_a_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+
+    pto.tload ins(%gm_a_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%mat_a_tile : !pto.tile_buf)
+    pto.tmov ins(%mat_a_tile : !pto.tile_buf) outs(%left_tile : !pto.tile_buf)
+
+    scf.for %i = %c0 to %c4 step %c1 {
+      %row_offset = arith.muli %i, %c16 : index
+      %gm_b_iter = pto.partition_view %gm_b_all_view, offsets = [%row_offset, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+
+      pto.tload ins(%gm_b_iter : !pto.partition_tensor_view<16x16xf32>) outs(%mat_b_tile : !pto.tile_buf)
+      pto.tmov ins(%mat_b_tile : !pto.tile_buf) outs(%right_tile : !pto.tile_buf)
+      pto.tmatmul ins(%left_tile, %right_tile : !pto.tile_buf, !pto.tile_buf) outs(%acc_tile : !pto.tile_buf)
+      pto.set_validshape %acc_tile, %c8, %c16
+        : !pto.tile_buf
+      pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 1}
+    }
+    return
+  }
+
+  // Consumer: reserves the c2v FIFO, then pops each of the 4 tiles pushed
+  // by the cube kernel, copies it into a local tile, and prints it.
+  func.func private @matmul_tpush_tpop_loop4_print_vector()
+      attributes {pto.kernel_kind = #pto.kernel_kind} {
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c2v_local = pto.reserve_buffer {
+      name = "c2v_fifo",
+      size = 8192,
+      location = #pto.address_space,
+      auto = true
+    } -> i32
+    pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (c2v_consumer_buf = %c2v_local : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %vec_print = pto.alloc_tile : !pto.tile_buf
+    scf.for %i = %c0 to %c4 step %c1 {
+      %fifo_tile = pto.tpop_from_aic {split = 1}
+        -> !pto.tile_buf
+      pto.tmov ins(%fifo_tile : !pto.tile_buf) outs(%vec_print : !pto.tile_buf)
+      pto.tprint ins(%vec_print : !pto.tile_buf)
+      pto.tfree_from_aic {split = 1}
+    }
+    return
+  }
+}