diff --git a/test/samples/TPushTPop/test6/kernel.pto b/test/samples/TPushTPop/test6/kernel.pto
new file mode 100644
index 000000000..8c261bec1
--- /dev/null
+++ b/test/samples/TPushTPop/test6/kernel.pto
@@ -0,0 +1,98 @@
+// RUN: ptoas --pto-arch a5 %s
+//
+// Cube (AIC) kernel computes four 16x16 matmul result tiles and pushes each
+// through the reserved "c2v_fifo" buffer; the vector (AIV) kernel pops the
+// four tiles and prints them (tpush_to_aiv / tpop_from_aic in an scf.for loop).
+module {
+  // Entry point: runs the cube producer, then the vector consumer.
+  func.func @matmul_tpush_tpop_loop4_print(
+      %gm_a: !pto.ptr,
+      %gm_b_all: !pto.ptr)
+      attributes {pto.entry} {
+    func.call @matmul_tpush_tpop_loop4_print_cube(%gm_a, %gm_b_all)
+        : (!pto.ptr, !pto.ptr) -> ()
+    func.call @matmul_tpush_tpop_loop4_print_vector() : () -> ()
+    return
+  }
+
+  // Producer: loads A once; each of the 4 iterations loads the i-th 16x16
+  // slice of B, multiplies into %acc_tile, sets the accumulator's valid
+  // shape to 8x16, and pushes it into the c2v FIFO.
+  func.func private @matmul_tpush_tpop_loop4_print_cube(
+      %gm_a: !pto.ptr,
+      %gm_b_all: !pto.ptr)
+      attributes {pto.kernel_kind = #pto.kernel_kind} {
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+
+    // Import the FIFO buffer reserved by the peer vector kernel.
+    %c2v_import = pto.import_reserved_buffer {
+      name = "c2v_fifo",
+      peer_func = @matmul_tpush_tpop_loop4_print_vector
+    } -> i32
+    pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (c2v_consumer_buf = %c2v_import : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %mat_a_tile = pto.alloc_tile : !pto.tile_buf
+    %mat_b_tile = pto.alloc_tile : !pto.tile_buf
+    %left_tile = pto.alloc_tile : !pto.tile_buf
+    %right_tile = pto.alloc_tile : !pto.tile_buf
+    %c8 = arith.constant 8 : index
+    %acc_tile = pto.alloc_tile valid_row = %c16 valid_col = %c16
+      : !pto.tile_buf
+
+    %gm_a_view = pto.make_tensor_view %gm_a, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view
+    %gm_b_all_view = pto.make_tensor_view %gm_b_all, shape = [%c64, %c16], strides = [%c16, %c1] : !pto.tensor_view
+    %gm_a_tile_view = pto.partition_view %gm_a_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+
+    pto.tload ins(%gm_a_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%mat_a_tile : !pto.tile_buf)
+    pto.tmov ins(%mat_a_tile : !pto.tile_buf) outs(%left_tile : !pto.tile_buf)
+
+    scf.for %i = %c0 to %c4 step %c1 {
+      %row_offset = arith.muli %i, %c16 : index
+      %gm_b_iter = pto.partition_view %gm_b_all_view, offsets = [%row_offset, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+
+      pto.tload ins(%gm_b_iter : !pto.partition_tensor_view<16x16xf32>) outs(%mat_b_tile : !pto.tile_buf)
+      pto.tmov ins(%mat_b_tile : !pto.tile_buf) outs(%right_tile : !pto.tile_buf)
+      pto.tmatmul ins(%left_tile, %right_tile : !pto.tile_buf, !pto.tile_buf) outs(%acc_tile : !pto.tile_buf)
+      pto.set_validshape %acc_tile, %c8, %c16
+        : !pto.tile_buf
+      pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 1}
+    }
+    return
+  }
+
+  // Consumer: reserves the c2v FIFO, then pops each of the 4 tiles pushed
+  // by the cube kernel, copies it into a local tile, and prints it.
+  func.func private @matmul_tpush_tpop_loop4_print_vector()
+      attributes {pto.kernel_kind = #pto.kernel_kind} {
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c2v_local = pto.reserve_buffer {
+      name = "c2v_fifo",
+      size = 8192,
+      location = #pto.address_space,
+      auto = true
+    } -> i32
+    pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (c2v_consumer_buf = %c2v_local : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %vec_print = pto.alloc_tile : !pto.tile_buf
+    scf.for %i = %c0 to %c4 step %c1 {
+      %fifo_tile = pto.tpop_from_aic {split = 1}
+        -> !pto.tile_buf
+      pto.tmov ins(%fifo_tile : !pto.tile_buf) outs(%vec_print : !pto.tile_buf)
+      pto.tprint ins(%vec_print : !pto.tile_buf)
+      pto.tfree_from_aic {split = 1}
+    }
+    return
+  }
+}