diff --git a/include/aster/Dialect/AMDGCN/Analysis/ReachingDefinitions.h b/include/aster/Dialect/AMDGCN/Analysis/ReachingDefinitions.h index 817749089..8f80d562a 100644 --- a/include/aster/Dialect/AMDGCN/Analysis/ReachingDefinitions.h +++ b/include/aster/Dialect/AMDGCN/Analysis/ReachingDefinitions.h @@ -179,7 +179,7 @@ class ReachingDefinitionsAnalysis friend class ::mlir::DataFlowSolver; ReachingDefinitionsAnalysis( DataFlowSolver &solver, - llvm::function_ref definitionFilter, + llvm::function_ref definitionFilter = {}, llvm::function_ref killCallback = {}) : Base(solver), definitionFilter(definitionFilter), diff --git a/include/aster/Dialect/AMDGCN/IR/AMDGCNAttrs.td b/include/aster/Dialect/AMDGCN/IR/AMDGCNAttrs.td index c8219f0a8..43543adc4 100644 --- a/include/aster/Dialect/AMDGCN/IR/AMDGCNAttrs.td +++ b/include/aster/Dialect/AMDGCN/IR/AMDGCNAttrs.td @@ -73,6 +73,20 @@ def ValueSchedulerAttr : AMDGCN_Attr<"ValueScheduler", "value_scheduler", [ }]; } +def RegisterSchedulerAttr : AMDGCN_Attr<"RegisterScheduler", "register_scheduler", [ + DeclareAttrInterfaceMethods + ]> { + let summary = "Register-aware scheduling graph builder for AMDGCN instructions"; + let description = [{ + Like the value scheduler, builds SSA edges, wait and barrier edges, and i1 + serialization. Additionally, after non-SSA edges, adds dependencies from + reaching definitions for register operands on `InstOpInterface` ins and outs. + + Requires IR in post-ToRegisterSemantics DPS normal form (no value-semantic + `outs` on instructions), the same precondition as `ReachingDefinitionsAnalysis`. 
+ }]; +} + def InstPropLabelerAttr : AMDGCN_Attr<"InstPropLabeler", "inst_prop_labeler", [ DeclareAttrInterfaceMethods ]> { diff --git a/include/aster/Dialect/AMDGCN/IR/AMDGCNOps.td b/include/aster/Dialect/AMDGCN/IR/AMDGCNOps.td index b769d5de3..cd515c50c 100644 --- a/include/aster/Dialect/AMDGCN/IR/AMDGCNOps.td +++ b/include/aster/Dialect/AMDGCN/IR/AMDGCNOps.td @@ -693,7 +693,6 @@ def AMDGCN_TestInstOp : AMDGCN_Op<"test_inst", [ /// Get the opcode of the instruction. InstAttr getOpcodeAttr() { - assert(false && "not yet implemented"); return InstAttr(); } /// Get the instruction output operands. diff --git a/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td b/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td index dee4d1363..e2e29ce8e 100644 --- a/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td +++ b/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td @@ -423,6 +423,8 @@ def LowLevelScheduler : Pass<"amdgcn-low-level-scheduler", let options = [ Option<"debugStalls", "debug-stalls", "bool", "false", "Annotate each op with sched.stall_cycles and sched.stall_reason">, + Option<"registerSemantics", "register-semantics", "bool", "false", + "Use register semantics for scheduling dependencies">, ]; } diff --git a/lib/Dialect/AMDGCN/IR/SchedAttrs.cpp b/lib/Dialect/AMDGCN/IR/SchedAttrs.cpp index 71e88452f..09dac4db0 100644 --- a/lib/Dialect/AMDGCN/IR/SchedAttrs.cpp +++ b/lib/Dialect/AMDGCN/IR/SchedAttrs.cpp @@ -8,8 +8,11 @@ // //===----------------------------------------------------------------------===// +#include "aster/Dialect/AMDGCN/Analysis/ReachingDefinitions.h" #include "aster/Dialect/AMDGCN/Analysis/WaitAnalysis.h" #include "aster/Dialect/AMDGCN/IR/AMDGCNOps.h" +#include "aster/Dialect/AMDGCN/IR/AMDGCNTypes.h" +#include "aster/Dialect/LSIR/IR/LSIROps.h" #include "aster/Interfaces/SchedInterfaces.h" #include "mlir/Analysis/DataFlowFramework.h" #include "llvm/ADT/STLExtras.h" @@ -29,8 +32,9 @@ using namespace mlir::aster::amdgcn; namespace { 
struct GraphBuilder { - GraphBuilder(Block *block, const DataFlowSolver &solver) - : block(block), solver(const_cast(solver)) { + GraphBuilder(Block *block, const DataFlowSolver &solver, bool useRegisterDeps) + : block(block), solver(const_cast(solver)), + useRegisterDeps(useRegisterDeps) { assert(block && "expected a valid block"); } @@ -44,6 +48,9 @@ struct GraphBuilder { /// Build the non-SSA dependencies for the graph. void buildNonSSADeps(SchedGraph &graph); + /// Add edges from reaching definitions for DPS register ins/outs. + void buildRegisterDeps(SchedGraph &graph); + /// Handle a wait operation. void handleWaitOp(SchedGraph &graph, int64_t pos, WaitOp wait); @@ -57,6 +64,7 @@ struct GraphBuilder { Block *block; SmallVector syncPoints; DataFlowSolver &solver; + bool useRegisterDeps; }; } // namespace @@ -71,6 +79,8 @@ ValueSchedulerAttr::initializeAnalyses(SchedAnalysis &analysis) const { LogicalResult GraphBuilder::run(SchedGraph &graph) { buildSSADeps(graph); buildNonSSADeps(graph); + if (useRegisterDeps) + buildRegisterDeps(graph); addI1SerializationEdges(graph); return success(); } @@ -86,9 +96,27 @@ void GraphBuilder::buildSSADeps(SchedGraph &graph) { bool hasEffects = op->hasTrait() || op->hasTrait(); + bool isPureOp = mlir::isPure(op); + + // If we're using register dependencies, then treat effects on register + // resources as non-effects. + if (useRegisterDeps && isa(op) && + !isa(op)) { + auto eOp = dyn_cast(op); + SmallVector> effects; + if (eOp) { + eOp.getEffects(effects); + isPureOp = llvm::all_of(effects, [](const SideEffects::EffectInstance< + MemoryEffects::Effect> &effect) { + return isa( + effect.getResource()); + }); + } + } + // If the operation has no side-effect we need to treat it as a possible // sync point. Same for non-pure operations. 
- if ((!hasEffects || !mlir::isPure(op)) && + if ((!hasEffects || !isPureOp) && !isa(op)) { LDBG() << "Adding sync point: " << i; syncPoints.push_back(i); @@ -144,6 +172,52 @@ void GraphBuilder::buildNonSSADeps(SchedGraph &graph) { } } +void GraphBuilder::buildRegisterDeps(SchedGraph &graph) { + // Helper function to add edges from reaching definitions for register + // operands. + auto addEdges = [&](Operation *op, int64_t opId, ValueRange values, + const ReachingDefinitionsState *beforeState) { + for (Value value : values) { + FailureOr allocasOrFailure = getAllocasOrFailure(value); + assert(succeeded(allocasOrFailure) && "expected valid allocas"); + for (Value alloc : *allocasOrFailure) { + for (const Definition &def : beforeState->getRange(alloc)) { + assert(def.definition && "expected valid definition"); + Operation *producer = def.definition->getOwner(); + int64_t pOpId = graph.getOpId(producer); + if (pOpId >= 0 && pOpId < opId) + graph.addEdge(producer, op); + } + } + } + }; + for (auto [i, op] : llvm::enumerate(graph.getOps())) { + auto instOp = dyn_cast(op); + // Skip non-InstOpInterface operations. + if (!instOp) + continue; + + const auto *beforeState = solver.lookupState( + solver.getProgramPointBefore(op)); + assert(beforeState && "expected valid reaching definitions state"); + ValueRange outs = instOp.getInstOuts(); + ValueRange ins = instOp.getInstIns(); + addEdges(op, i, ins, beforeState); + // Make sure we never clobber. + addEdges(op, i, outs, beforeState); + for (Operation *pOp : graph.getOps().take_front(i)) { + ValueRange prevVals = pOp->getOperands(); + if (llvm::any_of(prevVals, [&](Value val) { + return llvm::is_contained(outs, val); + })) + graph.addEdge(pOp, op); + if (llvm::any_of(prevVals, + [&](Value val) { return llvm::is_contained(ins, val); })) + graph.addEdge(pOp, op); + } + } +} + void GraphBuilder::handleWaitOp(SchedGraph &graph, int64_t pos, WaitOp wait) { // Get the wait state. 
const WaitState *state = @@ -305,7 +379,30 @@ FailureOr ValueSchedulerAttr::createGraph(Block *block, const SchedAnalysis &analysis) const { SchedGraph graph(block); - GraphBuilder builder(block, analysis.getSolver()); + GraphBuilder builder(block, analysis.getSolver(), /*useRegisterDeps=*/false); + if (failed(builder.run(graph))) + return failure(); + graph.compress(); + return graph; +} + +//===----------------------------------------------------------------------===// +// RegisterSchedulerAttr - SchedGraphAttrInterface +//===----------------------------------------------------------------------===// + +LogicalResult +RegisterSchedulerAttr::initializeAnalyses(SchedAnalysis &analysis) const { + analysis.getSolver().load(analysis.getDomInfo()); + analysis.getSolver().load(); + analysis.setRunDataflowAnalyses(); + return success(); +} + +FailureOr +RegisterSchedulerAttr::createGraph(Block *block, + const SchedAnalysis &analysis) const { + SchedGraph graph(block); + GraphBuilder builder(block, analysis.getSolver(), /*useRegisterDeps=*/true); if (failed(builder.run(graph))) return failure(); graph.compress(); diff --git a/lib/Dialect/AMDGCN/Transforms/LowLevelScheduler.cpp b/lib/Dialect/AMDGCN/Transforms/LowLevelScheduler.cpp index 7ff314f2d..0162ce27b 100644 --- a/lib/Dialect/AMDGCN/Transforms/LowLevelScheduler.cpp +++ b/lib/Dialect/AMDGCN/Transforms/LowLevelScheduler.cpp @@ -47,8 +47,11 @@ struct LowLevelSchedulerPass /*emitDiagnostics=*/true))) return signalPassFailure(); + SchedGraphAttrInterface builderAttr = ValueSchedulerAttr::get(ctx); + if (registerSemantics) + builderAttr = RegisterSchedulerAttr::get(ctx); GenericSchedulerAttr compositeAttr = GenericSchedulerAttr::get( - ctx, ValueSchedulerAttr::get(ctx), + ctx, builderAttr, SchedListLabelerAttr::get(ctx, ArrayRef{}), LowLevelSchedulerAttr::get(ctx, debugStalls)); diff --git a/lib/Dialect/AMDGCN/Transforms/Pipelines.cpp b/lib/Dialect/AMDGCN/Transforms/Pipelines.cpp index f9351956e..1c76f7734 100644 --- 
a/lib/Dialect/AMDGCN/Transforms/Pipelines.cpp +++ b/lib/Dialect/AMDGCN/Transforms/Pipelines.cpp @@ -73,14 +73,10 @@ static void buildRegAllocPassPipeline(OpPassManager &pm, pm.addPass(createHoistIterArgWaits()); pm.addPass(createCanonicalizerPass()); } - if (options.llSched) - pm.addPass(createLowLevelScheduler()); pm.addPass(createAMDGCNBufferization()); - if (options.hoistIterArgWaits) { - pm.addPass(createHoistIterArgWaits()); - pm.addPass(createCanonicalizerPass()); - } pm.addPass(createToRegisterSemantics()); + if (options.llSched) + pm.addPass(createLowLevelScheduler(LowLevelSchedulerOptions{/*debugStalls=*/false, /*registerSemantics=*/true})); // Post-condition of to-register-semantics is now enforced by // KernelOp::verifyRegions() via the normal_forms attribute set by the pass. pm.addPass(createRegisterDCE()); diff --git a/test/Dialect/AMDGCN/Transforms/ll-sched-register.mlir b/test/Dialect/AMDGCN/Transforms/ll-sched-register.mlir new file mode 100644 index 000000000..7a3ecb80d --- /dev/null +++ b/test/Dialect/AMDGCN/Transforms/ll-sched-register.mlir @@ -0,0 +1,395 @@ +// RUN: aster-opt %s --pass-pipeline="builtin.module(amdgcn.kernel(amdgcn-low-level-scheduler{register-semantics=true}))" | FileCheck %s + +// CHECK-LABEL: amdgcn.kernel @i1_serialize_cmpi_select { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[CMPI_0:.*]] = lsir.cmpi i32 slt %[[VAL_0]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 +// CHECK: lsir.select %[[VAL_1]], %[[CMPI_0]], %[[VAL_2]], %[[VAL_0]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[CMPI_1:.*]] = lsir.cmpi i32 slt %[[VAL_2]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 +// CHECK: lsir.select %[[VAL_3]], %[[CMPI_1]], %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @i1_serialize_cmpi_select { + %c0_i32 =
arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = alloca : !amdgcn.vgpr + %4 = lsir.cmpi i32 slt %0, %c0_i32 : !amdgcn.vgpr, i32 + lsir.select %1, %4, %2, %0 : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + %5 = lsir.cmpi i32 slt %2, %c0_i32 : !amdgcn.vgpr, i32 + lsir.select %3, %5, %0, %2 : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @i1_serialize_three_chains { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[CMPI_0:.*]] = lsir.cmpi i32 slt %[[VAL_0]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 +// CHECK: lsir.select %[[VAL_1]], %[[CMPI_0]], %[[VAL_2]], %[[VAL_0]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[CMPI_1:.*]] = lsir.cmpi i32 slt %[[VAL_2]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 +// CHECK: lsir.select %[[VAL_3]], %[[CMPI_1]], %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[CMPI_2:.*]] = lsir.cmpi i32 slt %[[VAL_4]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 +// CHECK: lsir.select %[[VAL_5]], %[[CMPI_2]], %[[VAL_0]], %[[VAL_4]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @i1_serialize_three_chains { + %c0_i32 = arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = alloca : !amdgcn.vgpr + %4 = alloca : !amdgcn.vgpr + %5 = alloca : !amdgcn.vgpr + %6 = lsir.cmpi i32 slt %0, %c0_i32 : !amdgcn.vgpr, i32 + lsir.select %1, %6, %2, %0 : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + %7 = lsir.cmpi i32 slt %2, %c0_i32 : !amdgcn.vgpr, i32 + lsir.select %3, %7, %0, %2 : !amdgcn.vgpr, i1, 
!amdgcn.vgpr, !amdgcn.vgpr + %8 = lsir.cmpi i32 slt %4, %c0_i32 : !amdgcn.vgpr, i32 + lsir.select %5, %8, %0, %4 : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @i1_serialize_fanout { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[CMPI_0:.*]] = lsir.cmpi i32 slt %[[VAL_0]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 +// CHECK: lsir.select %[[VAL_1]], %[[CMPI_0]], %[[VAL_2]], %[[VAL_0]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: lsir.select %[[VAL_3]], %[[CMPI_0]], %[[VAL_4]], %[[VAL_0]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[CMPI_1:.*]] = lsir.cmpi i32 slt %[[VAL_2]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 +// CHECK: lsir.select %[[VAL_5]], %[[CMPI_1]], %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @i1_serialize_fanout { + %c0_i32 = arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = alloca : !amdgcn.vgpr + %4 = alloca : !amdgcn.vgpr + %5 = alloca : !amdgcn.vgpr + %6 = lsir.cmpi i32 slt %0, %c0_i32 : !amdgcn.vgpr, i32 + lsir.select %1, %6, %2, %0 : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + lsir.select %3, %6, %4, %0 : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + %7 = lsir.cmpi i32 slt %2, %c0_i32 : !amdgcn.vgpr, i32 + lsir.select %5, %7, %0, %2 : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @group_valu_salu { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i32 +// CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] 
= alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.sgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.sgpr +// CHECK: amdgcn.vop1.vop1 %[[VAL_0]], %[[VAL_1]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () +// CHECK: sop1 s_mov_b32 outs %[[VAL_4]] ins %[[CONSTANT_1]] : !amdgcn.sgpr, i32 +// CHECK: amdgcn.vop1.vop1 %[[VAL_2]], %[[VAL_3]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () +// CHECK: sop1 s_mov_b32 outs %[[VAL_5]] ins %[[CONSTANT_0]] : !amdgcn.sgpr, i32 +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @group_valu_salu { + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = alloca : !amdgcn.vgpr + %4 = alloca : !amdgcn.sgpr + %5 = alloca : !amdgcn.sgpr + amdgcn.vop1.vop1 %0, %1 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () + sop1 s_mov_b32 outs %4 ins %c0_i32 : !amdgcn.sgpr, i32 + amdgcn.vop1.vop1 %2, %3 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () + sop1 s_mov_b32 outs %5 ins %c1_i32 : !amdgcn.sgpr, i32 + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @respect_data_deps { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 42 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.sgpr +// CHECK: amdgcn.vop1.vop1 %[[VAL_0]], %[[VAL_2]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () +// CHECK: sop1 s_mov_b32 outs %[[VAL_3]] ins %[[CONSTANT_0]] : !amdgcn.sgpr, i32 +// CHECK: vop2 v_add_u32 outs %[[VAL_1]] ins %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @respect_data_deps { + %c42_i32 = arith.constant 42 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = alloca : !amdgcn.sgpr + amdgcn.vop1.vop1 %0, %2 : (!amdgcn.vgpr, !amdgcn.vgpr) 
-> () + sop1 s_mov_b32 outs %3 ins %c42_i32 : !amdgcn.sgpr, i32 + vop2 v_add_u32 outs %1 ins %0, %2 : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @vmem_addr_load_interleave { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 3072 : i32 +// CHECK: %[[CONSTANT_1:.*]] = arith.constant 2048 : i32 +// CHECK: %[[CONSTANT_2:.*]] = arith.constant 1024 : i32 +// CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.sgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.sgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_6:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_7:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_8:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_9:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_10:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_11:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_12:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_13:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_14:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_15:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_16:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_17:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_18:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_19:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_20:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_21:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_22:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_23:.*]] = make_register_range %[[VAL_1]], %[[VAL_2]] : !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: %[[VAL_24:.*]] = make_register_range %[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_25:.*]] = make_register_range %[[VAL_7]], %[[VAL_8]], %[[VAL_9]], %[[VAL_10]] : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// 
CHECK: %[[VAL_26:.*]] = make_register_range %[[VAL_11]], %[[VAL_12]], %[[VAL_13]], %[[VAL_14]] : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_27:.*]] = make_register_range %[[VAL_15]], %[[VAL_16]], %[[VAL_17]], %[[VAL_18]] : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: vop2 v_add_u32 outs %[[VAL_19]] ins %[[CONSTANT_3]], %[[VAL_0]] : !amdgcn.vgpr, i32, !amdgcn.vgpr +// CHECK: %[[VAL_28:.*]] = load global_load_dwordx4 dest %[[VAL_24]] addr %[[VAL_23]] offset d(%[[VAL_19]]) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token +// CHECK: vop2 v_add_u32 outs %[[VAL_20]] ins %[[CONSTANT_2]], %[[VAL_0]] : !amdgcn.vgpr, i32, !amdgcn.vgpr +// CHECK: %[[VAL_29:.*]] = load global_load_dwordx4 dest %[[VAL_25]] addr %[[VAL_23]] offset d(%[[VAL_20]]) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token +// CHECK: vop2 v_add_u32 outs %[[VAL_21]] ins %[[CONSTANT_1]], %[[VAL_0]] : !amdgcn.vgpr, i32, !amdgcn.vgpr +// CHECK: vop2 v_add_u32 outs %[[VAL_22]] ins %[[CONSTANT_0]], %[[VAL_0]] : !amdgcn.vgpr, i32, !amdgcn.vgpr +// CHECK: %[[VAL_30:.*]] = load global_load_dwordx4 dest %[[VAL_26]] addr %[[VAL_23]] offset d(%[[VAL_21]]) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token +// CHECK: %[[VAL_31:.*]] = load global_load_dwordx4 dest %[[VAL_27]] addr %[[VAL_23]] offset d(%[[VAL_22]]) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? 
+ 2]>, !amdgcn.vgpr) -> !amdgcn.read_token +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @vmem_addr_load_interleave { + %c3072_i32 = arith.constant 3072 : i32 + %c2048_i32 = arith.constant 2048 : i32 + %c1024_i32 = arith.constant 1024 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.sgpr + %2 = alloca : !amdgcn.sgpr + %3 = make_register_range %1, %2 : !amdgcn.sgpr, !amdgcn.sgpr + %4 = alloca : !amdgcn.vgpr + %5 = alloca : !amdgcn.vgpr + %6 = alloca : !amdgcn.vgpr + %7 = alloca : !amdgcn.vgpr + %8 = make_register_range %4, %5, %6, %7 : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + %9 = alloca : !amdgcn.vgpr + %10 = alloca : !amdgcn.vgpr + %11 = alloca : !amdgcn.vgpr + %12 = alloca : !amdgcn.vgpr + %13 = make_register_range %9, %10, %11, %12 : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + %14 = alloca : !amdgcn.vgpr + %15 = alloca : !amdgcn.vgpr + %16 = alloca : !amdgcn.vgpr + %17 = alloca : !amdgcn.vgpr + %18 = make_register_range %14, %15, %16, %17 : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + %19 = alloca : !amdgcn.vgpr + %20 = alloca : !amdgcn.vgpr + %21 = alloca : !amdgcn.vgpr + %22 = alloca : !amdgcn.vgpr + %23 = make_register_range %19, %20, %21, %22 : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + %24 = alloca : !amdgcn.vgpr + %25 = alloca : !amdgcn.vgpr + %26 = alloca : !amdgcn.vgpr + %27 = alloca : !amdgcn.vgpr + vop2 v_add_u32 outs %24 ins %c0_i32, %0 : !amdgcn.vgpr, i32, !amdgcn.vgpr + vop2 v_add_u32 outs %25 ins %c1024_i32, %0 : !amdgcn.vgpr, i32, !amdgcn.vgpr + vop2 v_add_u32 outs %26 ins %c2048_i32, %0 : !amdgcn.vgpr, i32, !amdgcn.vgpr + vop2 v_add_u32 outs %27 ins %c3072_i32, %0 : !amdgcn.vgpr, i32, !amdgcn.vgpr + %token = load global_load_dwordx4 dest %8 addr %3 offset d(%24) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? 
+ 2]>, !amdgcn.vgpr) -> !amdgcn.read_token + %token_0 = load global_load_dwordx4 dest %13 addr %3 offset d(%25) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token + %token_1 = load global_load_dwordx4 dest %18 addr %3 offset d(%26) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token + %token_2 = load global_load_dwordx4 dest %23 addr %3 offset d(%27) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @barrier_separates_lds { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 8 : i32 +// CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_6:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_7:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_8:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_9:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_10:.*]] = make_register_range %[[VAL_2]], %[[VAL_3]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_11:.*]] = make_register_range %[[VAL_4]], %[[VAL_5]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_12:.*]] = make_register_range %[[VAL_6]], %[[VAL_7]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_13:.*]] = make_register_range %[[VAL_8]], %[[VAL_9]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_14:.*]] = store ds_write_b64 data %[[VAL_10]] addr %[[VAL_0]] offset c(%[[CONSTANT_1]]) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token +// CHECK: %[[VAL_15:.*]] = store ds_write_b64 data %[[VAL_11]] addr %[[VAL_1]] offset c(%[[CONSTANT_1]]) : ins(!amdgcn.vgpr<[? : ? 
+ 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token +// CHECK: amdgcn.sopp.sopp +// CHECK: %[[VAL_16:.*]] = load ds_read_b64 dest %[[VAL_12]] addr %[[VAL_0]] offset c(%[[CONSTANT_0]]) : dps(!amdgcn.vgpr<[? : ? + 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token +// CHECK: %[[VAL_17:.*]] = load ds_read_b64 dest %[[VAL_13]] addr %[[VAL_1]] offset c(%[[CONSTANT_0]]) : dps(!amdgcn.vgpr<[? : ? + 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @barrier_separates_lds { + %c8_i32 = arith.constant 8 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = alloca : !amdgcn.vgpr + %4 = make_register_range %2, %3 : !amdgcn.vgpr, !amdgcn.vgpr + %5 = alloca : !amdgcn.vgpr + %6 = alloca : !amdgcn.vgpr + %7 = make_register_range %5, %6 : !amdgcn.vgpr, !amdgcn.vgpr + %8 = alloca : !amdgcn.vgpr + %9 = alloca : !amdgcn.vgpr + %10 = make_register_range %8, %9 : !amdgcn.vgpr, !amdgcn.vgpr + %11 = alloca : !amdgcn.vgpr + %12 = alloca : !amdgcn.vgpr + %13 = make_register_range %11, %12 : !amdgcn.vgpr, !amdgcn.vgpr + %14 = store ds_write_b64 data %4 addr %0 offset c(%c0_i32) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token + %15 = store ds_write_b64 data %7 addr %1 offset c(%c0_i32) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token + amdgcn.sopp.sopp + %token = load ds_read_b64 dest %10 addr %0 offset c(%c8_i32) : dps(!amdgcn.vgpr<[? : ? + 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token + %token_0 = load ds_read_b64 dest %13 addr %1 offset c(%c8_i32) : dps(!amdgcn.vgpr<[? : ? 
+ 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @lds_ops_ordered { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 8 : i32 +// CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_6:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_7:.*]] = make_register_range %[[VAL_1]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_8:.*]] = make_register_range %[[VAL_3]], %[[VAL_4]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_9:.*]] = make_register_range %[[VAL_5]], %[[VAL_6]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_10:.*]] = store ds_write_b64 data %[[VAL_7]] addr %[[VAL_0]] offset c(%[[CONSTANT_1]]) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token +// CHECK: %[[VAL_11:.*]] = load ds_read_b64 dest %[[VAL_9]] addr %[[VAL_0]] offset c(%[[CONSTANT_0]]) : dps(!amdgcn.vgpr<[? : ? + 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token +// CHECK: %[[VAL_12:.*]] = store ds_write_b64 data %[[VAL_8]] addr %[[VAL_0]] offset c(%[[CONSTANT_1]]) : ins(!amdgcn.vgpr<[? : ? 
+ 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @lds_ops_ordered { + %c8_i32 = arith.constant 8 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = make_register_range %1, %2 : !amdgcn.vgpr, !amdgcn.vgpr + %4 = alloca : !amdgcn.vgpr + %5 = alloca : !amdgcn.vgpr + %6 = make_register_range %4, %5 : !amdgcn.vgpr, !amdgcn.vgpr + %7 = alloca : !amdgcn.vgpr + %8 = alloca : !amdgcn.vgpr + %9 = make_register_range %7, %8 : !amdgcn.vgpr, !amdgcn.vgpr + %10 = store ds_write_b64 data %3 addr %0 offset c(%c0_i32) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token + %token = load ds_read_b64 dest %9 addr %0 offset c(%c8_i32) : dps(!amdgcn.vgpr<[? : ? + 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token + %11 = store ds_write_b64 data %6 addr %0 offset c(%c0_i32) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @lgkm_wait_gates_ds_read { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_6:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_7:.*]] = make_register_range %[[VAL_0]], %[[VAL_1]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_8:.*]] = make_register_range %[[VAL_6]], %[[VAL_4]] : !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: amdgcn.vop1.vop1 %[[VAL_3]], %[[VAL_4]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () +// CHECK: amdgcn.vop1.vop1 %[[VAL_6]], %[[VAL_3]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () +// CHECK: amdgcn.vop1.vop1 %[[VAL_4]], %[[VAL_3]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () +// CHECK: %[[VAL_9:.*]] = store ds_write_b64 data %[[VAL_8]] addr 
%[[VAL_5]] offset c(%[[CONSTANT_0]]) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token +// CHECK: wait deps %[[VAL_9]] : !amdgcn.write_token +// CHECK: %[[VAL_10:.*]] = load ds_read_b64 dest %[[VAL_7]] addr %[[VAL_2]] offset c(%[[CONSTANT_0]]) : dps(!amdgcn.vgpr<[? : ? + 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @lgkm_wait_gates_ds_read { + %c0_i32 = arith.constant 0 : i32 + %0 = alloca : !amdgcn.vgpr + %1 = alloca : !amdgcn.vgpr + %2 = alloca : !amdgcn.vgpr + %3 = alloca : !amdgcn.vgpr + %4 = alloca : !amdgcn.vgpr + %5 = alloca : !amdgcn.vgpr + amdgcn.vop1.vop1 %3, %4 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () + %6 = alloca : !amdgcn.vgpr + amdgcn.vop1.vop1 %6, %3 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () + amdgcn.vop1.vop1 %4, %3 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () + %7 = make_register_range %0, %1 : !amdgcn.vgpr, !amdgcn.vgpr + %8 = make_register_range %6, %4 : !amdgcn.vgpr, !amdgcn.vgpr + %9 = store ds_write_b64 data %8 addr %5 offset c(%c0_i32) : ins(!amdgcn.vgpr<[? : ? + 2]>, !amdgcn.vgpr, i32) -> !amdgcn.write_token + wait deps %9 : !amdgcn.write_token + %token = load ds_read_b64 dest %7 addr %2 offset c(%c0_i32) : dps(!amdgcn.vgpr<[? : ? 
+ 2]>) ins(!amdgcn.vgpr, i32) -> !amdgcn.read_token + end_kernel +} + +// CHECK-LABEL: amdgcn.kernel @vmem_ops_ordered { +// CHECK: %[[VAL_0:.*]] = alloca : !amdgcn.sgpr +// CHECK: %[[VAL_1:.*]] = alloca : !amdgcn.sgpr +// CHECK: %[[VAL_2:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_3:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_4:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_6:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_7:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_8:.*]] = alloca : !amdgcn.vgpr +// CHECK: %[[VAL_9:.*]] = make_register_range %[[VAL_0]], %[[VAL_1]] : !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: %[[VAL_10:.*]] = make_register_range %[[VAL_2]] : !amdgcn.vgpr +// CHECK: %[[VAL_11:.*]] = make_register_range %[[VAL_3]] : !amdgcn.vgpr +// CHECK: %[[VAL_12:.*]] = make_register_range %[[VAL_4]], %[[VAL_5]], %[[VAL_6]], %[[VAL_7]] : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: %[[VAL_13:.*]] = store global_store_dword data %[[VAL_10]] addr %[[VAL_9]] : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? : ? + 2]>) -> !amdgcn.write_token +// CHECK: %[[VAL_14:.*]] = load global_load_dwordx4 dest %[[VAL_12]] addr %[[VAL_9]] offset d(%[[VAL_8]]) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token +// CHECK: %[[VAL_15:.*]] = store global_store_dword data %[[VAL_11]] addr %[[VAL_9]] : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? : ? 
+ 2]>) -> !amdgcn.write_token +// CHECK: end_kernel +// CHECK: } +amdgcn.kernel @vmem_ops_ordered { + %0 = alloca : !amdgcn.sgpr + %1 = alloca : !amdgcn.sgpr + %2 = make_register_range %0, %1 : !amdgcn.sgpr, !amdgcn.sgpr + %3 = alloca : !amdgcn.vgpr + %4 = alloca : !amdgcn.vgpr + %5 = alloca : !amdgcn.vgpr + %6 = alloca : !amdgcn.vgpr + %7 = alloca : !amdgcn.vgpr + %8 = alloca : !amdgcn.vgpr + %9 = make_register_range %5, %6, %7, %8 : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + %10 = alloca : !amdgcn.vgpr + %11 = make_register_range %3 : !amdgcn.vgpr + %12 = make_register_range %4 : !amdgcn.vgpr + %13 = store global_store_dword data %11 addr %2 : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? : ? + 2]>) -> !amdgcn.write_token + %token = load global_load_dwordx4 dest %9 addr %2 offset d(%10) : dps(!amdgcn.vgpr<[? : ? + 4]>) ins(!amdgcn.sgpr<[? : ? + 2]>, !amdgcn.vgpr) -> !amdgcn.read_token + %14 = store global_store_dword data %12 addr %2 : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? : ? + 2]>) -> !amdgcn.write_token + end_kernel +} diff --git a/test/Dialect/AMDGCN/Transforms/sched-register.mlir b/test/Dialect/AMDGCN/Transforms/sched-register.mlir new file mode 100644 index 000000000..e38da67b5 --- /dev/null +++ b/test/Dialect/AMDGCN/Transforms/sched-register.mlir @@ -0,0 +1,27 @@ +// RUN: aster-opt %s --aster-apply-sched=scheds=sched --allow-unregistered-dialect | FileCheck %s + +#sched = #aster_utils.generic_scheduler<#amdgcn.register_scheduler, #aster_utils.sched_stage_labeler, #aster_utils.stage_topo_sort_sched> + +// Post-register-semantics DPS kernel (`!amdgcn.vgpr`). Register scheduler adds +// reaching-definition edges; schedule must keep the vop2 chain order. 
+// CHECK-LABEL: func.func @ssa_chain_register( +// CHECK-SAME: ) { +// CHECK: %[[A0:.*]] = amdgcn.alloca +// CHECK: %[[A1:.*]] = amdgcn.alloca +// CHECK: %[[A2:.*]] = amdgcn.alloca +// CHECK: %[[A3:.*]] = amdgcn.alloca +// CHECK: amdgcn.vop2 v_add_u32 outs %[[A3]] ins %[[A1]], %[[A0]] {sched.stage = 1 : i32} : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: amdgcn.vop2 v_add_u32 outs %[[A2]] ins %[[A1]], %[[A0]] {sched.stage = 2 : i32} : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: amdgcn.vop2 v_add_u32 outs %[[A2]] ins %[[A2]], %[[A3]] {sched.stage = 0 : i32} : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr +// CHECK: return +// CHECK: } +func.func @ssa_chain_register() attributes {sched = #sched} { + %0 = amdgcn.alloca {sched.stage = 0 : i32} : !amdgcn.vgpr + %1 = amdgcn.alloca {sched.stage = 0 : i32} : !amdgcn.vgpr + %2 = amdgcn.alloca {sched.stage = 0 : i32} : !amdgcn.vgpr + %3 = amdgcn.alloca {sched.stage = 0 : i32} : !amdgcn.vgpr + amdgcn.vop2 v_add_u32 outs %2 ins %1, %0 {sched.stage = 2 : i32} : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + amdgcn.vop2 v_add_u32 outs %3 ins %1, %0 {sched.stage = 1 : i32} : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + amdgcn.vop2 v_add_u32 outs %2 ins %2, %3 {sched.stage = 0 : i32} : !amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr + return +}