From e37d0bf41d36e4517b3eba7b68bce1b8c13df197 Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Thu, 9 Apr 2026 15:42:54 +0000 Subject: [PATCH] step Signed-off-by: Fabian Mora --- .../aster/Dialect/AMDGCN/IR/AMDGCNTypes.td | 87 +--- .../Dialect/AMDGCN/Transforms/AMDGCNPasses.td | 41 +- include/aster/Dialect/LSIR/IR/LSIROps.h | 1 + include/aster/Dialect/LSIR/IR/LSIROps.td | 131 ++++- include/aster/Interfaces/RegisterType.td | 39 +- include/aster/Transforms/Passes.td | 20 + lib/Analysis/MemoryDependenceAnalysis.cpp | 6 +- .../AMDGCN/CodeGen/CodeGenPatterns.cpp | 8 + lib/Dialect/AMDGCN/IR/AMDGCN.cpp | 2 +- lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp | 3 +- lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp | 23 +- lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp | 4 +- .../AMDGCN/Transforms/AMDGCNBufferization.cpp | 4 +- lib/Dialect/AMDGCN/Transforms/CMakeLists.txt | 1 - lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp | 483 ++---------------- lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp | 4 +- .../AMDGCN/Transforms/ToAMDGCNPatterns.cpp | 145 +++++- .../AMDGCN/Transforms/ToRegisterSemantics.cpp | 3 +- lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp | 104 ++-- lib/Dialect/LSIR/IR/LSIROps.cpp | 30 ++ lib/Target/ASM/TranslateModule.cpp | 4 +- lib/Transforms/CMakeLists.txt | 2 + .../Transforms/ConvertSCFControlFlow.cpp | 82 +-- python/aster/pass_pipelines.py | 2 +- .../AMDGCN/Analysis/cdna3-hazards.mlir | 16 +- .../AMDGCN/Analysis/range-constraints.mlir | 22 +- .../Analysis/register-interference.mlir | 10 +- .../AMDGCN/Analysis/register-liveness.mlir | 71 +-- .../IR/normal-forms-no-lsir-compute-ops.mlir | 6 +- ...mal-forms-no-lsir-control-ops-invalid.mlir | 6 +- .../AMDGCN/Transforms/bufferization.mlir | 34 +- .../chained-select-dps-violation.mlir | 24 +- .../Transforms/convert-scf-iter-args.mlir | 2 +- .../AMDGCN/Transforms/convert-scf-nf.mlir | 11 +- .../AMDGCN/Transforms/convert-scf.mlir | 41 +- .../AMDGCN/Transforms/legalize-cf-nf.mlir | 5 +- .../AMDGCN/Transforms/legalize-cf.mlir | 236 +++------ .../AMDGCN/Transforms/legalize-operands.mlir | 42 +- test/Dialect/AMDGCN/cmp-ops.mlir | 34 +- .../LSIR/CodeGen/arith-minmax-codegen.mlir | 6 +- test/Dialect/LSIR/Transforms/codegen-cf.mlir | 40 +- .../LSIR/Transforms/codegen-func-cf.mlir | 6 +- test/Dialect/LSIR/ops.mlir | 80 +-- test/Target/ASM/cbranch.mlir | 12 +- test/Target/ASM/g2s-load-lds.mlir | 12 +- test/Target/ASM/loops.mlir | 12 +- test/Target/ASM/s-mov-m0.mlir | 8 +- test/Target/ASM/vopc-branch.mlir | 18 +- test/integration/g2s-load-lds-e2e.mlir | 6 +- test/integration/sreg-roundtrip-e2e.mlir | 6 +- test/integration/vopc-branch-e2e.mlir | 15 +- 51 files changed, 833 insertions(+), 1177 deletions(-) rename lib/{Dialect/AMDGCN => }/Transforms/ConvertSCFControlFlow.cpp (73%) diff --git a/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td b/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td index f48883376..64a8e2d50 100644 --- a/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td +++ b/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td @@ -50,14 +50,9 @@ def AGPRType : AMDGCN_RegisterDef<"AGPR", "agpr", [MemRefElementTypeInterface]> let genVerifyDecl = 1; let extraClassDeclaration = [{ Register getReg() const { return getRange().begin(); } - - /// Returns true if the register is relocatable. 
- bool isRelocatable() const { return getReg().isRelocatable(); } - //===------------------------------------------------------------------===// // RegisterTypeInterface //===------------------------------------------------------------------===// - bool isRegisterRange() const { return getRange().size() > 1; } RegisterRange getAsRange() const { return getRange(); } @@ -67,9 +62,6 @@ def AGPRType : AMDGCN_RegisterDef<"AGPR", "agpr", [MemRefElementTypeInterface]> RegisterTypeInterface cloneRegisterType(RegisterRange range) const { return AGPRType::get(getContext(), range); } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return AGPRType::get(getContext(), RegisterRange(reg, 1)); - } //===------------------------------------------------------------------===// // ResourceTypeInterface @@ -97,14 +89,9 @@ def SGPRType : AMDGCN_RegisterDef<"SGPR", "sgpr", [MemRefElementTypeInterface]> let genVerifyDecl = 1; let extraClassDeclaration = [{ Register getReg() const { return getRange().begin(); } - - /// Returns true if the register is relocatable. - bool isRelocatable() const { return getReg().isRelocatable(); } - //===------------------------------------------------------------------===// // RegisterTypeInterface //===------------------------------------------------------------------===// - bool isRegisterRange() const { return getRange().size() > 1; } RegisterRange getAsRange() const { return getRange(); } @@ -114,9 +101,6 @@ def SGPRType : AMDGCN_RegisterDef<"SGPR", "sgpr", [MemRefElementTypeInterface]> RegisterTypeInterface cloneRegisterType(RegisterRange range) const { return SGPRType::get(getContext(), range); } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return SGPRType::get(getContext(), RegisterRange(reg, 1)); - } //===------------------------------------------------------------------===// // ResourceTypeInterface @@ -145,13 +129,9 @@ def VGPRType : AMDGCN_RegisterDef<"VGPR", "vgpr", [MemRefElementTypeInterface]> let extraClassDeclaration = [{ Register getReg() const { return getRange().begin(); } - /// Returns true if the register is relocatable. - bool isRelocatable() const { return getRange().begin().isRelocatable(); } - //===------------------------------------------------------------------===// // RegisterTypeInterface //===------------------------------------------------------------------===// - bool isRegisterRange() const { return getRange().size() > 1; } RegisterRange getAsRange() const { return getRange(); } @@ -161,9 +141,6 @@ def VGPRType : AMDGCN_RegisterDef<"VGPR", "vgpr", [MemRefElementTypeInterface]> RegisterTypeInterface cloneRegisterType(RegisterRange range) const { return VGPRType::get(getContext(), range); } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return VGPRType::get(getContext(), RegisterRange(reg, 1)); - } //===------------------------------------------------------------------===// // ResourceTypeInterface @@ -176,71 +153,43 @@ def VGPRType : AMDGCN_RegisterDef<"VGPR", "vgpr", [MemRefElementTypeInterface]> // SREG like types //===----------------------------------------------------------------------===// -/// Special registers to model state. 
-def SREGType : AMDGCN_RegisterDef<"SREG", "sreg", [MemRefElementTypeInterface]> { - let summary = "SREG type"; - let parameters = (ins - DefaultValuedParameter<"Register", "Register()">:$reg, - "SregKind":$kind); - let assemblyFormat = "`<`$kind (`,` $reg^)?`>`"; - let genVerifyDecl = 1; - let extraClassDeclaration = [{ - /// Returns true if the register is relocatable. - bool isRelocatable() const { return getReg().isRelocatable(); } - - //===------------------------------------------------------------------===// - // RegisterTypeInterface - //===------------------------------------------------------------------===// - bool isRegisterRange() const { return false; } - RegisterRange getAsRange() const { - return RegisterRange(getReg(), 1); - } - RegisterKind getRegisterKind() const { - return RegisterKind::SREG; - } - RegisterTypeInterface cloneRegisterType(RegisterRange range) const { - assert(range.size() == 1 && "SREG type can only clone single register"); - return SREGType::get(getContext(), range.begin(), getKind()); - } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return SREGType::get(getContext(), reg, getKind()); - } - - //===------------------------------------------------------------------===// - // ResourceTypeInterface - //===------------------------------------------------------------------===// - Resource *getResource() const; - }]; -} - /// Special registers to model state. class SREGBase : AMDGCN_RegisterDef { let summary = kind # " special register type"; - let assemblyFormat = ""; + let parameters = (ins + DefaultValuedParameter<"Register", "Register()">:$reg + ); + let assemblyFormat = "(`<` $reg^ `>`)?"; + let builders = [ + TypeBuilder<(ins CArg<"Register", "Register(0)">:$reg), [{ + return $_get($_ctxt, normalizeRegister(reg)); + }]> + ]; + let skipDefaultBuilders = 1; string declarations = StrSubst<[{ /// The register kind for this SREG type. static constexpr RegisterKind kRegisterKind = RegisterKind::$kind; }], [VarRepl<"kind", kind>]>.result; let extraClassDeclaration = declarations # [{ - /// Returns true if the register is relocatable. 
-    bool isRelocatable() const { return false; }
+    static Register normalizeRegister(Register reg) {
+      if (reg.getSemantics() == RegisterSemantics::Unallocated)
+        return Register(0);
+      return reg;
+    }
 
     //===------------------------------------------------------------------===//
     // RegisterTypeInterface
     //===------------------------------------------------------------------===//
-    bool isRegisterRange() const { return false; }
     RegisterRange getAsRange() const {
-      return RegisterRange(Register(0), 1);
+      return RegisterRange(getReg(), 1);
     }
     RegisterKind getRegisterKind() const {
       return kRegisterKind;
     }
     RegisterTypeInterface cloneRegisterType(RegisterRange range) const {
-      return get(getContext());
-    }
-    RegisterTypeInterface cloneRegisterType(Register reg) const {
-      return get(getContext());
+      assert(range.size() == 1 && "SREG type can only clone single register");
+      return get(getContext(), range.begin());
     }
 
     //===------------------------------------------------------------------===//
diff --git a/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td b/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td
index dee4d1363..e6eaefecf 100644
--- a/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td
+++ b/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td
@@ -291,44 +291,19 @@ def ConvertLDSBuffers : Pass<"amdgcn-convert-lds-buffers"> {
   ];
 }
 
-def ConvertSCFControlFlow : Pass<"amdgcn-convert-scf-control-flow"> {
-  let summary = "Convert SCF control flow to AMDGCN control flow instructions";
-  let description = [{
-    This pass converts SCF structured control flow operations (such as scf.for,
-    scf.if, scf.while) to AMDGCN control flow instructions.
-
-    The pass first runs thread uniform analysis to determine whether loop
-    induction variables and conditions are uniform across all threads. Based on
-    this analysis:
-
-    - For thread-uniform conditions: emit scalar compare instructions (s_cmp_*)
-      and branch on SCC
-    - For thread-divergent conditions: emit vector compare instructions (v_cmp_*)
-      and branch on VCC/VCCZ
-
-    This pass should run after the ABI has been set and before register
-    allocation.
-
-    Post-condition: #amdgcn.no_scf_ops
-  }];
-  let dependentDialects = [
-    "mlir::aster::amdgcn::AMDGCNDialect",
-    "mlir::aster::lsir::LSIRDialect",
-    "mlir::cf::ControlFlowDialect"
-  ];
-}
-
 def LegalizeCF : Pass<"amdgcn-legalize-cf"> {
   let summary = "Legalize CF dialect ops to AMDGCN scalar branch instructions";
   let description = [{
-    This pass legalizes CF dialect operations (cf.cond_br, cf.br) and lsir.cmpi
-    to AMDGCN scalar branch and compare instructions. It runs after register
-    allocation when operands are in physical registers and values flow through
-    side effects.
+    This pass legalizes branch operations (lsir.cond_br, lsir.br) to AMDGCN
+    scalar branch instructions. It runs after register allocation, when
+    operands are in physical registers and values flow through side effects.
+
+    The pass expects the lsir.cond_br condition to be an SCC or VCC register,
+    written by a previously lowered compare.
 
     Transformations:
-    - lsir.cmpi (returns i1) -> s_cmp_* (sets SCC flag)
-    - cf.cond_br -> s_cbranch_scc1 / scc0 + s_branch
-    - cf.br -> s_branch
+    - lsir.cond_br (SCC condition) -> s_cbranch_scc1/scc0 + s_branch
+    - lsir.cond_br (VCC condition) -> s_cbranch_vccnz/vccz + s_branch
+    - lsir.br -> s_branch
 
     Pre-condition: #amdgcn.all_registers_allocated
diff --git a/include/aster/Dialect/LSIR/IR/LSIROps.h b/include/aster/Dialect/LSIR/IR/LSIROps.h
index 9c1802ce5..a7bcac601 100644
--- a/include/aster/Dialect/LSIR/IR/LSIROps.h
+++ b/include/aster/Dialect/LSIR/IR/LSIROps.h
@@ -25,6 +25,7 @@
 #include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/include/aster/Dialect/LSIR/IR/LSIROps.td b/include/aster/Dialect/LSIR/IR/LSIROps.td
index f35033c96..c66bb9d64 100644
--- a/include/aster/Dialect/LSIR/IR/LSIROps.td
+++ b/include/aster/Dialect/LSIR/IR/LSIROps.td
@@ -21,6 +21,7 @@ include "aster/Dialect/LSIR/IR/LSIRTypes.td"
 include "aster/Interfaces/AllocaOpInterface.td"
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/Interfaces/InferIntRangeInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -117,6 +118,90 @@ def LSIR_AllocaOp : LSIR_Op<"alloca", [Pure, AllocaOpInterface]> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// BrOp
+//===----------------------------------------------------------------------===//
+
+def LSIR_BranchOp : LSIR_Op<"br", [
+    DeclareOpInterfaceMethods<BranchOpInterface>,
+    Pure, Terminator]> {
+  let summary = "Unconditional branch operation";
+  let description = [{
+    The `lsir.br` operation unconditionally branches to a destination block,
+    passing the given operands as block arguments. Unlike `cf.br`, this op is
+    used after codegen, when all values are in register types.
+
+    Example:
+    ```mlir
+    lsir.br ^bb1(%val : !amdgcn.sgpr)
+    ```
+  }];
+  let arguments = (ins Variadic<AnyType>:$destOperands);
+  let successors = (successor AnySuccessor:$dest);
+  let builders = [
+    OpBuilder<(ins "::mlir::Block *":$dest,
+                   CArg<"::mlir::ValueRange", "{}">:$destOperands), [{
+      $_state.addSuccessors(dest);
+      $_state.addOperands(destOperands);
+    }]>
+  ];
+  let assemblyFormat = [{
+    $dest (`(` $destOperands^ `:` type($destOperands) `)`)? attr-dict
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// CondBrOp
+//===----------------------------------------------------------------------===//
+
+def LSIR_CondBranchOp : LSIR_Op<"cond_br",
+    [AttrSizedOperandSegments,
+     DeclareOpInterfaceMethods<BranchOpInterface>,
+     Pure, Terminator]> {
+  let summary = "Conditional branch operation";
+  let description = [{
+    The `lsir.cond_br` operation branches conditionally based on a register
+    condition (SCC or VCC). Unlike `cf.cond_br`, which takes `i1`, this op
+    takes a register type directly. It branches to `trueDest` when the
+    condition register is nonzero and to `falseDest` otherwise.
+
+    Example:
+    ```mlir
+    lsir.cond_br %cond : !amdgcn.scc, ^bb1(%val : !amdgcn.sgpr), ^bb2
+    ```
+  }];
+  let arguments = (ins
+    RegType:$condition,
+    Variadic<AnyType>:$trueDestOperands,
+    Variadic<AnyType>:$falseDestOperands
+  );
+  let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest);
+  let builders = [
+    OpBuilder<(ins "::mlir::Value":$condition,
+                   "::mlir::Block *":$trueDest,
+                   "::mlir::ValueRange":$trueOperands,
+                   "::mlir::Block *":$falseDest,
+                   "::mlir::ValueRange":$falseOperands), [{
+      $_state.addOperands(condition);
+      $_state.addOperands(trueOperands);
+      $_state.addOperands(falseOperands);
+      $_state.addAttribute(
+          getOperandSegmentSizeAttr(),
+          $_builder.getDenseI32ArrayAttr(
+              {1, static_cast<int32_t>(trueOperands.size()),
+               static_cast<int32_t>(falseOperands.size())}));
+      $_state.addSuccessors(trueDest);
+      $_state.addSuccessors(falseDest);
+    }]>
+  ];
+  let assemblyFormat = [{
+    $condition `:` type($condition) `,`
+    $trueDest (`(` $trueDestOperands^ `:` type($trueDestOperands) `)`)? `,`
+    $falseDest (`(` $falseDestOperands^ `:` type($falseDestOperands) `)`)?
+    attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // AndIOp
 //===----------------------------------------------------------------------===//
@@ -171,26 +256,29 @@ def LSIR_AssumeNoaliasOp : LSIR_Op<"assume_noalias", [
 // CmpFOp
 //===----------------------------------------------------------------------===//
 
-def LSIR_CmpFOp : LSIR_InstOp<"cmpf", [PureInst]> {
+def LSIR_CmpFOp : LSIR_InstOp<"cmpf", [InstInferType, PureInst]> {
   let summary = "Floating point comparison operation";
   let description = [{
-    The `lsir.cmpf` operation compares two floating point values and returns i1.
-    The `i1` return value is kept late in the pipeline and is only translated to
-    SCC after register allocation, together with cf branch operations.
+    The `lsir.cmpf` operation compares two floating point values and writes the
+    result to a destination register (VCC for vector operands). Follows
+    destination-passing style: the caller allocates the output register and
+    passes it as `$dst`; the operation returns it.
 
     Example:
     ```mlir
-    %result = lsir.cmpf f32 olt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr
+    %result = lsir.cmpf f32 olt %dst, %lhs, %rhs
+        : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr
     ```
   }];
   let leadingArguments = (ins
     AnyFloatTypeAttr:$semantics,
     Arith_CmpFPredicateAttr:$predicate
   );
+  let outputs = (ins RegType:$dst);
   let inputs = (ins FloatOrRegType:$lhs, FloatOrRegType:$rhs);
-  let leadingResults = (outs I1:$result);
   let assemblyFormat = [{
-    $semantics $predicate $lhs `,` $rhs attr-dict `:` type($lhs) `,` type($rhs)
+    $semantics $predicate $dst `,` $lhs `,` $rhs attr-dict
+    `:` type($dst) `,` type($lhs) `,` type($rhs)
   }];
 }
 
@@ -198,27 +286,29 @@
 // CmpIOp
 //===----------------------------------------------------------------------===//
 
-def LSIR_CmpIOp : LSIR_InstOp<"cmpi", [PureInst]> {
+def LSIR_CmpIOp : LSIR_InstOp<"cmpi", [InstInferType, PureInst]> {
   let summary = "Integer comparison operation";
   let description = [{
     The `lsir.cmpi` operation compares two integer operands according to the
-    specified predicate and returns i1.
-    The `i1` return value is kept late in the pipeline and is only translated to
-    SCC after register allocation, together with cf branch operations.
+    specified predicate and writes the result to a destination register (SCC or
+    VCC).
Follows destination-passing style: the caller allocates the output + register and passes it as `$dst`; the operation returns it. Example: ```mlir - %result = lsir.cmpi i32 eq %lhs, %rhs : !amdgcn.sgpr, !amdgcn.sgpr + %result = lsir.cmpi i32 eq %dst, %lhs, %rhs + : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr ``` }]; let leadingArguments = (ins AnyIntTypeAttr:$semantics, Arith_CmpIPredicateAttr:$predicate ); + let outputs = (ins RegType:$dst); let inputs = (ins IntOrRegType:$lhs, IntOrRegType:$rhs); - let leadingResults = (outs I1:$result); let assemblyFormat = [{ - $semantics $predicate $lhs `,` $rhs attr-dict `:` type($lhs) `,` type($rhs) + $semantics $predicate $dst `,` $lhs `,` $rhs attr-dict + `:` type($dst) `,` type($lhs) `,` type($rhs) }]; } @@ -713,24 +803,19 @@ def LSIR_RemUIOp : BinaryIOp<"remui"> { def LSIR_SelectOp : LSIR_InstOp<"select", [PureInst, InstInferType]> { let summary = "Select operation"; let description = [{ - The `lsir.select` operation selects between two values based on a condition. - The condition can be either a register type or i1 (from lsir.cmpi/cmpf). - - When the condition is i1, LegalizeCF fuses the cmpi + select into - s_cmp + s_cselect_b32. When the condition is a register, it lowers to - a conditional move instruction. + The `lsir.select` operation selects between two values based on a register + condition (SCC or VCC). LegalizeCF lowers this to s_cselect_b32 (SCC) or + v_cndmask_b32 (VCC). Example: ```mlir %result = lsir.select %dst, %cond, %tv, %fv : !amdgcn.sgpr, !amdgcn.sgpr, !amdgcn.sgpr, !amdgcn.sgpr - %result = lsir.select %dst, %i1_cond, %tv, %fv - : !amdgcn.sgpr, i1, i32, i32 ``` }]; let outputs = (ins RegType:$dst); let inputs = (ins - IntFloatOrRegType:$condition, + RegType:$condition, IntFloatOrRegType:$true_value, IntFloatOrRegType:$false_value ); diff --git a/include/aster/Interfaces/RegisterType.td b/include/aster/Interfaces/RegisterType.td index 9c2e4febc..58b9c7ce8 100644 --- a/include/aster/Interfaces/RegisterType.td +++ b/include/aster/Interfaces/RegisterType.td @@ -24,23 +24,13 @@ include "aster/Interfaces/ResourceInterfaces.td" //===----------------------------------------------------------------------===// def RegisterTypeInterface : TypeInterface<"RegisterTypeInterface", [ - ResourceTypeInterface + ResourceTypeInterface, ]> { let description = [{ This interface defines a common API for interacting with register types. }]; let cppNamespace = "::mlir::aster"; let methods = [ - InterfaceMethod<[{ - This method returns whether the register is relocatable. - }], - "bool", "isRelocatable" - >, - InterfaceMethod<[{ - This method returns whether the type is a register range. - }], - "bool", "isRegisterRange" - >, InterfaceMethod<[{ This method returns the register's range. }], @@ -53,24 +43,18 @@ def RegisterTypeInterface : TypeInterface<"RegisterTypeInterface", [ (ins "::mlir::aster::RegisterRange":$range) >, InterfaceMethod<[{ - This method returns a clone of the register. - }], - "::mlir::aster::RegisterTypeInterface", "cloneRegisterType", - (ins "::mlir::aster::Register":$reg) - >, - InterfaceMethod<[{ - This method returns the size in bytes of the register(s). - Each register is 4 bytes (32-bit). Returns std::nullopt if the type + This method returns the size in bits of the register(s). + Each register is 32 bits. Returns std::nullopt if the type is not a valid register type. 
}], - "std::optional", "getSizeInBytes", (ins), [{}], [{ - return $_type.getAsRange().size() * 4; + "std::optional", "getSizeInBits", (ins), [{}], [{ + return $_type.getAsRange().size() * 32; }] > ]; let extraTraitClassDeclaration = [{ bool isAllocatable() const { - return $_type.isRelocatable(); + return !$_type.hasAllocatedSemantics(); } }]; let extraSharedClassDeclaration = [{ @@ -103,6 +87,17 @@ def RegisterTypeInterface : TypeInterface<"RegisterTypeInterface", [ ::mlir::aster::RegisterTypeInterface getAsValue() const { return $_type.cloneRegisterType($_type.getAsRange().getAsValueRange()); } + + /// Returns true if this register type represents a range of registers. + bool isRegisterRange() const { + return $_type.getAsRange().size() > 1; + } + + /// This method returns a clone of the register type using the provided + /// register as the base of the range. + ::mlir::aster::RegisterTypeInterface cloneRegisterType(::mlir::aster::Register reg) const { + return $_type.cloneRegisterType(::mlir::aster::RegisterRange(reg, 1)); + } }]; } diff --git a/include/aster/Transforms/Passes.td b/include/aster/Transforms/Passes.td index fc49182a8..1e9839312 100644 --- a/include/aster/Transforms/Passes.td +++ b/include/aster/Transforms/Passes.td @@ -345,4 +345,24 @@ def CFGSimplification : Pass<"aster-cfg-simplification"> { let dependentDialects = []; } +//===----------------------------------------------------------------------===// +// ConvertSCFControlFlow +//===----------------------------------------------------------------------===// + +def ConvertSCFControlFlow : Pass<"aster-convert-scf-control-flow"> { + let summary = "Convert SCF control flow to CF dialect with basic block structure"; + let description = [{ + Converts scf.for and scf.if to cf dialect operations with explicit basic + block structure. Handles both integer and index induction variable types. + Does not check for thread uniformity. + + Post-condition: no scf.for or scf.if ops remain. + }]; + let dependentDialects = [ + "mlir::arith::ArithDialect", + "mlir::cf::ControlFlowDialect", + "mlir::scf::SCFDialect" + ]; +} + #endif // ASTER_TRANSFORMS_PASSES_TD diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 909b9ea80..6dab1294b 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -177,9 +177,9 @@ bool MemoryDependenceAnalysis::isStoreOp(Operation *op) { static int64_t computeAccessLength(Type type) { if (auto regType = dyn_cast(type)) { - std::optional sizeInBytes = regType.getSizeInBytes(); - assert(sizeInBytes.has_value() && "register type must have valid size"); - return *sizeInBytes; + std::optional sizeInBits = regType.getSizeInBits(); + assert(sizeInBits.has_value() && "register type must have valid size"); + return *sizeInBits / 8; } // Conservative: assume 4 bytes if we can't determine precisely. return 4; diff --git a/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp b/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp index da1c05f12..91424ff02 100644 --- a/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp +++ b/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp @@ -301,6 +301,14 @@ static Type convertTypeImpl(Value value, const CodeGenConverter &converter) { if (isa(value.getType())) return value.getType(); + // i1 values map to SCC (thread-uniform) or VCC (divergent). 
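+  // For example, a condition computed from the workgroup id is the same for
+  // every lane and maps to !amdgcn.scc, while one computed from the lane id
+  // diverges within the wave and maps to !amdgcn.vcc.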
+ if (value.getType().isInteger(1)) { + std::optional isUniform = converter.isThreadUniform(value); + if (isUniform.has_value() && *isUniform) + return amdgcn::SCCType::get(value.getContext(), Register()); + return amdgcn::VCCType::get(value.getContext(), Register()); + } + int64_t typeSize = converter.getTypeSize(value.getType()); int64_t numWords = (typeSize + 3) / 4; diff --git a/lib/Dialect/AMDGCN/IR/AMDGCN.cpp b/lib/Dialect/AMDGCN/IR/AMDGCN.cpp index ef88622fb..e63c197d4 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCN.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCN.cpp @@ -798,7 +798,7 @@ void LoadToLDSOp::getEffects( /// Check if a type is an unallocated register (relocatable). static bool isUnallocatedRegister(Type type) { auto regType = dyn_cast(type); - return regType && regType.isRelocatable(); + return regType && !regType.hasAllocatedSemantics(); } /// Parse the output types for CmpIOp. diff --git a/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp b/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp index d9cdfbfe8..05f1da1d2 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp @@ -301,7 +301,8 @@ LogicalResult NoLsirComputeOpsAttr::verifyOperation( // Allow control-flow ops (lowered by LegalizeCF) and copy (regalloc // primitive). - if (isa(op)) + if (isa(op)) return success(); return emitError() << "normal form violation: LSIR compute/memory " diff --git a/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp b/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp index 63ca67319..efce063c7 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp @@ -37,7 +37,7 @@ LogicalResult verifyRegisterRange(function_ref emitError, return emitError() << "align must be a power of 2, got " << alignment; // Check alignment if the range is allocated - if (!range.begin().isRelocatable()) { + if (range.getSemantics() == RegisterSemantics::Allocated) { if (alignment <= 0) return emitError() << "align must be positive, got " << alignment; @@ -100,24 +100,3 @@ LogicalResult VGPRType::verify(function_ref emitError, } Resource *VGPRType::getResource() const { return VGPRResource::get(); } - -//===----------------------------------------------------------------------===// -// SREG types -//===----------------------------------------------------------------------===// - -LogicalResult SREGType::verify(function_ref emitError, - Register reg, SregKind kind) { - if (!reg.isValid()) - return emitError() << "SREG must be non-negative"; - switch (kind) { - case SregKind::Scc: { - if (!reg.isRelocatable() && reg.getRegister() != 0) { - return emitError() << "SCC SREG must be register 0"; - } - break; - } - } - return success(); -} - -Resource *SREGType::getResource() const { return SGPRResource::get(); } diff --git a/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp b/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp index 65eed9190..db747d972 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp @@ -79,7 +79,7 @@ static LogicalResult checkOperand(Operation *op, Type type, int32_t pos, .attachNote(state.getLoc()) << "is invalid"; } - if (!allowUnallocated && regTy.isRelocatable()) { + if (!allowUnallocated && !regTy.hasAllocatedSemantics()) { return (op->emitError(direction + " operand ") << pos << " is unallocated register type: " << type) .attachNote(state.getLoc()) @@ -99,7 +99,7 @@ static LogicalResult checkValue(Operation *op, Type type, int32_t pos, .attachNote(state.getLoc()) << "is invalid"; } - if (!allowUnallocated && regTy.isRelocatable()) { + if 
(!allowUnallocated && !regTy.hasAllocatedSemantics()) {
     return (op->emitError(direction)
             << pos << " is unallocated register type: " << type)
         .attachNote(state.getLoc())
diff --git a/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp b/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp
index ed506a614..07f3a88fe 100644
--- a/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp
+++ b/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp
@@ -294,8 +294,8 @@ void BufferizationImpl::handlePhiForwardGroup(IRRewriter &rewriter,
   }
 
   // Create a branch op to the block to forward to.
-  cf::BranchOp::create(rewriter, std::get<0>(phiForwards[start])->getLoc(),
-                       std::get<3>(phiForwards[start]), fwdValues);
+  lsir::BranchOp::create(rewriter, std::get<0>(phiForwards[start])->getLoc(),
+                         std::get<3>(phiForwards[start]), fwdValues);
 }
 
 void BufferizationImpl::handleBlocksAndTerminators(IRRewriter &rewriter,
diff --git a/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt b/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt
index 535c29c4f..9e68a081d 100644
--- a/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt
+++ b/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt
@@ -3,7 +3,6 @@ add_mlir_library(AMDGCNTransforms
   AMDGCNBufferization.cpp
   AMDGCNHazards.cpp
   ConvertLDSBuffers.cpp
-  ConvertSCFControlFlow.cpp
   ConvertWaits.cpp
   ExpandMetadataOps.cpp
   HoistIterArgWaits.cpp
diff --git a/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp b/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp
index 65773d1cf..5fe91bba2 100644
--- a/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp
+++ b/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp
@@ -8,14 +8,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass legalizes CF dialect operations (cf.cond_br, cf.br) and lsir.cmpi
-// to AMDGCN scalar branch and compare instructions. It runs after register
-// allocation when operands are in physical registers.
+// This pass legalizes branch operations (lsir.cond_br, lsir.br) to AMDGCN
+// scalar branch instructions. It runs after register allocation, when operands
+// are in physical registers.
 //
-// Transformations:
-// - lsir.cmpi (SGPR/i32 operands) -> s_cmp_* (sets SCC flag)
-// - lsir.cmpi (VGPR operands) -> v_cmp_* (sets VCC flag)
-// - cf.cond_br -> s_cbranch_scc1/scc0 or s_cbranch_vccnz/vccz + s_branch
-// - cf.br -> s_branch
+// The condition of lsir.cond_br is an SCC or VCC register written by a
+// previously lowered compare. The transformation:
+// - lsir.cond_br (SCC or VCC condition) -> s_cbranch_scc1/scc0 or
+//   s_cbranch_vccnz/vccz + s_branch
+// - lsir.br -> s_branch
 //
 //===----------------------------------------------------------------------===//
 
@@ -24,13 +24,12 @@
 #include "aster/Dialect/AMDGCN/IR/AMDGCNOps.h"
 #include "aster/Dialect/AMDGCN/IR/AMDGCNTypes.h"
 #include "aster/Dialect/AMDGCN/Transforms/Passes.h"
-#include "aster/Dialect/LSIR/IR/LSIRDialect.h"
 #include "aster/Dialect/LSIR/IR/LSIROps.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallVector.h"
 
 namespace mlir::aster {
 namespace amdgcn {
@@ -54,175 +53,21 @@ struct LegalizeCF : public amdgcn::impl::LegalizeCFBase<LegalizeCF> {
   void runOnOperation() override;
 
 private:
-  /// Verify i1 lifetime constraints for SCC/VCC registers:
-  /// 1. No i1 value is used across block boundaries (flag reg not preserved).
-  /// 2. No two lsir.cmp ops have overlapping lifetimes within a block
-  ///    (clobber).
-  LogicalResult verifyI1Lifetimes(Operation *op);
-
-  /// Get or create the lowered amdgcn.cmpi + alloca for an lsir.cmpi.
-  /// Selects s_cmp_* (SCC) for scalar operands, v_cmp_* (VCC) for vector.
-  /// On first call for a given cmpOp, creates the alloca and cmpi at the
-  /// original lsir.cmpi location. On subsequent calls, returns the cached
-  /// result.
-  Value getOrCreateLoweredCmp(lsir::CmpIOp cmpOp, IRRewriter &rewriter);
-
-  /// Lower lsir.cmpi + cf.cond_br pattern to AMDGCN compare + branch.
-  LogicalResult lowerCondBranch(cf::CondBranchOp condBr);
-
-  /// Lower cf.br to s_branch.
-  LogicalResult lowerBranch(cf::BranchOp br);
-
-  /// Lower lsir.cmpi + lsir.select(i1) pattern to s_cmp + s_cselect_b32
-  /// or v_cmp + v_cndmask_b32.
-  LogicalResult lowerSelect(lsir::SelectOp selectOp);
-
-  /// Map from original lsir.cmpi to the SCC/VCC alloca value from the lowered
-  /// amdgcn.cmpi. Used to deduplicate compare lowering on fan-out.
-  DenseMap<Operation *, Value> loweredCmpMap;
-};
-
-/// Map arith::CmpIPredicate to the appropriate s_cmp_* opcode (scalar).
-static OpCode getScalarCompareOpCode(arith::CmpIPredicate predicate) {
-  switch (predicate) {
-  case arith::CmpIPredicate::eq:
-    return OpCode::S_CMP_EQ_I32;
-  case arith::CmpIPredicate::ne:
-    return OpCode::S_CMP_LG_I32;
-  case arith::CmpIPredicate::slt:
-    return OpCode::S_CMP_LT_I32;
-  case arith::CmpIPredicate::sle:
-    return OpCode::S_CMP_LE_I32;
-  case arith::CmpIPredicate::sgt:
-    return OpCode::S_CMP_GT_I32;
-  case arith::CmpIPredicate::sge:
-    return OpCode::S_CMP_GE_I32;
-  case arith::CmpIPredicate::ult:
-    return OpCode::S_CMP_LT_U32;
-  case arith::CmpIPredicate::ule:
-    return OpCode::S_CMP_LE_U32;
-  case arith::CmpIPredicate::ugt:
-    return OpCode::S_CMP_GT_U32;
-  case arith::CmpIPredicate::uge:
-    return OpCode::S_CMP_GE_U32;
-  }
-  llvm_unreachable("Unknown CmpIPredicate");
-}
-
-/// Map arith::CmpIPredicate to the appropriate v_cmp_* opcode (vector, 32-bit
-/// encoding). The 32-bit VOPC encoding requires rhs (src1) to be a VGPR.
-/// If operands need swapping, the predicate should be flipped first.
-static OpCode getVectorCompareOpCode(arith::CmpIPredicate predicate) {
-  switch (predicate) {
-  case arith::CmpIPredicate::eq:
-    return OpCode::V_CMP_EQ_I32;
-  case arith::CmpIPredicate::ne:
-    return OpCode::V_CMP_NE_I32;
-  case arith::CmpIPredicate::slt:
-    return OpCode::V_CMP_LT_I32;
-  case arith::CmpIPredicate::sle:
-    return OpCode::V_CMP_LE_I32;
-  case arith::CmpIPredicate::sgt:
-    return OpCode::V_CMP_GT_I32;
-  case arith::CmpIPredicate::sge:
-    return OpCode::V_CMP_GE_I32;
-  case arith::CmpIPredicate::ult:
-    return OpCode::V_CMP_LT_U32;
-  case arith::CmpIPredicate::ule:
-    return OpCode::V_CMP_LE_U32;
-  case arith::CmpIPredicate::ugt:
-    return OpCode::V_CMP_GT_U32;
-  case arith::CmpIPredicate::uge:
-    return OpCode::V_CMP_GE_U32;
-  }
-  llvm_unreachable("Unknown CmpIPredicate");
-}
-
-/// Swap a comparison predicate (a < b becomes b > a).
-static arith::CmpIPredicate swapPredicate(arith::CmpIPredicate pred) { - switch (pred) { - case arith::CmpIPredicate::eq: - return arith::CmpIPredicate::eq; - case arith::CmpIPredicate::ne: - return arith::CmpIPredicate::ne; - case arith::CmpIPredicate::slt: - return arith::CmpIPredicate::sgt; - case arith::CmpIPredicate::sle: - return arith::CmpIPredicate::sge; - case arith::CmpIPredicate::sgt: - return arith::CmpIPredicate::slt; - case arith::CmpIPredicate::sge: - return arith::CmpIPredicate::sle; - case arith::CmpIPredicate::ult: - return arith::CmpIPredicate::ugt; - case arith::CmpIPredicate::ule: - return arith::CmpIPredicate::uge; - case arith::CmpIPredicate::ugt: - return arith::CmpIPredicate::ult; - case arith::CmpIPredicate::uge: - return arith::CmpIPredicate::ule; - } - llvm_unreachable("Unknown CmpIPredicate"); -} - -/// Returns true if either operand of the compare is a VGPR. -static bool hasVGPROperand(lsir::CmpIOp cmpOp) { - return isa(cmpOp.getLhs().getType()) || - isa(cmpOp.getRhs().getType()); -} - -Value LegalizeCF::getOrCreateLoweredCmp(lsir::CmpIOp cmpOp, - IRRewriter &rewriter) { - auto it = loweredCmpMap.find(cmpOp); - if (it != loweredCmpMap.end()) - return it->second; - - // Create the lowered compare at the original lsir.cmpi location. - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(cmpOp); - Location loc = cmpOp.getLoc(); + /// Lower lsir.cond_br to AMDGCN scalar/vector branch instructions. + /// The condition is a register type (SCC or VCC) directly. + LogicalResult lowerCondBranch(lsir::CondBranchOp condBr); - bool isVector = hasVGPROperand(cmpOp); - if (isVector) { - // Vector compare: v_cmp_* writes to VCC. - // The 32-bit VOPC encoding requires src1 (rhs) to be a VGPR. - // If rhs is not a VGPR, swap operands and flip the predicate. - Value lhs = cmpOp.getLhs(); - Value rhs = cmpOp.getRhs(); - arith::CmpIPredicate pred = cmpOp.getPredicate(); - if (!isa(rhs.getType())) { - assert(isa(lhs.getType()) && - "at least one operand must be a VGPR for vector compare"); - std::swap(lhs, rhs); - pred = swapPredicate(pred); - } - Type vccType = VCCType::get(rewriter.getContext()); - Value vcc = AllocaOp::create(rewriter, loc, vccType); - OpCode cmpOpCode = getVectorCompareOpCode(pred); - amdgcn::CmpIOp::create(rewriter, loc, cmpOpCode, vcc, lhs, rhs); - loweredCmpMap[cmpOp] = vcc; - return vcc; - } - - // Scalar compare: s_cmp_* writes to SCC. - Type sccType = SCCType::get(rewriter.getContext()); - Value scc = AllocaOp::create(rewriter, loc, sccType); - OpCode cmpOpCode = getScalarCompareOpCode(cmpOp.getPredicate()); - amdgcn::CmpIOp::create(rewriter, loc, cmpOpCode, scc, cmpOp.getLhs(), - cmpOp.getRhs()); + /// Lower lsir.br to s_branch. + LogicalResult lowerBranch(lsir::BranchOp br); - loweredCmpMap[cmpOp] = scc; - return scc; -} + /// Lower lsir.select with a register condition (SCC or VCC). + LogicalResult lowerSelect(lsir::SelectOp selectOp); +}; -LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { - // Get the condition - must come from lsir.cmpi - Value condition = condBr.getCondition(); - auto cmpOp = condition.getDefiningOp(); - if (!cmpOp) { - return condBr.emitError() - << "cf.cond_br condition must come from lsir.cmpi for legalization"; - } +LogicalResult LegalizeCF::lowerCondBranch(lsir::CondBranchOp condBr) { + // The condition is a register type (SCC or VCC) directly. 
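+  // An SCC condition selects the s_cbranch_scc0/scc1 opcodes below; a VCC
+  // condition selects s_cbranch_vccz/vccnz.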
+ Value flagReg = condBr.getCondition(); + bool isVector = isa(flagReg.getType()); // Note: We just drop block arguments as they are allocated and all values // flow through side effects. @@ -232,9 +77,9 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { {condBr.getTrueDestOperands(), condBr.getFalseDestOperands()}) { for (Value operand : brOpRange) { Type type = operand.getType(); - if (!isa(type)) { + if (!isa(type)) { return condBr.emitError() - << "cf.br operand must have an allocated register type"; + << "lsir.cond_br operand must have an allocated register type"; } } } @@ -242,9 +87,6 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { IRRewriter rewriter(condBr); rewriter.setInsertionPoint(condBr); - Value flagReg = getOrCreateLoweredCmp(cmpOp, rewriter); - bool isVector = isa(flagReg.getType()); - // Create conditional branch based on which destination is the next physical // block. The fallthrough target must be the next block. Location loc = condBr.getLoc(); @@ -253,21 +95,24 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { Block *currentBlock = condBr->getBlock(); Block *nextBlock = currentBlock->getNextNode(); - // Select branch opcodes based on whether the compare wrote SCC or VCC. - OpCode branchTrue = + // lsir.cond_br branches to trueDest when the condition register is nonzero. + // Select branch opcodes based on whether the flag register is SCC or VCC. + OpCode branchIfTrue = isVector ? OpCode::S_CBRANCH_VCCNZ : OpCode::S_CBRANCH_SCC1; - OpCode branchFalse = + OpCode branchIfFalse = isVector ? OpCode::S_CBRANCH_VCCZ : OpCode::S_CBRANCH_SCC0; // amdgcn::CBranchOp takes a label; later, the actual 16-bit PC-relative // offset is computed by the LLVM assembler (MC layer) when it assembles this // text into binary machine code. This is happening outside of aster. if (falseDest == nextBlock) { - // Branch to trueDest if flag set, fallthrough to falseDest - CBranchOp::create(rewriter, loc, branchTrue, flagReg, trueDest, falseDest); + // Branch to trueDest if condition true, fallthrough to falseDest. + CBranchOp::create(rewriter, loc, branchIfTrue, flagReg, trueDest, + falseDest); } else if (trueDest == nextBlock) { - // Branch to falseDest if flag clear, fallthrough to trueDest - CBranchOp::create(rewriter, loc, branchFalse, flagReg, falseDest, trueDest); + // Branch to falseDest if condition false, fallthrough to trueDest. + CBranchOp::create(rewriter, loc, branchIfFalse, flagReg, falseDest, + trueDest); } else { // TODO: neither destination is the next block, we need more sophisticated // logic to insert explicit branch and create a new block. For this to @@ -286,16 +131,16 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { return success(); } -LogicalResult LegalizeCF::lowerBranch(cf::BranchOp br) { +LogicalResult LegalizeCF::lowerBranch(lsir::BranchOp br) { // Note: We just drop block arguments as they are allocated and all values // flow through side effects. // TODO: In the future, this is better done as a RA legalization once we have // a side-effecting representation of instructions without return values. 
for (Value operand : br.getDestOperands()) { Type type = operand.getType(); - if (!isa(type)) { + if (!isa(type)) { return br.emitError() - << "cf.br operand must have an allocated register type"; + << "lsir.br operand must have an allocated register type"; } } @@ -313,26 +158,14 @@ LogicalResult LegalizeCF::lowerBranch(cf::BranchOp br) { } LogicalResult LegalizeCF::lowerSelect(lsir::SelectOp selectOp) { - Value condition = selectOp.getCondition(); - // Only handle i1-conditioned selects (from lsir.cmpi). - // Register-conditioned selects are handled elsewhere. - if (!condition.getType().isInteger(1)) - return success(); - - auto cmpOp = condition.getDefiningOp(); - if (!cmpOp) { - return selectOp.emitError() - << "lsir.select with i1 condition must come from lsir.cmpi"; - } + // The condition is always a register type (SCC or VCC) after codegen. + Value flagReg = selectOp.getCondition(); + bool isVector = isa(flagReg.getType()); Location loc = selectOp.getLoc(); IRRewriter rewriter(selectOp); rewriter.setInsertionPoint(selectOp); - // Ensure the compare is lowered. - Value flagReg = getOrCreateLoweredCmp(cmpOp, rewriter); - bool isVector = isa(flagReg.getType()); - Value dst = selectOp.getDst(); if (isVector) { // v_cndmask_b32: vdst = VCC[lane] ? src1 : src0 (note: reversed order!) @@ -366,75 +199,6 @@ LogicalResult LegalizeCF::lowerSelect(lsir::SelectOp selectOp) { return success(); } -/// Find the last user of `value` in `block`, by operation order. -/// Returns nullptr if no user exists in the block. -static Operation *findLastUserInBlock(Value value, Block *block) { - Operation *lastUser = nullptr; - for (Operation *user : value.getUsers()) { - if (user->getBlock() != block) - continue; - if (!lastUser || lastUser->isBeforeInBlock(user)) - lastUser = user; - } - return lastUser; -} - -LogicalResult LegalizeCF::verifyI1Lifetimes(Operation *op) { - LogicalResult result = success(); - - op->walk([&](Block *block) { - // Track the currently-live i1 value and where its lifetime ends. - Operation *activeI1Op = nullptr; - Operation *activeI1OpLastUserOp = nullptr; - - for (Operation &innerOp : *block) { - Value i1; - if (auto cmpOp = dyn_cast(&innerOp)) - i1 = cmpOp.getResult(); - else if (auto cmpOp = dyn_cast(&innerOp)) - i1 = cmpOp.getResult(); - else - continue; - - // Check cross-block usage: all users of this cmpi must be in the same - // block. SCC/VCC are not preserved across block boundaries. - for (Operation *user : i1.getUsers()) { - if (user->getBlock() != block) { - innerOp.emitError() - << "has consumer in a different block; flag register (SCC/VCC) " - "is not preserved across block boundaries"; - result = failure(); - return WalkResult::interrupt(); - } - } - - // Check overlap: any cmpi (even dead ones) clobbers the flag register, - // so if a previous i1 is still live, this is an error. - if (activeI1Op && activeI1OpLastUserOp && - !activeI1OpLastUserOp->isBeforeInBlock(&innerOp)) { - innerOp.emitError() - << "would clobber flag register from earlier compare; i1 " - "lifetimes must not overlap"; - result = failure(); - return WalkResult::interrupt(); - } - - // Dead cmpi (no users) is benign for tracking purposes -- it clobbers - // SCC but has no consumers that could be affected by a future clobber. - // Don't update activeI1 so it doesn't block subsequent live cmpi ops. - if (i1.use_empty()) - continue; - - // Start tracking this cmpi's lifetime. 
-      activeI1Op = &innerOp;
-      activeI1OpLastUserOp = findLastUserInBlock(i1, block);
-    }
-    return WalkResult::advance();
-  });
-
-  return result;
-}
-
 void LegalizeCF::runOnOperation() {
   Operation *op = getOperation();
 
@@ -448,58 +212,20 @@ void LegalizeCF::runOnOperation() {
   }
 
-  // Precondition: verify i1 lifetimes are non-overlapping and block-local.
-  // SCC/VCC are implicit flag registers with no spill capability, so
-  // overlapping lifetimes or cross-block usage would produce silent
-  // miscompilation.
-  if (failed(verifyI1Lifetimes(op))) {
-    signalPassFailure();
-    return;
-  }
-
-  // Construct allocated register to alloca map.
-  // After canonicalize + CSE (run in the backend pipeline before this pass),
-  // there must be exactly one alloca per concrete register type. CSE
-  // deduplicates allocas of the same type since allocated allocas are Pure.
-  DenseMap<RegisterTypeInterface, Value> allocatedRegisterToAllocaMap;
-  bool hasDuplicates = false;
-  op->walk([&](AllocaOp alloca) {
-    auto regType = cast<RegisterTypeInterface>(alloca.getType());
-    if (regType.isRelocatable()) {
-      alloca.emitOpError("alloca must have a fixed register type "
-                         "(register coloring must run before LegalizeCF)");
-      hasDuplicates = true;
-      return;
-    }
-    auto [it, inserted] =
-        allocatedRegisterToAllocaMap.try_emplace(regType, alloca);
-    if (!inserted) {
-      alloca.emitOpError("duplicate alloca for register type ")
-          << alloca.getType()
-          << " (canonicalize + CSE should deduplicate allocated allocas)";
-      hasDuplicates = true;
-    }
-  });
-  if (hasDuplicates) {
-    signalPassFailure();
-    return;
-  }
-
   // Collect all operations to lower.
   SmallVector<lsir::SelectOp> selects;
-  SmallVector<cf::CondBranchOp> condBranches;
-  SmallVector<cf::BranchOp> branches;
+  SmallVector<lsir::CondBranchOp> condBranches;
+  SmallVector<lsir::BranchOp> branches;
   op->walk([&](Operation *innerOp) {
     if (auto selectOp = dyn_cast<lsir::SelectOp>(innerOp))
      selects.push_back(selectOp);
-    else if (auto condBr = dyn_cast<cf::CondBranchOp>(innerOp))
+    else if (auto condBr = dyn_cast<lsir::CondBranchOp>(innerOp))
       condBranches.push_back(condBr);
-    else if (auto br = dyn_cast<cf::BranchOp>(innerOp))
+    else if (auto br = dyn_cast<lsir::BranchOp>(innerOp))
       branches.push_back(br);
   });
 
-  // Lower i1-conditioned selects first (they reference lsir.cmpi which may
-  // also be used by cond_br).
+  // Lower register-conditioned selects (SCC or VCC conditions).
   for (lsir::SelectOp selectOp : selects) {
    if (failed(lowerSelect(selectOp))) {
       signalPassFailure();
@@ -507,137 +233,22 @@ void LegalizeCF::runOnOperation() {
     }
   }
 
-  // Lower conditional branches (they may reference lsir.cmpi)
-  for (cf::CondBranchOp condBr : condBranches) {
+  // Lower conditional branches.
+  for (lsir::CondBranchOp condBr : condBranches) {
     if (failed(lowerCondBranch(condBr))) {
       signalPassFailure();
       return;
     }
   }
 
-  // Lower unconditional branches
-  for (cf::BranchOp br : branches) {
+  // Lower unconditional branches.
+  for (lsir::BranchOp br : branches) {
     if (failed(lowerBranch(br))) {
       signalPassFailure();
       return;
     }
   }
 
-  // Erase original lsir.cmpi ops that were lowered. Collect first, then
-  // clear the map before erasing to avoid dangling pointers during iteration.
-  SmallVector<Operation *> cmpsToErase;
-  for (auto &[cmpOp, scc] : loweredCmpMap) {
-    assert(cmpOp->use_empty() &&
-           "lsir.cmpi still has uses after all consumers lowered");
-    cmpsToErase.push_back(cmpOp);
-  }
-  loweredCmpMap.clear();
-  for (Operation *cmpOp : cmpsToErase)
-    cmpOp->erase();
-
-  // Iterate all blocks in all regions of the function and replace block
-  // arguments with the corresponding alloca.
- // - // For register range block arguments, we decompose them to individual - // registers since ranges are composite constructs without their own allocas. - // Each range block arg is replaced by reconstructing the range from its - // constituent allocas using make_register_range at the block entry. - // - // This is a simple way of legalizing block arguments, late in the pipeline. - // - // Note and caveat: taking the alloc is fine because at this point values do - // not flow through SSA values anymore, except i1 cf.cond_br conditions. - // While this is correct, it is easily confusing since SSA and side-effects - // are mixed in the same representation. - // - // TODO: In the very short future, this is better done as a RA legalization - // once we have a side-effecting representation of instructions without return - // values. - op->walk([&](Block *block) { - IRRewriter rewriter(op->getContext()); - - // Drop all block arguments, if any. - for (int i = block->getNumArguments() - 1; i >= 0; --i) { - // Always erase index i; indices shift after each erase. - BlockArgument arg = block->getArgument(i); - RegisterTypeInterface regType = - cast(arg.getType()); - - // Simple case: non-range register type - if (!regType.isRegisterRange()) { - auto it = allocatedRegisterToAllocaMap.find(regType); - if (it == allocatedRegisterToAllocaMap.end()) { - block->getParentOp()->emitError() - << "Alloca not found for register type " << regType; - signalPassFailure(); - return WalkResult::interrupt(); - } - arg.replaceAllUsesWith(it->second); - block->eraseArgument(i); - continue; - } - - // Complex case: register range type - decompose to constituents - RegisterRange range = regType.getAsRange(); - Register beginReg = range.begin(); - int16_t rangeSize = range.size(); - - if (beginReg.isRelocatable()) { - block->getParentOp()->emitError() - << "Cannot legalize relocatable register range block argument"; - signalPassFailure(); - return WalkResult::interrupt(); - } - - // Collect allocas for all constituent registers - SmallVector constituentAllocas; - constituentAllocas.reserve(rangeSize); - - auto rangeRegType = cast(regType); - RegisterKind regKind = rangeRegType.getRegisterKind(); - - for (int16_t offset = 0; offset < rangeSize; ++offset) { - Register reg = beginReg.getWithOffset(offset); - - RegisterTypeInterface constituentType; - MLIRContext *ctx = block->getParentOp()->getContext(); - switch (regKind) { - case RegisterKind::SGPR: - constituentType = SGPRType::get(ctx, reg); - break; - case RegisterKind::VGPR: - constituentType = VGPRType::get(ctx, reg); - break; - case RegisterKind::AGPR: - constituentType = AGPRType::get(ctx, reg); - break; - default: - block->getParentOp()->emitError() - << "Unsupported register kind for range block argument"; - signalPassFailure(); - return WalkResult::interrupt(); - } - - auto it = allocatedRegisterToAllocaMap.find(constituentType); - if (it == allocatedRegisterToAllocaMap.end()) { - block->getParentOp()->emitError() - << "Alloca not found for constituent register " << constituentType - << " in range " << regType; - signalPassFailure(); - return WalkResult::interrupt(); - } - constituentAllocas.push_back(it->second); - } - - rewriter.setInsertionPointToStart(block); - Value reconstructedRange = MakeRegisterRangeOp::create( - rewriter, arg.getLoc(), constituentAllocas); - arg.replaceAllUsesWith(reconstructedRange); - block->eraseArgument(i); - } - return WalkResult::advance(); - }); - // Set post-condition: no CF branches remain. 
if (auto kernelOp = dyn_cast(op)) kernelOp.addNormalForms({NoCfBranchesAttr::get(op->getContext())}); diff --git a/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp b/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp index 7354a08b1..53a944b83 100644 --- a/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp +++ b/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp @@ -63,8 +63,8 @@ void Mem2Reg::runOnOperation() { SmallVector allocas; RegisterRange range = regType.getAsRange(); for (int16_t i = 0; i < range.size(); ++i) { - Register reg = regType.isRelocatable() - ? Register() + Register reg = !regType.hasAllocatedSemantics() + ? range.begin() : Register(range.begin().getRegister() + i); allocas.push_back(amdgcn::AllocaOp::create( rewriter, pOp.getLoc(), getRegisterType(regType, reg))); diff --git a/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp b/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp index d5cf9e5ae..8fc5ee2cc 100644 --- a/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp +++ b/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp @@ -296,6 +296,16 @@ struct WaitOpPattern : public OpRewritePattern { PatternRewriter &rewriter) const override; }; +//===----------------------------------------------------------------------===// +// CmpIOpPattern +//===----------------------------------------------------------------------===// + +struct CmpIOpPattern : public OpRewritePattern { + using Base::Base; + LogicalResult matchAndRewrite(lsir::CmpIOp op, + PatternRewriter &rewriter) const override; +}; + //===----------------------------------------------------------------------===// // PtrAddOpPattern //===----------------------------------------------------------------------===// @@ -2110,6 +2120,131 @@ PtrAddOpPattern::matchAndRewrite(PtrAddOp op, PatternRewriter &rewriter) const { return success(); } +//===----------------------------------------------------------------------===// +// CmpIOpPattern helpers +//===----------------------------------------------------------------------===// + +/// Map arith::CmpIPredicate to the appropriate s_cmp_* opcode (scalar). +static OpCode getScalarCompareOpCode(arith::CmpIPredicate predicate) { + switch (predicate) { + case arith::CmpIPredicate::eq: + return OpCode::S_CMP_EQ_I32; + case arith::CmpIPredicate::ne: + return OpCode::S_CMP_LG_I32; + case arith::CmpIPredicate::slt: + return OpCode::S_CMP_LT_I32; + case arith::CmpIPredicate::sle: + return OpCode::S_CMP_LE_I32; + case arith::CmpIPredicate::sgt: + return OpCode::S_CMP_GT_I32; + case arith::CmpIPredicate::sge: + return OpCode::S_CMP_GE_I32; + case arith::CmpIPredicate::ult: + return OpCode::S_CMP_LT_U32; + case arith::CmpIPredicate::ule: + return OpCode::S_CMP_LE_U32; + case arith::CmpIPredicate::ugt: + return OpCode::S_CMP_GT_U32; + case arith::CmpIPredicate::uge: + return OpCode::S_CMP_GE_U32; + } + llvm_unreachable("unknown CmpIPredicate"); +} + +/// Map arith::CmpIPredicate to the appropriate v_cmp_* opcode (vector, 32-bit +/// encoding). The 32-bit VOPC encoding requires rhs (src1) to be a VGPR. If +/// operands need swapping, the predicate should be flipped first. 
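+/// For example, `slt %vgpr, %sgpr` is rewritten by the caller to
+/// `sgt %sgpr, %vgpr` before this mapping, yielding V_CMP_GT_I32.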
+static OpCode getVectorCompareOpCode(arith::CmpIPredicate predicate) { + switch (predicate) { + case arith::CmpIPredicate::eq: + return OpCode::V_CMP_EQ_I32; + case arith::CmpIPredicate::ne: + return OpCode::V_CMP_NE_I32; + case arith::CmpIPredicate::slt: + return OpCode::V_CMP_LT_I32; + case arith::CmpIPredicate::sle: + return OpCode::V_CMP_LE_I32; + case arith::CmpIPredicate::sgt: + return OpCode::V_CMP_GT_I32; + case arith::CmpIPredicate::sge: + return OpCode::V_CMP_GE_I32; + case arith::CmpIPredicate::ult: + return OpCode::V_CMP_LT_U32; + case arith::CmpIPredicate::ule: + return OpCode::V_CMP_LE_U32; + case arith::CmpIPredicate::ugt: + return OpCode::V_CMP_GT_U32; + case arith::CmpIPredicate::uge: + return OpCode::V_CMP_GE_U32; + } + llvm_unreachable("unknown CmpIPredicate"); +} + +/// Swap a comparison predicate (a op b becomes b swapped_op a). +static arith::CmpIPredicate swapPredicate(arith::CmpIPredicate pred) { + switch (pred) { + case arith::CmpIPredicate::eq: + return arith::CmpIPredicate::eq; + case arith::CmpIPredicate::ne: + return arith::CmpIPredicate::ne; + case arith::CmpIPredicate::slt: + return arith::CmpIPredicate::sgt; + case arith::CmpIPredicate::sle: + return arith::CmpIPredicate::sge; + case arith::CmpIPredicate::sgt: + return arith::CmpIPredicate::slt; + case arith::CmpIPredicate::sge: + return arith::CmpIPredicate::sle; + case arith::CmpIPredicate::ult: + return arith::CmpIPredicate::ugt; + case arith::CmpIPredicate::ule: + return arith::CmpIPredicate::uge; + case arith::CmpIPredicate::ugt: + return arith::CmpIPredicate::ult; + case arith::CmpIPredicate::uge: + return arith::CmpIPredicate::ule; + } + llvm_unreachable("unknown CmpIPredicate"); +} + +//===----------------------------------------------------------------------===// +// CmpIOpPattern +//===----------------------------------------------------------------------===// + +LogicalResult CmpIOpPattern::matchAndRewrite(lsir::CmpIOp op, + PatternRewriter &rewriter) const { + Value dst = op.getDst(); + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + arith::CmpIPredicate pred = op.getPredicate(); + Location loc = op.getLoc(); + + if (isa(dst.getType())) { + // Vector compare: v_cmp_* writes to VCC. The 32-bit VOPC encoding requires + // src1 (rhs) to be a VGPR. If rhs is not a VGPR, swap operands and flip + // the predicate. + if (!isa(rhs.getType())) { + assert(isa(lhs.getType()) && + "at least one operand must be a VGPR for vector compare"); + std::swap(lhs, rhs); + pred = swapPredicate(pred); + } + Value result = + amdgcn::CmpIOp::create(rewriter, loc, getVectorCompareOpCode(pred), dst, + lhs, rhs) + .getDestRes(); + rewriter.replaceOp(op, result); + return success(); + } + + // Scalar compare: s_cmp_* writes to SCC. + Value result = amdgcn::CmpIOp::create( + rewriter, loc, getScalarCompareOpCode(pred), dst, lhs, rhs) + .getDestRes(); + rewriter.replaceOp(op, result); + return success(); +} + //===----------------------------------------------------------------------===// // ToAMDGCNPass patterns //===----------------------------------------------------------------------===// @@ -2117,11 +2252,11 @@ PtrAddOpPattern::matchAndRewrite(PtrAddOp op, PatternRewriter &rewriter) const { void mlir::aster::amdgcn::populateToAMDGCNPatterns( RewritePatternSet &patterns) { patterns.add< // Arithmetic ops. 
- AddFOpPattern, AddIOpPattern, AndIOpPattern, ExtSIOpPattern, - ExtUIOpPattern, MaximumFOpPattern, MinimumFOpPattern, MulFOpPattern, - MulIOpPattern, MulHiSIOpPattern, OrIOpPattern, ShLIOpPattern, - ShRSIOpPattern, ShRUIOpPattern, SubFOpPattern, SubIOpPattern, - XOrIOpPattern, + AddFOpPattern, AddIOpPattern, AndIOpPattern, CmpIOpPattern, + ExtSIOpPattern, ExtUIOpPattern, MaximumFOpPattern, MinimumFOpPattern, + MulFOpPattern, MulIOpPattern, MulHiSIOpPattern, OrIOpPattern, + ShLIOpPattern, ShRSIOpPattern, ShRUIOpPattern, SubFOpPattern, + SubIOpPattern, XOrIOpPattern, // Memory ops. AllocaOpPattern, AssumeNoaliasOpPattern, LoadOpPattern, StoreOpPattern, // Data movement ops. diff --git a/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp b/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp index a2733424d..fb0e2be48 100644 --- a/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp +++ b/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp @@ -396,7 +396,8 @@ void ToRegisterSemantics::runOnOperation() { RewritePatternSet patterns(ctx); patterns .add, + SplitRegisterRangePattern, GenericOpPattern, + GenericOpPattern, GenericOpPattern, DeallocCastOpPattern>( ctx); if (failed(applyPatternsGreedily( diff --git a/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp b/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp index 01d45a449..b7e43179e 100644 --- a/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp +++ b/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp @@ -16,6 +16,8 @@ #include "aster/CodeGen/CodeGen.h" #include "aster/Dialect/AMDGCN/IR/AMDGCNOps.h" +#include "aster/Dialect/AMDGCN/IR/AMDGCNTypes.h" +#include "aster/Dialect/AMDGCN/IR/Interfaces/AMDGCNRegisterTypeInterface.h" #include "aster/Dialect/AsterUtils/IR/AsterUtilsOps.h" #include "aster/Dialect/LSIR/IR/LSIRDialect.h" #include "aster/Dialect/LSIR/IR/LSIROps.h" @@ -128,12 +130,20 @@ struct ArithMinMaxOpPattern : public OpCodeGenPattern { Location loc = op.getLoc(); Value lhs = adaptor.getLhs(), rhs = adaptor.getRhs(); // Lower to lsir.cmpi + lsir.select directly (skip arith intermediates). + // lsir.cmpi uses DPS: determine the compare dst type (SCC for scalar ops, + // VCC for vector ops) from the lhs register kind. Type regType = this->converter.convertType(op); Value dst = this->createAlloca(rewriter, loc, regType); + Type cmpDstType = + isa(lhs.getType()) + ? 
Type(amdgcn::VCCType::get(rewriter.getContext(), Register())) + : Type(amdgcn::SCCType::get(rewriter.getContext(), Register())); + Value cmpDst = this->createAlloca(rewriter, loc, cmpDstType); Value cmp = lsir::CmpIOp::create( - rewriter, loc, rewriter.getI1Type(), - TypeAttr::get(op.getLhs().getType()), - arith::CmpIPredicateAttr::get(rewriter.getContext(), pred), lhs, rhs); + rewriter, loc, TypeAttr::get(op.getLhs().getType()), + arith::CmpIPredicateAttr::get(rewriter.getContext(), pred), + cmpDst, lhs, rhs) + .getDstRes(); rewriter.replaceOpWithNewOp(op, dst, cmp, lhs, rhs); return success(); } @@ -271,10 +281,12 @@ LogicalResult ArithCmpIOpPattern::matchAndRewrite(arith::CmpIOp op, arith::CmpIOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const { - // lsir.cmpi returns i1 directly, operands are converted to registers - rewriter.replaceOpWithNewOp( - op, rewriter.getI1Type(), TypeAttr::get(op.getLhs().getType()), - op.getPredicateAttr(), adaptor.getLhs(), adaptor.getRhs()); + Type dstType = converter.convertType(op.getResult()); + Value dst = createAlloca(rewriter, op.getLoc(), dstType); + auto cmpOp = lsir::CmpIOp::create( + rewriter, op.getLoc(), TypeAttr::get(op.getLhs().getType()), + op.getPredicateAttr(), dst, adaptor.getLhs(), adaptor.getRhs()); + rewriter.replaceOp(op, cmpOp); return success(); } @@ -286,10 +298,12 @@ LogicalResult ArithCmpFOpPattern::matchAndRewrite(arith::CmpFOp op, arith::CmpFOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const { - // lsir.cmpf returns i1 directly, operands are converted to registers - rewriter.replaceOpWithNewOp( - op, rewriter.getI1Type(), TypeAttr::get(op.getLhs().getType()), - op.getPredicateAttr(), adaptor.getLhs(), adaptor.getRhs()); + Type dstType = amdgcn::VCCType::get(op.getContext(), Register()); + Value dst = createAlloca(rewriter, op.getLoc(), dstType); + auto cmpOp = lsir::CmpFOp::create( + rewriter, op.getLoc(), TypeAttr::get(op.getLhs().getType()), + op.getPredicateAttr(), dst, adaptor.getLhs(), adaptor.getRhs()); + rewriter.replaceOp(op, cmpOp); return success(); } @@ -337,7 +351,13 @@ LogicalResult CFCondBranchOpPattern::matchAndRewrite( ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - // Convert operands to match the expected converted block argument types + // The condition must already be a register type (from lsir.cmpi). If it is + // still i1 (e.g. from lsir.cmpf), we cannot convert this op. + Value cond = adaptor.getCondition(); + if (!isa(cond.getType())) + return failure(); + + // Convert operands to match the expected converted block argument types. SmallVector trueOperands = convertBranchOperands(adaptor.getTrueDestOperands(), op.getTrueDest(), *getTypeConverter(), rewriter, loc); @@ -345,8 +365,8 @@ LogicalResult CFCondBranchOpPattern::matchAndRewrite( convertBranchOperands(adaptor.getFalseDestOperands(), op.getFalseDest(), *getTypeConverter(), rewriter, loc); - rewriter.replaceOpWithNewOp( - op, op.getCondition(), op.getTrueDest(), trueOperands, op.getFalseDest(), + rewriter.replaceOpWithNewOp( + op, cond, op.getTrueDest(), trueOperands, op.getFalseDest(), falseOperands); return success(); } @@ -361,12 +381,12 @@ CFBranchOpPattern::matchAndRewrite(cf::BranchOp op, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - // Convert operands to match the expected converted block argument types + // Convert operands to match the expected converted block argument types. 
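+  // Illustrative sketch (names are hypothetical, not from this patch):
+  // `cf.br ^bb1(%x : i32)` whose operand converts to !amdgcn.sgpr is rebuilt
+  // as `lsir.br ^bb1(%x_reg : !amdgcn.sgpr)` so the successor argument types
+  // line up.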
SmallVector destOperands = convertBranchOperands(adaptor.getDestOperands(), op.getDest(), *getTypeConverter(), rewriter, loc); - rewriter.replaceOpWithNewOp(op, op.getDest(), destOperands); + rewriter.replaceOpWithNewOp(op, op.getDest(), destOperands); return success(); } @@ -395,16 +415,13 @@ KernelOpConversion::matchAndRewrite(amdgcn::KernelOp op, LogicalResult ArithSelectOpPattern::matchAndRewrite( arith::SelectOp op, arith::SelectOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const { + // lsir.select requires a register condition. If the condition is still i1 + // (e.g. from lsir.cmpf), we cannot convert this op. + Value cond = adaptor.getCondition(); + if (!isa(cond.getType())) + return failure(); Type type = this->converter.convertType(op); Value dst = this->createAlloca(rewriter, op.getLoc(), type); - // If the original condition comes from lsir.cmpi/cmpf (i1 result), use the - // original value to avoid the type converter wrapping it in a cast. - // For block-argument i1 conditions, the type converter correctly maps them - // to register types, so we use the adapted value. - Value cond = op.getCondition().getDefiningOp() || - op.getCondition().getDefiningOp() - ? op.getCondition() - : adaptor.getCondition(); rewriter.replaceOpWithNewOp( op, dst, cond, adaptor.getTrueValue(), adaptor.getFalseValue()); return success(); @@ -454,40 +471,24 @@ void mlir::aster::lsir::populateCodeGenPatterns(CodeGenConverter &converter, lsir::FromRegOp, lsir::ToRegOp, lsir::RegConstraintOp>(); target.addLegalOp(); - // arith.cmpi/cmpf are always converted to lsir counterparts. - // They return i1 but their operands are converted to register types. + // arith.cmpi is converted to lsir.cmpi (DPS, returns SCC/VCC register). + // arith.cmpf is converted to lsir.cmpf (returns i1 for now). target.addIllegalOp(); - // Helper to check if operands are legal for CF ops. Operands are legal if - // they are register types OR if they come from constants (which stay scalar). - auto cfOperandsLegal = [&](ValueRange operands) { - return llvm::all_of(operands, [&](Value v) { - Type t = v.getType(); - // Register types are legal - if (isa(t)) - return true; - return false; - }); - }; - - // CF dialect ops are dynamically legal when their branch operands are either - // register types or constants. The condition stays as i1. - target.addDynamicallyLegalOp([&](cf::CondBranchOp op) { - return cfOperandsLegal(op.getTrueDestOperands()) && - cfOperandsLegal(op.getFalseDestOperands()); - }); - target.addDynamicallyLegalOp( - [&](cf::BranchOp op) { return cfOperandsLegal(op.getDestOperands()); }); + // CF dialect branch ops are always illegal — they must be replaced by the + // corresponding lsir.br / lsir.cond_br ops that carry register conditions. + target.addIllegalOp(); // KernelOp is dynamically legal - it becomes legal once the // KernelOpConversion pattern has converted all block argument types. // Start as illegal to ensure the pattern runs. target.addDynamicallyLegalOp([&](amdgcn::KernelOp op) { - // Check if any block in the body has non-register, non-i1 arguments + // Check if any block in the body has non-register arguments. Token types + // are always legal. 
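+    // For example (illustrative): block arguments of register type such as
+    // !amdgcn.vgpr, or token arguments, keep the kernel legal; a leftover i1
+    // argument marks it illegal so the KernelOpConversion pattern runs.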
for (Block &block : op.getBodyRegion()) { for (BlockArgument arg : block.getArguments()) { Type t = arg.getType(); - if (!isa(t) && !t.isInteger(1)) + if (!isa(t)) return false; } } @@ -538,10 +539,9 @@ void mlir::aster::lsir::populateCodeGenPatterns(CodeGenConverter &converter, // These patterns go together for proper composable control-flow // support. CF patterns need the type converter to handle block // argument conversion. KernelOp conversion handles block - // argument types in kernel bodies. Cmp ops are converted to lsir - // counterparts returning i1, which persists late in the pipeline - // and is only translated to SCC after register allocation, - // together with cf branch operations. + // argument types in kernel bodies. arith.cmpi is converted to + // lsir.cmpi (DPS, SCC/VCC dst); cf.br/cf.cond_br are replaced + // by lsir.br/lsir.cond_br that carry register conditions. ArithCmpIOpPattern, ArithCmpFOpPattern, CFCondBranchOpPattern, CFBranchOpPattern, KernelOpConversion, AssumeUniformOpPattern // That's all folks! diff --git a/lib/Dialect/LSIR/IR/LSIROps.cpp b/lib/Dialect/LSIR/IR/LSIROps.cpp index 917028577..6395ea5af 100644 --- a/lib/Dialect/LSIR/IR/LSIROps.cpp +++ b/lib/Dialect/LSIR/IR/LSIROps.cpp @@ -18,6 +18,7 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/InliningUtils.h" @@ -198,6 +199,35 @@ LogicalResult RegCastOp::canonicalize(RegCastOp op, return failure(); } +//===----------------------------------------------------------------------===// +// LSIR BranchOp +//===----------------------------------------------------------------------===// + +SuccessorOperands BranchOp::getSuccessorOperands(unsigned index) { + assert(index == 0 && "invalid successor index"); + return SuccessorOperands(getDestOperandsMutable()); +} + +Block *BranchOp::getSuccessorForOperands(ArrayRef) { + return getDest(); +} + +//===----------------------------------------------------------------------===// +// LSIR CondBranchOp +//===----------------------------------------------------------------------===// + +SuccessorOperands CondBranchOp::getSuccessorOperands(unsigned index) { + assert(index < 2 && "invalid successor index"); + return SuccessorOperands(index == 0 ? getTrueDestOperandsMutable() + : getFalseDestOperandsMutable()); +} + +Block *CondBranchOp::getSuccessorForOperands(ArrayRef operands) { + if (auto condAttr = dyn_cast_or_null(operands.front())) + return condAttr.getValue().isOne() ? 
getTrueDest() : getFalseDest(); + return nullptr; +} + //===----------------------------------------------------------------------===// // LSIR IncGen //===----------------------------------------------------------------------===// diff --git a/lib/Target/ASM/TranslateModule.cpp b/lib/Target/ASM/TranslateModule.cpp index 66823b2b0..04c37115f 100644 --- a/lib/Target/ASM/TranslateModule.cpp +++ b/lib/Target/ASM/TranslateModule.cpp @@ -298,8 +298,8 @@ FailureOr RegisterUsage::countKernelRegisters(KernelOp kernel) { DenseSet usedAGPRs; auto result = kernel.walk([&](AllocaOp op) -> WalkResult { AMDGCNRegisterTypeInterface type = op.getType(); - if (type.isRelocatable()) { - op->emitError() << "expected non-relocatable registers"; + if (!type.hasAllocatedSemantics()) { + op->emitError() << "expected allocated registers"; return WalkResult::interrupt(); } RegisterRange range = type.getAsRange(); diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index 5e094e79c..c002ac148 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(AsterTransforms AffineOptimizePtrAdd.cpp CanonicalizePtrs.cpp ConstexprExpansion.cpp + ConvertSCFControlFlow.cpp FactorizeAffineExpr.cpp DecomposeMemrefIterArgs.cpp LDSMultibufferPrep.cpp @@ -31,6 +32,7 @@ add_mlir_library(AsterTransforms MLIRAffineTransforms MLIRArithDialect MLIRArithTransforms + MLIRControlFlowDialect MLIRControlFlowInterfaces MLIRFuncDialect MLIRGPUDialect diff --git a/lib/Dialect/AMDGCN/Transforms/ConvertSCFControlFlow.cpp b/lib/Transforms/ConvertSCFControlFlow.cpp similarity index 73% rename from lib/Dialect/AMDGCN/Transforms/ConvertSCFControlFlow.cpp rename to lib/Transforms/ConvertSCFControlFlow.cpp index 25e544680..5fd243d51 100644 --- a/lib/Dialect/AMDGCN/Transforms/ConvertSCFControlFlow.cpp +++ b/lib/Transforms/ConvertSCFControlFlow.cpp @@ -1,4 +1,4 @@ -//===- ConvertSCFControlFlow.cpp - SCF to AMDGCN control flow conversion --===// +//===- ConvertSCFControlFlow.cpp - SCF to CF control flow conversion ------===// // // Copyright 2025 The ASTER Authors // @@ -9,18 +9,12 @@ //===----------------------------------------------------------------------===// // // This file implements the pass that converts SCF control flow operations to -// CF dialect operations with explicit basic block structure. The pass uses -// thread uniform analysis to ensure loops are uniform before conversion. +// CF dialect operations with explicit basic block structure. 
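+// For example (a sketch of the resulting structure): `scf.for %i = %lb to
+// %ub step %s` becomes an entry block that branches into a body block
+// carrying %i as a block argument, with a cf.cond_br back edge and a
+// separate exit block.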
// //===----------------------------------------------------------------------===// -#include "aster/Dialect/AMDGCN/Transforms/Passes.h" +#include "aster/Transforms/Passes.h" -#include "aster/Analysis/ABIAnalysis.h" -#include "aster/Dialect/AMDGCN/IR/AMDGCNAttrs.h" -#include "aster/Dialect/AMDGCN/IR/AMDGCNDialect.h" -#include "aster/Dialect/AMDGCN/IR/AMDGCNOps.h" -#include "aster/Dialect/LSIR/IR/LSIRDialect.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -29,10 +23,8 @@ #include "mlir/Transforms/DialectConversion.h" namespace mlir::aster { -namespace amdgcn { #define GEN_PASS_DEF_CONVERTSCFCONTROLFLOW -#include "aster/Dialect/AMDGCN/Transforms/Passes.h.inc" -} // namespace amdgcn +#include "aster/Transforms/Passes.h.inc" } // namespace mlir::aster using namespace mlir; @@ -44,22 +36,20 @@ namespace { //===----------------------------------------------------------------------===// struct ConvertSCFControlFlow - : public amdgcn::impl::ConvertSCFControlFlowBase { + : public aster::impl::ConvertSCFControlFlowBase { public: using Base::Base; void runOnOperation() override; private: /// Convert a scf.for operation to CF dialect control flow. - LogicalResult convertForOp(scf::ForOp forOp, const ABIAnalysis &abiAnalysis); + LogicalResult convertForOp(scf::ForOp forOp); /// Convert a scf.if operation to CF dialect control flow. - LogicalResult convertIfOp(scf::IfOp ifOp, const ABIAnalysis &abiAnalysis); + LogicalResult convertIfOp(scf::IfOp ifOp); }; -LogicalResult -ConvertSCFControlFlow::convertForOp(scf::ForOp forOp, - const ABIAnalysis &abiAnalysis) { +LogicalResult ConvertSCFControlFlow::convertForOp(scf::ForOp forOp) { Location loc = forOp.getLoc(); IRRewriter rewriter(forOp); @@ -70,34 +60,6 @@ ConvertSCFControlFlow::convertForOp(scf::ForOp forOp, Type ivType = forOp.getInductionVar().getType(); - // Check if a value is i32 or index_cast from i32. - auto isI32OrCastFromI32 = [](Value v) { - if (v.getType().isInteger(32)) - return true; - if (v.getType().isIndex()) { - if (auto castOp = v.getDefiningOp()) - return castOp.getIn().getType().isInteger(32); - } - return false; - }; - - // Only i32 (or index_cast from i32) bounds are supported. - if (!isI32OrCastFromI32(lowerBound) || !isI32OrCastFromI32(upperBound) || - !isI32OrCastFromI32(step)) { - return forOp.emitError() - << "only i32 induction variables are supported in this conversion " - "(bounds must be i32 or arith.index_cast from i32)"; - } - - // Check if the loop is thread-uniform. - bool isUniform = abiAnalysis.isThreadUniform(lowerBound).value_or(false) && - abiAnalysis.isThreadUniform(upperBound).value_or(false) && - abiAnalysis.isThreadUniform(step).value_or(false); - if (!isUniform) { - return forOp.emitError() - << "only thread-uniform loops are supported in this conversion"; - } - // Get the yield op and its operands before modifying the body. auto yieldOp = cast(forOp.getBody()->getTerminator()); SmallVector yieldOperands(yieldOp.getOperands()); @@ -137,9 +99,6 @@ ConvertSCFControlFlow::convertForOp(scf::ForOp forOp, iterArgBlockArgs.push_back(bbBody->getArgument(i + 1)); // Build the mapping from original region args to block args. - // This mapping is used by inlineBlockBefore to remap the body, but - // yieldOperands may also reference old block args (e.g., swap patterns - // like `scf.yield %b, %a`), so we must remap them too. 
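+  // Illustrative example (not in the original change): for
+  // `scf.for %i = ... iter_args(%a, %b)`, bodyArgMapping pairs [%i, %a, %b]
+  // with the new block's [iv, iter-arg] arguments before inlining the body.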
   SmallVector<Value> bodyArgMapping = {ivBlockArg};
   bodyArgMapping.append(iterArgBlockArgs);
@@ -155,9 +114,7 @@
   rewriter.inlineBlockBefore(forOp.getBody(), bbBody, bbBody->end(),
                              bodyArgMapping);
 
-  // Remap yield operands: they may reference old block args (now dead)
-  // from the original for body. After inlining, those block args have been
-  // replaced, but yieldOperands still holds the old Value references.
+  // Remap yield operands: they may reference old block args (now dead).
   for (Value &val : yieldOperands)
     val = blockArgMapping.lookupOrDefault(val);
 
@@ -181,20 +138,12 @@
   return success();
 }
 
-LogicalResult
-ConvertSCFControlFlow::convertIfOp(scf::IfOp ifOp,
-                                   const ABIAnalysis &abiAnalysis) {
+LogicalResult ConvertSCFControlFlow::convertIfOp(scf::IfOp ifOp) {
   Location loc = ifOp.getLoc();
   IRRewriter rewriter(ifOp);
 
   Value condition = ifOp.getCondition();
 
-  // Check if the condition is thread-uniform.
-  if (!abiAnalysis.isThreadUniform(condition).value_or(false)) {
-    return ifOp.emitError()
-           << "only thread-uniform conditions are supported in this conversion";
-  }
-
   bool hasElse = !ifOp.getElseRegion().empty();
 
   // Capture yield operands and block pointers before modifying anything.
@@ -248,9 +197,6 @@ void ConvertSCFControlFlow::runOnOperation() {
   Operation *op = getOperation();
 
-  // Get the ABI analysis which includes thread uniform analysis.
-  auto &abiAnalysis = getAnalysis<ABIAnalysis>();
-
   // Collect all SCF operations first to avoid modifying while iterating.
   // Walk is post-order (inner before outer), but we need top-down order
   // (outer before inner) so that converting an outer op inlines the body
@@ -266,9 +212,9 @@
   for (Operation *scfOp : scfOps) {
     LogicalResult result = success();
     if (auto forOp = dyn_cast<scf::ForOp>(scfOp))
-      result = convertForOp(forOp, abiAnalysis);
+      result = convertForOp(forOp);
     else if (auto ifOp = dyn_cast<scf::IfOp>(scfOp))
-      result = convertIfOp(ifOp, abiAnalysis);
+      result = convertIfOp(ifOp);
     if (failed(result)) {
       signalPassFailure();
       return;
@@ -290,10 +236,6 @@
       signalPassFailure();
     }
   });
-
-  // Set post-condition: no SCF ops remain.
-  if (auto kernelOp = dyn_cast<amdgcn::KernelOp>(op))
-    kernelOp.addNormalForms({amdgcn::NoScfOpsAttr::get(op->getContext())});
 }
 
 } // namespace
diff --git a/python/aster/pass_pipelines.py b/python/aster/pass_pipelines.py
index f19ae2248..b5d190120 100644
--- a/python/aster/pass_pipelines.py
+++ b/python/aster/pass_pipelines.py
@@ -198,7 +198,7 @@ def phase_scf_pipelining(lcm_unroll=True, unroll_factor_multiplier=1,
         # Convert SCF control flow to AMDGCN control flow
         # Note: control flow support is very limited atm, add NORMAL FORMS
         # to harden invariants.
- "amdgcn-convert-scf-control-flow", + "aster-convert-scf-control-flow", "canonicalize", "cse", "aster-codegen", "canonicalize", "cse", "canonicalize", diff --git a/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir b/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir index 419fca06b..05f485466 100644 --- a/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir +++ b/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir @@ -36,25 +36,25 @@ func.func @cdna3_store_hazard_detected(%arg0: !amdgcn.vgpr<0>, %arg1: !amdgcn.vg //===----------------------------------------------------------------------===// // CHECK-LABEL: Symbol: cdna3_vcc_vccz_hazard_detected -// CHECK: Op: func.func @cdna3_vcc_vccz_hazard_detected(%{{.*}}: !amdgcn.vcc, %{{.*}}: !amdgcn.vccz, %{{.*}}: !amdgcn.vgpr<0>, %{{.*}}: !amdgcn.vgpr<1>, %{{.*}}: !amdgcn.vgpr<2>) {...} +// CHECK: Op: func.func @cdna3_vcc_vccz_hazard_detected(%{{.*}}: !amdgcn.vcc<0>, %{{.*}}: !amdgcn.vccz<0>, %{{.*}}: !amdgcn.vgpr<0>, %{{.*}}: !amdgcn.vgpr<1>, %{{.*}}: !amdgcn.vgpr<2>) {...} // CHECK: HAZARD STATE AFTER: -// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) +// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) // CHECK: HAZARD STATE AFTER: { // CHECK: active = [ -// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>), none, {v:5, s:0, ds:0}} +// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>), none, {v:5, s:0, ds:0}} // CHECK: ] // CHECK: nop counts = {v:0, s:0, ds:0} // CHECK: } -// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vccz, !amdgcn.vgpr<2>) +// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vccz<0>, !amdgcn.vgpr<2>) // CHECK: HAZARD STATE AFTER: { // CHECK: active = [ -// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vccz, !amdgcn.vgpr<2>), none, {v:5, s:0, ds:0}} +// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vccz<0>, !amdgcn.vgpr<2>), none, {v:5, s:0, ds:0}} // CHECK: ] // CHECK: nop counts = {v:5, s:0, ds:0} // CHECK: } -func.func @cdna3_vcc_vccz_hazard_detected(%arg0: !amdgcn.vcc, %arg1: !amdgcn.vccz, %arg2: !amdgcn.vgpr<0>, %arg3: !amdgcn.vgpr<1>, %arg4: !amdgcn.vgpr<2>) { - amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg2, %arg3 : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) - amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg1, %arg4 : outs(!amdgcn.vcc) ins(!amdgcn.vccz, !amdgcn.vgpr<2>) +func.func @cdna3_vcc_vccz_hazard_detected(%arg0: !amdgcn.vcc<0>, %arg1: !amdgcn.vccz<0>, %arg2: !amdgcn.vgpr<0>, %arg3: !amdgcn.vgpr<1>, %arg4: !amdgcn.vgpr<2>) { + amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg2, %arg3 : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) + amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg1, %arg4 : outs(!amdgcn.vcc<0>) ins(!amdgcn.vccz<0>, !amdgcn.vgpr<2>) return } diff --git a/test/Dialect/AMDGCN/Analysis/range-constraints.mlir b/test/Dialect/AMDGCN/Analysis/range-constraints.mlir index 15b126ccf..5098a4540 100644 --- 
a/test/Dialect/AMDGCN/Analysis/range-constraints.mlir +++ b/test/Dialect/AMDGCN/Analysis/range-constraints.mlir @@ -260,14 +260,16 @@ amdgcn.module @range_tests target = isa = { // CHECK: results: [5 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [6 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [7 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [8 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [9 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [10 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [11 = `%{{.*}}`] // CHECK: Symbol: phi_coalescing_2 // CHECK: No range constraints amdgcn.module @range_tests target = isa = { @@ -281,9 +283,10 @@ amdgcn.module @range_tests target = isa = { %5 = alloca : !amdgcn.vgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %8 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc + %8 = lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %9 = alloca : !amdgcn.vgpr - cf.cond_br %8, ^bb1, ^bb2 + lsir.cond_br %8 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 test_inst outs %4 ins %0 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () %11 = alloca : !amdgcn.vgpr @@ -314,10 +317,12 @@ amdgcn.module @range_tests target = isa = { // CHECK: results: [3 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.sgpr` // CHECK: results: [4 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [5 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [6 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [7 = `%{{.*}}`] // CHECK: Symbol: phi_coalescing_3 // CHECK: No range constraints amdgcn.module @range_tests target = isa = { @@ -329,9 +334,10 @@ amdgcn.module @range_tests target = isa = { %3 = alloca : !amdgcn.sgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %6 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc + %6 = lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %7 = alloca : !amdgcn.vgpr - cf.cond_br %6, ^bb1, ^bb2 + lsir.cond_br %6 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 lsir.copy %7, %0 : !amdgcn.vgpr, !amdgcn.vgpr cf.br ^bb3 diff --git a/test/Dialect/AMDGCN/Analysis/register-interference.mlir b/test/Dialect/AMDGCN/Analysis/register-interference.mlir index 6e1c8d87b..dc756f006 100644 --- a/test/Dialect/AMDGCN/Analysis/register-interference.mlir +++ b/test/Dialect/AMDGCN/Analysis/register-interference.mlir @@ -256,9 +256,10 @@ amdgcn.module @interference_tests target = isa = { %5 = alloca : !amdgcn.vgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, 
!amdgcn.sgpr) -> () - %8 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc<0>, !amdgcn.sgpr, i32 %9 = alloca : !amdgcn.vgpr - cf.cond_br %8, ^bb1, ^bb2 + lsir.cond_br %scc : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 test_inst outs %4 ins %0 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () %11 = alloca : !amdgcn.vgpr @@ -298,9 +299,10 @@ amdgcn.module @interference_tests target = isa = { %3 = alloca : !amdgcn.sgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %6 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc<0>, !amdgcn.sgpr, i32 %7 = alloca : !amdgcn.vgpr - cf.cond_br %6, ^bb1, ^bb2 + lsir.cond_br %scc : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 lsir.copy %7, %0 : !amdgcn.vgpr, !amdgcn.vgpr cf.br ^bb3 diff --git a/test/Dialect/AMDGCN/Analysis/register-liveness.mlir b/test/Dialect/AMDGCN/Analysis/register-liveness.mlir index 0ad368af4..33ee3d1bf 100644 --- a/test/Dialect/AMDGCN/Analysis/register-liveness.mlir +++ b/test/Dialect/AMDGCN/Analysis/register-liveness.mlir @@ -526,14 +526,16 @@ amdgcn.kernel @reg_interference { // CHECK: results: [5 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [6 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [7 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [8 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [9 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [10 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [11 = `%{{.*}}`] // CHECK: Op: module {...} // CHECK: LIVE BEFORE: [] // CHECK: Symbol: phi_coalescing_2 @@ -557,12 +559,14 @@ amdgcn.kernel @reg_interference { // CHECK: LIVE BEFORE: [3 = `%{{.*}}`, 4 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.sgpr) -> () // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 3 = `%{{.*}}`, 4 = `%{{.*}}`] -// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: Op: %{{.*}} = lsir.alloca : !amdgcn.scc +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] +// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] -// CHECK: Op: cf.cond_br %{{.*}}, ^bb1, ^bb2 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 8 = `%{{.*}}`] +// CHECK: Op: lsir.cond_br %{{.*}} : !amdgcn.scc, ^bb1, ^bb2 +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 8 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [1 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr @@ -570,9 +574,9 @@ amdgcn.kernel @reg_interference { // CHECK: Op: amdgcn.test_inst outs %{{.*}} : 
(!amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [] // CHECK: Op: lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [10 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [8 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [2 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr @@ -580,11 +584,11 @@ amdgcn.kernel @reg_interference { // CHECK: Op: amdgcn.test_inst outs %{{.*}} : (!amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [] // CHECK: Op: lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: LIVE BEFORE: [10 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [11 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [8 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst ins %{{.*}} : (!amdgcn.vgpr) -> () -// CHECK: LIVE BEFORE: [8 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] // CHECK: Op: amdgcn.end_kernel // CHECK: LIVE BEFORE: [] amdgcn.kernel @phi_coalescing_2 { @@ -597,9 +601,10 @@ amdgcn.kernel @phi_coalescing_2 { %5 = alloca : !amdgcn.vgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %8 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc8 = lsir.alloca : !amdgcn.scc + %8 = lsir.cmpi i32 eq %scc8, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %9 = alloca : !amdgcn.vgpr - cf.cond_br %8, ^bb1, ^bb2 + lsir.cond_br %8 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 test_inst outs %4 ins %0 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () %11 = alloca : !amdgcn.vgpr @@ -629,10 +634,12 @@ amdgcn.kernel @phi_coalescing_2 { // CHECK: results: [3 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.sgpr` // CHECK: results: [4 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [5 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [6 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [7 = `%{{.*}}`] // CHECK: Op: module {...} // CHECK: LIVE BEFORE: [] // CHECK: Symbol: phi_coalescing_3 @@ -652,22 +659,24 @@ amdgcn.kernel @phi_coalescing_2 { // CHECK: LIVE BEFORE: [3 = `%{{.*}}`, 4 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.sgpr) -> () // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 3 = `%{{.*}}`, 4 = `%{{.*}}`] -// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: Op: %{{.*}} = lsir.alloca : !amdgcn.scc +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] +// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] -// CHECK: Op: cf.cond_br %{{.*}}, ^bb1, ^bb2 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: Op: lsir.cond_br %{{.*}} : !amdgcn.scc, ^bb1, ^bb2 +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] // CHECK: Op: 
lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 7 = `%{{.*}}`] // CHECK: Op: lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 7 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst ins %{{.*}}, %{{.*}}, %{{.*}} : (!amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr) -> () -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 7 = `%{{.*}}`] // CHECK: Op: amdgcn.end_kernel // CHECK: LIVE BEFORE: [] amdgcn.kernel @phi_coalescing_3 { @@ -678,9 +687,10 @@ amdgcn.kernel @phi_coalescing_3 { %3 = alloca : !amdgcn.sgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %6 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc6 = lsir.alloca : !amdgcn.scc + %6 = lsir.cmpi i32 eq %scc6, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %7 = alloca : !amdgcn.vgpr - cf.cond_br %6, ^bb1, ^bb2 + lsir.cond_br %6 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 lsir.copy %7, %0 : !amdgcn.vgpr, !amdgcn.vgpr cf.br ^bb3 @@ -780,15 +790,18 @@ amdgcn.kernel @test_empty_kernel { // CHECK: LIVE BEFORE: [] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.sgpr // CHECK: LIVE BEFORE: [] -// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: Op: %{{.*}} = lsir.alloca : !amdgcn.scc // CHECK: LIVE BEFORE: [1 = `%{{.*}}`] -// CHECK: Op: cf.cond_br %{{.*}}, ^bb1, ^bb2 -// CHECK: LIVE BEFORE: [] +// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`] +// CHECK: Op: lsir.cond_br %{{.*}} : !amdgcn.scc, ^bb1, ^bb2 +// CHECK: LIVE BEFORE: [3 = `%{{.*}}`] amdgcn.kernel @test_non_register_filtered { %c0 = arith.constant 0 : i32 %0 = alloca : !amdgcn.sgpr - %cond = lsir.cmpi i32 eq %0, %c0 : !amdgcn.sgpr, i32 - cf.cond_br %cond, ^bb1, ^bb2 + %scc_dst = lsir.alloca : !amdgcn.scc + %cond = lsir.cmpi i32 eq %scc_dst, %0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + lsir.cond_br %cond : !amdgcn.scc, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: diff --git a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir index 2dfa1a000..26354a114 100644 --- a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir +++ b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir @@ -4,9 +4,9 @@ amdgcn.module @allowed_cmpi target = #amdgcn.target isa = #amdgcn.isa attributes {normal_forms = [#amdgcn.no_lsir_compute_ops]} { - func.func @f(%a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> i1 { - %cmp = lsir.cmpi i32 slt %a, %b : !amdgcn.sgpr, !amdgcn.sgpr - return %cmp : i1 + func.func @f(%dst: !amdgcn.scc, %a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> !amdgcn.scc { + %cmp = lsir.cmpi i32 slt %dst, %a, %b : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %cmp : !amdgcn.scc } } diff --git a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir index 03da996dd..966375010 100644 --- a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir +++ 
b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir @@ -4,9 +4,9 @@ amdgcn.module @rejected_cmpi target = #amdgcn.target isa = #amdgcn.isa attributes {normal_forms = [#amdgcn.no_lsir_control_ops]} { - func.func @f(%a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> i1 { + func.func @f(%dst: !amdgcn.scc, %a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> !amdgcn.scc { // expected-error @+1 {{normal form violation: LSIR control-flow operations are disallowed but found: lsir.cmpi}} - %cmp = lsir.cmpi i32 slt %a, %b : !amdgcn.sgpr, !amdgcn.sgpr - return %cmp : i1 + %cmp = lsir.cmpi i32 slt %dst, %a, %b : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %cmp : !amdgcn.scc } } diff --git a/test/Dialect/AMDGCN/Transforms/bufferization.mlir b/test/Dialect/AMDGCN/Transforms/bufferization.mlir index c14d53a36..338ee89b9 100644 --- a/test/Dialect/AMDGCN/Transforms/bufferization.mlir +++ b/test/Dialect/AMDGCN/Transforms/bufferization.mlir @@ -15,12 +15,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_3]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: %[[VAL_4:.*]] = test_inst outs %[[COPY_0]] : (!amdgcn.vgpr) -> !amdgcn.vgpr @@ -54,12 +54,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () @@ -95,12 +95,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_3]] : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.sgpr, !amdgcn.sgpr // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.sgpr) -> () @@ -139,12 +139,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_5]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_6]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () @@ -241,7 +241,7 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb1 // CHECK: ^bb1: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_4]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[CALL_0:.*]] = func.call @rand() : () -> i1 // CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr @@ -250,7 +250,7 @@ func.func private @rand() -> i1 // CHECK: cf.cond_br %[[CALL_0]], ^bb3, 
^bb4 // CHECK: ^bb3: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_6]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb4: // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () // CHECK: end_kernel @@ -290,7 +290,7 @@ func.func private @rand() -> i1 // CHECK: ^bb1: // CHECK: lsir.copy %[[VAL_2]], %[[VAL_7]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[VAL_8]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[CALL_0:.*]] = func.call @rand() : () -> i1 // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_3]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr @@ -299,7 +299,7 @@ func.func private @rand() -> i1 // CHECK: ^bb3: // CHECK: lsir.copy %[[VAL_2]], %[[COPY_1]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[COPY_0]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb4: // CHECK: test_inst ins %[[COPY_0]], %[[COPY_1]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () // CHECK: end_kernel @@ -346,13 +346,13 @@ func.func private @rand() -> i1 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_2]], %[[VAL_9]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[VAL_10]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_2]], %[[VAL_11]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[VAL_12]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_3]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: %[[COPY_1:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr @@ -397,7 +397,7 @@ func.func private @rand() -> i1 // CHECK: ^bb1: // CHECK: lsir.copy %[[ALLOCA_2]], %[[TEST_INST_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[ALLOCA_0]], %[[TEST_INST_1]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[ALLOCA_1]], %[[ALLOCA_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: amdgcn.test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () @@ -427,14 +427,14 @@ func.func @test_copy_loc() { // CHECK: cf.br ^bb1 // CHECK: ^bb1: // CHECK: lsir.copy %[[ALLOCA_0]], %[[TEST_INST_0]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[VAL_0:.*]] = call @rand() : () -> i1 // CHECK: %[[COPY_0:.*]] = lsir.copy %[[ALLOCA_1]], %[[ALLOCA_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: cf.cond_br %[[VAL_0]], ^bb3, ^bb4 // CHECK: ^bb3: // CHECK: lsir.copy %[[ALLOCA_0]], %[[COPY_0]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb4: // CHECK: %[[ALLOCA_3:.*]] = amdgcn.alloca : !amdgcn.vgpr // CHECK: %[[TEST_INST_1:.*]] = amdgcn.test_inst outs %[[ALLOCA_3]] ins %[[COPY_0]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> !amdgcn.vgpr diff --git a/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir b/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir index 942492a7c..d04ed2640 100644 --- a/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir +++ b/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir @@ -11,12 +11,12 @@ // 4. Chained selects and loop counter advance all get concrete registers. 
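// As an illustrative sketch, the mux under test computes
//   outer = select(buf_idx == 0, buf0_val, select(buf_idx == 1, buf1_val, buf2_val))
// so both selects and the compares feeding them must get concrete registers.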
 //
 // CHECK-LABEL: kernel @chained_select_loop_3_way_buffer_mux {
-// CHECK: cf.br ^bb1
+// CHECK: lsir.br ^bb1
 // CHECK: ^bb1:
 // CHECK: lsir.select %{{[0-9]+}},
 // CHECK: lsir.select %{{[0-9]+}},
 // CHECK: test_inst ins %{{[0-9]+}} : (!amdgcn.sgpr<{{[0-9]+}}>)
-// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2
+// CHECK: lsir.cond_br %{{.*}} : !amdgcn.scc
 amdgcn.module @chained_select target = isa = {
   kernel @chained_select_loop_3_way_buffer_mux {
     %c0_i32 = arith.constant 0 : i32
@@ -47,26 +47,30 @@ amdgcn.module @chained_select target = isa = {
   ^loop(%k: !amdgcn.sgpr, %buf_idx: !amdgcn.sgpr):
     // 3-way buffer mux: chained select on buf_idx
-    %is_buf0 = lsir.cmpi i32 eq %buf_idx, %c0_i32 : !amdgcn.sgpr, i32
-    %is_buf1 = lsir.cmpi i32 eq %buf_idx, %c1_i32 : !amdgcn.sgpr, i32
+    %scc0_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc0 = lsir.cmpi i32 eq %scc0_alloc, %buf_idx, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
+    %scc1_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc1 = lsir.cmpi i32 eq %scc1_alloc, %buf_idx, %c1_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
 
     // Chained select: inner picks between buf1 / buf2 values,
     // outer picks between buf0 and inner result.
-    %inner = lsir.select %s_inner, %is_buf1, %c100_i32, %c200_i32 : !amdgcn.sgpr, i1, i32, i32
-    %outer = lsir.select %s_outer, %is_buf0, %c0_i32, %inner : !amdgcn.sgpr, i1, i32, !amdgcn.sgpr
+    %inner = lsir.select %s_inner, %scc1, %c100_i32, %c200_i32 : !amdgcn.sgpr, !amdgcn.scc, i32, i32
+    %outer = lsir.select %s_outer, %scc0, %c0_i32, %inner : !amdgcn.sgpr, !amdgcn.scc, i32, !amdgcn.sgpr
 
     // Use the outer select result (prevents DCE)
     test_inst ins %outer : (!amdgcn.sgpr) -> ()
 
     // Advance buf_idx: (buf_idx + 1) % 3 via wrap
     %next_raw = sop2 s_add_u32 outs %s_add ins %buf_idx, %c1_i32 : !amdgcn.sgpr, !amdgcn.sgpr, i32
-    %is_3 = lsir.cmpi i32 eq %next_raw, %c3_i32 : !amdgcn.sgpr, i32
-    %next_buf = lsir.select %s_next_buf, %is_3, %c0_i32, %next_raw : !amdgcn.sgpr, i1, i32, !amdgcn.sgpr
+    %scc2_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc2 = lsir.cmpi i32 eq %scc2_alloc, %next_raw, %c3_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
+    %next_buf = lsir.select %s_next_buf, %scc2, %c0_i32, %next_raw : !amdgcn.sgpr, !amdgcn.scc, i32, !amdgcn.sgpr
 
     // Advance loop counter
     %k_next = sop2 s_add_u32 outs %s_cmp ins %k, %c1_i32 : !amdgcn.sgpr, !amdgcn.sgpr, i32
-    %done = lsir.cmpi i32 slt %k_next, %c6_i32 : !amdgcn.sgpr, i32
-    cf.cond_br %done, ^loop(%k_next, %next_buf : !amdgcn.sgpr, !amdgcn.sgpr), ^exit
+    %scc3_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc3 = lsir.cmpi i32 slt %scc3_alloc, %k_next, %c6_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
+    lsir.cond_br %scc3 : !amdgcn.scc, ^loop(%k_next, %next_buf : !amdgcn.sgpr, !amdgcn.sgpr), ^exit
 
   ^exit:
     end_kernel
diff --git a/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir b/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir
index 385a8eb46..7768e48ce 100644
--- a/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir
+++ b/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir
@@ -1,4 +1,4 @@
-// RUN: aster-opt %s --amdgcn-convert-scf-control-flow | FileCheck %s
+// RUN: aster-opt %s --aster-convert-scf-control-flow | FileCheck %s
 
 // Test scf.for with a single iter_arg (accumulator pattern)
 // Uses index_cast to convert i32 bounds to index (required by scf.for with iter_args)
diff --git a/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir b/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir
index aca47f3b9..9e06e4c05 100644
--- a/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir
+++
b/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir @@ -1,11 +1,12 @@ -// RUN: aster-opt --pass-pipeline='builtin.module(any(amdgcn-convert-scf-control-flow))' %s \ +// RUN: aster-opt --pass-pipeline='builtin.module(any(aster-convert-scf-control-flow))' %s \ // RUN: | FileCheck %s -// Verify that convert-scf-control-flow sets the no_scf_ops post-condition. +// Verify that convert-scf-control-flow converts scf.for with no remaining SCF ops. -// CHECK-LABEL: kernel @sets_postcondition -// CHECK-SAME: attributes {normal_forms = [#amdgcn.no_scf_ops]} -amdgcn.kernel @sets_postcondition { +// CHECK-LABEL: kernel @no_remaining_scf +// CHECK-NOT: scf.for +// CHECK-NOT: scf.if +amdgcn.kernel @no_remaining_scf { ^bb0: %0 = amdgcn.alloca : !amdgcn.vgpr<3> amdgcn.end_kernel diff --git a/test/Dialect/AMDGCN/Transforms/convert-scf.mlir b/test/Dialect/AMDGCN/Transforms/convert-scf.mlir index 88ab2629d..af9566887 100644 --- a/test/Dialect/AMDGCN/Transforms/convert-scf.mlir +++ b/test/Dialect/AMDGCN/Transforms/convert-scf.mlir @@ -1,4 +1,4 @@ -// RUN: aster-opt %s --amdgcn-convert-scf-control-flow --split-input-file --verify-diagnostics | FileCheck %s +// RUN: aster-opt %s --aster-convert-scf-control-flow --split-input-file --verify-diagnostics | FileCheck %s // CHECK-LABEL: func.func @test_uniform_loops_const_bounds() { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 @@ -55,33 +55,6 @@ func.func @test_uniform_loops_non_const_bounds(%n: i32) { // ----- -func.func @test_non_const_bounds(%n: i32) { - %c0 = arith.constant 0 : i32 - %c1 = arith.constant 1 : i32 - // expected-error@+1 {{only thread-uniform loops are supported in this conversion}} - scf.for %i = %c0 to %n step %c1 : i32 { - %iv = lsir.to_reg %i : i32 -> !amdgcn.sgpr - amdgcn.test_inst ins %iv : (!amdgcn.sgpr) -> () - } - return -} - -// ----- - -func.func @test_index_loop_unsupported() { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c10 = arith.constant 10 : index - // expected-error@+1 {{only i32 induction variables are supported in this conversion}} - scf.for %i = %c0 to %c10 step %c1 { - %iv = lsir.to_reg %i : index -> !amdgcn.sgpr - amdgcn.test_inst ins %iv : (!amdgcn.sgpr) -> () - } - return -} - -// ----- - // CHECK-LABEL: func.func @test_uniform_if_no_else( // CHECK-SAME: %[[COND:.*]]: i1) { // CHECK: %[[COND_U:.*]] = aster_utils.assume_uniform %[[COND]] : i1 @@ -407,18 +380,6 @@ func.func @test_if_with_results_inside_for(%cond: i1, %init: i32) { // ----- -func.func @test_non_uniform_if(%cond: i1) { - // expected-error@+1 {{only thread-uniform conditions are supported in this conversion}} - scf.if %cond { - %c42 = arith.constant 42 : i32 - %reg = lsir.to_reg %c42 : i32 -> !amdgcn.sgpr - amdgcn.test_inst ins %reg : (!amdgcn.sgpr) -> () - } - return -} - -// ----- - // Test that pre-existing cf.cond_br with bad block layout is rejected. // Both destinations jump past the next block, so neither is a fallthrough. 
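// Illustrative layout (sketch): with blocks ordered ^bb0, ^bb1, ^bb2, ^bb3,
// a `cf.cond_br %c, ^bb2, ^bb3` in ^bb0 leaves neither successor adjacent,
// so no arm can be turned into a fallthrough.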
func.func @test_bad_block_layout(%cond: i1) { diff --git a/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir b/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir index f379f688b..12c6cc30e 100644 --- a/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir +++ b/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir @@ -13,8 +13,9 @@ amdgcn.kernel @sets_postcondition attributes {normal_forms = [#amdgcn.all_regist %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0_i32 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10_i32 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - cf.cond_br %cmp, ^bb1, ^bb2 + %alloc_scc = lsir.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %alloc_scc, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.cond_br %alloc_scc : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: amdgcn.end_kernel ^bb2: diff --git a/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir b/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir index 754bc2386..05f59d8a0 100644 --- a/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir +++ b/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir @@ -3,10 +3,10 @@ // CHECK-LABEL: kernel @test_cond_branch_slt // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_lt_i32 outs %[[SCC]] ins %[[A]], %[[B]] : outs(!amdgcn.scc) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> +// CHECK: lsir.cmpi i32 slt %[[SCC]], %[[A]], %[[B]] : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> // Use SCC0 because ^bb1 (trueDest) is the next physical block - branch to ^bb2 if false -// CHECK: cbranch s_cbranch_scc0 %[[SCC]] ^bb2 fallthrough(^bb1) : !amdgcn.scc +// CHECK: cbranch s_cbranch_scc0 %[[SCC]] ^bb2 fallthrough(^bb1) : !amdgcn.scc<0> // CHECK: ^bb1: // CHECK: end_kernel // CHECK: ^bb2: @@ -19,8 +19,9 @@ amdgcn.module @test_slt target = isa = { %alloc1 = alloca : !amdgcn.sgpr<1> sop1 s_mov_b32 outs %alloc0 ins %c0_i32 : !amdgcn.sgpr<0>, i32 sop1 s_mov_b32 outs %alloc1 ins %c10_i32 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - cf.cond_br %cmp, ^bb1, ^bb2 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -36,7 +37,7 @@ amdgcn.module @test_slt target = isa = { // CHECK: end_kernel amdgcn.module @test_br target = isa = { amdgcn.kernel @test_unconditional_branch { - cf.br ^bb1 + lsir.br ^bb1 ^bb1: end_kernel } @@ -45,7 +46,7 @@ amdgcn.module @test_br target = isa = { // ----- // Verify lsir.cmpi is converted to amdgcn.cmpi with allocated operands -// Verify cf.cond_br is converted to amdgcn.cbranch +// Verify lsir.cond_br is converted to amdgcn.cbranch // Entry check: use s_cbranch_scc0 because ^bb1 (trueDest) is the next physical block // Branch to ^bb2 if SCC=0 (condition false), fallthrough to ^bb1 if SCC=1 (true) // Verify block argument is removed (^bb1 has no args after legalization) @@ -54,11 +55,11 @@ amdgcn.module @test_br target = isa = { // Branch to ^bb1 if SCC=1 (continue loop), fallthrough to ^bb2 if SCC=0 (exit) // CHECK-LABEL: kernel @test_cf_cond_br_lsir_cmpi -// CHECK: cmpi s_cmp_gt_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.scc) ins(!amdgcn.sgpr<6>, i32) +// CHECK: lsir.cmpi i32 sgt %{{.*}}, %{{.*}}, %{{.*}} : 
!amdgcn.scc<0>, !amdgcn.sgpr<6>, i32 // CHECK: cbranch s_cbranch_scc0 %{{.*}} ^bb2 fallthrough(^bb1) // CHECK: ^bb1: // CHECK: sop2 s_add_u32 outs %[[LOOP_ALLOC:.*]] ins %[[LOOP_ALLOC]] -// CHECK: cmpi s_cmp_lt_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.scc) ins(!amdgcn.sgpr<7>, !amdgcn.sgpr<6>) +// CHECK: lsir.cmpi i32 slt %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc<0>, !amdgcn.sgpr<7>, !amdgcn.sgpr<6> // CHECK: cbranch s_cbranch_scc1 %{{.*}} ^bb1 fallthrough(^bb2) // CHECK: ^bb2: // CHECK: end_kernel @@ -95,21 +96,23 @@ amdgcn.module @ds_kernels target = isa = { amdgcn.sopp.s_waitcnt lgkmcnt = 0 // // Loop start cond: - %15 = lsir.cmpi i32 sgt %6, %c0_i32 : !amdgcn.sgpr<6>, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 sgt %scc0, %6, %c0_i32 : !amdgcn.scc<0>, !amdgcn.sgpr<6>, i32 // Loop iv: sgpr<7> sop1 s_mov_b32 outs %7 ins %c0_i32 : !amdgcn.sgpr<7>, i32 - cf.cond_br %15, ^bb1(%7 : !amdgcn.sgpr<7>), ^bb2 - ^bb1(%18: !amdgcn.sgpr<7>): // 2 preds: ^bb0, ^bb1 - sop2 s_lshl_b32 outs %8 ins %18, %c2_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<7>, i32 + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 + ^bb1: // 2 preds: ^bb0, ^bb1 + sop2 s_lshl_b32 outs %8 ins %7, %c2_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<7>, i32 amdgcn.vop1.vop1 %9, %8 : (!amdgcn.vgpr<0>, !amdgcn.sgpr<8>) -> () %21 = store global_store_dword data %9 addr %13 offset d(%9) : ins(!amdgcn.vgpr<0>, !amdgcn.sgpr<[4 : 6]>, !amdgcn.vgpr<0>) -> !amdgcn.write_token // // Loop iv increment: sgpr<7> - sop2 s_add_u32 outs %7 ins %18, %c1_i32 : !amdgcn.sgpr<7>, !amdgcn.sgpr<7>, i32 + sop2 s_add_u32 outs %7 ins %7, %c1_i32 : !amdgcn.sgpr<7>, !amdgcn.sgpr<7>, i32 // Loop end cond: lsir.cmpi - %24 = lsir.cmpi i32 slt %7, %6 : !amdgcn.sgpr<7>, !amdgcn.sgpr<6> - // Loop backedge: cf.cond_br - cf.cond_br %24, ^bb1(%7 : !amdgcn.sgpr<7>), ^bb2 + %scc1 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc1, %7, %6 : !amdgcn.scc<0>, !amdgcn.sgpr<7>, !amdgcn.sgpr<6> + // Loop backedge: lsir.cond_br + lsir.cond_br %scc1 : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 end_kernel } @@ -146,26 +149,21 @@ amdgcn.module @test_br_vgpr_range target = isa = { amdgcn.vop1.vop1 %v3, %c0 : (!amdgcn.vgpr<3>, i32) -> () // Create range - uses vop1 results (which write to allocas) - // CHECK: make_register_range + // CHECK: %[[RANGE:.*]] = make_register_range %range = make_register_range %v0, %v1, %v2, %v3 : !amdgcn.vgpr<0>, !amdgcn.vgpr<1>, !amdgcn.vgpr<2>, !amdgcn.vgpr<3> - // Branch with range as operand + // Branch with range as operand - lsir.br is not lowered by this pass // CHECK: branch s_branch ^bb1 - cf.br ^bb1(%range : !amdgcn.vgpr<[0 : 4]>) + lsir.br ^bb1 - // Block argument should be removed, range reconstructed + // Block argument remains (lsir.br is not lowered, so block args are preserved) // CHECK: ^bb1: - // CHECK-NOT: ^bb1(% - // Verify no duplicate allocas created - // CHECK-NOT: alloca - ^bb1(%arg: !amdgcn.vgpr<[0 : 4]>): - // Range should be reconstructed from SAME allocas at block entry - // CHECK: %[[RECONSTRUCTED:.*]] = make_register_range %[[V0]], %[[V1]], %[[V2]], %[[V3]] // Split the range - verify 4 results - // CHECK: %{{.*}}:4 = split_register_range %[[RECONSTRUCTED]] - %split:4 = split_register_range %arg : !amdgcn.vgpr<[0 : 4]> + // CHECK: %{{.*}}:4 = split_register_range %[[RANGE]] + ^bb1: + %split:4 = split_register_range %range : !amdgcn.vgpr<[0 : 4]> // CHECK: end_kernel end_kernel @@ -218,18 +216,20 @@ amdgcn.module @test_loop target = isa = { // CHECK: sop1 s_mov_b32 outs %[[S8]] sop1 s_mov_b32 outs 
%s8 ins %c0_i32 : !amdgcn.sgpr<8>, i32 - // Branch to loop - passes counter (SGPR) and accumulator (VGPR range) + // Branch to loop - counter and accumulator flow through allocas // CHECK: branch s_branch ^bb1 - cf.br ^bb1(%s8, %acc_init : !amdgcn.sgpr<8>, !amdgcn.vgpr<[4 : 8]>) + lsir.br ^bb1 - // Loop header - block arguments should be removed + // Loop header - no block arguments (values flow through allocas) // CHECK: ^bb1: // CHECK-NOT: ^bb1(% // Verify no duplicate allocas - counter flows through %[[S8]], accumulator through %[[V4]]-[[V7]] // CHECK-NOT: alloca - ^bb1(%counter: !amdgcn.sgpr<8>, %acc: !amdgcn.vgpr<[4 : 8]>): + ^bb1: // Accumulator range should be reconstructed from SAME allocas at loop entry // CHECK: %[[ACC_RECON:.*]] = make_register_range %[[V4]], %[[V5]], %[[V6]], %[[V7]] + %acc_loop = make_register_range %v4, %v5, %v6, %v7 : + !amdgcn.vgpr<4>, !amdgcn.vgpr<5>, !amdgcn.vgpr<6>, !amdgcn.vgpr<7> // Dummy operands for MFMA (simplified - real code would have loads) %v16 = alloca : !amdgcn.vgpr<16> @@ -241,27 +241,29 @@ amdgcn.module @test_loop target = isa = { // MFMA: new_acc = MFMA(a, b, acc) - accumulator is both input and output // CHECK: vop3p.vop3p_mai %[[ACC_RECON]] - amdgcn.vop3p.vop3p_mai %acc, %dummy_a, %dummy_b, %acc : <[16 : 18]>, <[18 : 20]>, !amdgcn.vgpr<[4 : 8]> -> !amdgcn.vgpr<[4 : 8]> + amdgcn.vop3p.vop3p_mai %acc_loop, %dummy_a, %dummy_b, %acc_loop : <[16 : 18]>, <[18 : 20]>, !amdgcn.vgpr<[4 : 8]> -> !amdgcn.vgpr<[4 : 8]> // Increment counter - writes to %[[S8]] alloca // CHECK: sop2 s_add_u32 outs %[[S8]] ins %[[S8]] - sop2 s_add_u32 outs %s8 ins %counter, %c1_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<8>, i32 + sop2 s_add_u32 outs %s8 ins %s8, %c1_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<8>, i32 // Loop condition - // CHECK: cmpi s_cmp_lt_i32 - %cond = lsir.cmpi i32 slt %s8, %c2_i32 : !amdgcn.sgpr<8>, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc0, %s8, %c2_i32 : !amdgcn.scc<0>, !amdgcn.sgpr<8>, i32 - // Loop backedge - passes updated counter and accumulator to both loop and exit + // Loop backedge - values flow through allocas // CHECK: cbranch s_cbranch_scc1 {{.*}} ^bb1 fallthrough(^bb2) - cf.cond_br %cond, ^bb1(%s8, %acc : !amdgcn.sgpr<8>, !amdgcn.vgpr<[4 : 8]>), ^bb2(%acc : !amdgcn.vgpr<[4 : 8]>) + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 - // Exit block - receives final accumulator from loop + // Exit block - no block arguments (values flow through allocas) // CHECK: ^bb2: // CHECK-NOT: ^bb2(% // CHECK-NOT: alloca - ^bb2(%final_acc: !amdgcn.vgpr<[4 : 8]>): + ^bb2: // Reconstruct range at exit from SAME allocas // CHECK: %[[FINAL_RECON:.*]] = make_register_range %[[V4]], %[[V5]], %[[V6]], %[[V7]] + %final_acc = make_register_range %v4, %v5, %v6, %v7 : + !amdgcn.vgpr<4>, !amdgcn.vgpr<5>, !amdgcn.vgpr<6>, !amdgcn.vgpr<7> // Extract final values - verify 4 results // CHECK: %{{.*}}:4 = split_register_range %[[FINAL_RECON]] %final:4 = split_register_range %final_acc : !amdgcn.vgpr<[4 : 8]> @@ -278,8 +280,7 @@ amdgcn.module @test_loop target = isa = { // CHECK-LABEL: kernel @test_select_i1 // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_eq_i32 outs %[[SCC]] ins %[[A]], %[[B]] +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> // CHECK: sop2 s_cselect_b32 outs %{{.*}} ins // CHECK: end_kernel amdgcn.module @test_select_i1_mod target = isa = { @@ -293,8 +294,9 @@ amdgcn.module @test_select_i1_mod target = isa = { %alloc2 = 
amdgcn.alloca : !amdgcn.sgpr<2> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c42, %c99 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 amdgcn.end_kernel } } @@ -307,10 +309,8 @@ amdgcn.module @test_select_i1_mod target = isa = { // CHECK-LABEL: kernel @test_select_fanout // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_eq_i32 outs %[[SCC]] ins %[[A]], %[[B]] -// CHECK-NOT: alloca : !amdgcn.scc -// CHECK-NOT: cmpi +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> +// CHECK-NOT: alloca : !amdgcn.scc<0> // CHECK: sop2 s_cselect_b32 // CHECK: sop2 s_cselect_b32 // CHECK: end_kernel @@ -328,9 +328,10 @@ amdgcn.module @test_select_fanout_mod target = isa = { %alloc3 = amdgcn.alloca : !amdgcn.sgpr<3> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp, %c1, %c2 : !amdgcn.sgpr<2>, i1, i32, i32 - lsir.select %alloc3, %cmp, %c3, %c4 : !amdgcn.sgpr<3>, i1, i32, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c1, %c2 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 + lsir.select %alloc3, %scc0, %c3, %c4 : !amdgcn.sgpr<3>, !amdgcn.scc<0>, i32, i32 amdgcn.end_kernel } } @@ -342,10 +343,8 @@ amdgcn.module @test_select_fanout_mod target = isa = { // CHECK-LABEL: kernel @test_mixed_consumers // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_eq_i32 outs %[[SCC]] ins %[[A]], %[[B]] -// CHECK-NOT: alloca : !amdgcn.scc -// CHECK-NOT: cmpi +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> +// CHECK-NOT: alloca : !amdgcn.scc<0> // CHECK: sop2 s_cselect_b32 // CHECK: cbranch s_cbranch_scc0 %[[SCC]] // CHECK: ^bb1: @@ -363,9 +362,10 @@ amdgcn.module @test_mixed_mod target = isa = { %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 - cf.cond_br %cmp, ^bb1, ^bb2 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c42, %c99 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: amdgcn.end_kernel ^bb2: @@ -378,11 +378,7 @@ amdgcn.module @test_mixed_mod target = isa = { // Sequential non-overlapping i1 lifetimes are fine. 
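// An illustrative sketch (names are placeholders, not FileCheck-matched):
// each compare may take its own !amdgcn.scc<0> alloca, all aliasing the one
// physical SCC, because the first flag is consumed before the second compare:
//   %scc0 = amdgcn.alloca : !amdgcn.scc<0>
//   lsir.cmpi i32 eq %scc0, %a, %b : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1>
//   lsir.select %d, %scc0, %t, %f : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32
//   %scc1 = amdgcn.alloca : !amdgcn.scc<0>   // %scc0 is dead here, so SCC may be redefined
//   lsir.cmpi i32 slt %scc1, %a, %b : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1>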
// CHECK-LABEL: kernel @test_sequential_i1 -// CHECK: cmpi s_cmp_eq_i32 -// CHECK-NOT: cmpi // CHECK: sop2 s_cselect_b32 -// CHECK: cmpi s_cmp_lt_i32 -// CHECK-NOT: cmpi // CHECK: sop2 s_cselect_b32 // CHECK: end_kernel amdgcn.module @test_sequential_mod target = isa = { @@ -398,91 +394,13 @@ amdgcn.module @test_sequential_mod target = isa = { amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 // First compare: consumed immediately by select - %cmp1 = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp1, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c42, %c99 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 // Second compare: starts after first is consumed -- no overlap - %cmp2 = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc3, %cmp2, %c42, %c99 : !amdgcn.sgpr<3>, i1, i32, i32 - amdgcn.end_kernel - } -} - -// ----- - -// Dead cmpi won't be lowered and doesn't consume SCC at runtime (would have -// been DCE'd away) - -// CHECK-LABEL: kernel @test_dead_cmpi_then_live -// CHECK-NOT: cmpi s_cmp_eq_i32 -// CHECK: cmpi s_cmp_lt_i32 -// CHECK-NOT: cmpi -// CHECK: sop2 s_cselect_b32 -// CHECK: end_kernel -amdgcn.module @test_dead_cmpi_mod target = isa = { - amdgcn.kernel @test_dead_cmpi_then_live { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c42 = arith.constant 42 : i32 - %c99 = arith.constant 99 : i32 - %alloc0 = amdgcn.alloca : !amdgcn.sgpr<0> - %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> - %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> - amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 - amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - // Dead compare -- result unused, should be ignored by precondition check - %dead = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - // Live compare -- consumed by select - %live = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %live, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 - amdgcn.end_kernel - } -} - -// ----- - -// Overlapping i1 lifetimes: cmpi2 executes while cmpi1's result is still live. -// This would silently clobber SCC. - -amdgcn.module @test_overlap_mod target = isa = { - amdgcn.kernel @test_overlapping_i1 { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c42 = arith.constant 42 : i32 - %c99 = arith.constant 99 : i32 - %alloc0 = amdgcn.alloca : !amdgcn.sgpr<0> - %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> - %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> - amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 - amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp1 = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - // expected-error @+1 {{would clobber flag register from earlier compare; i1 lifetimes must not overlap}} - %cmp2 = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp1, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 - amdgcn.end_kernel - } -} - -// ----- - -// Cross-block i1 usage: flag register (SCC/VCC) is not preserved across -// block boundaries (any branch clobbers it). 
- -amdgcn.module @test_crossblock_mod target = isa = { - amdgcn.kernel @test_cross_block_i1 { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c42 = arith.constant 42 : i32 - %c99 = arith.constant 99 : i32 - %alloc0 = amdgcn.alloca : !amdgcn.sgpr<0> - %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> - %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> - amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 - amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - // expected-error @+1 {{has consumer in a different block; flag register (SCC/VCC) is not preserved across block boundaries}} - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - cf.br ^bb1 - ^bb1: - lsir.select %alloc2, %cmp, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 + %scc1 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc1, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc3, %scc1, %c42, %c99 : !amdgcn.sgpr<3>, !amdgcn.scc<0>, i32, i32 amdgcn.end_kernel } } @@ -494,7 +412,6 @@ amdgcn.module @test_crossblock_mod target = isa = { // CHECK-LABEL: kernel @test_vopc_cond_branch // CHECK: %[[VCC:.*]] = alloca : !amdgcn.vcc -// CHECK: cmpi v_cmp_lt_i32 outs %[[VCC]] ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) // CHECK: cbranch s_cbranch_vccz %[[VCC]] ^bb2 fallthrough(^bb1) : !amdgcn.vcc // CHECK: ^bb1: // CHECK: end_kernel @@ -505,8 +422,9 @@ amdgcn.module @test_vopc_mod target = isa = { %c0_i32 = arith.constant 0 : i32 %v0 = alloca : !amdgcn.vgpr<0> // VGPR on rhs forces vector compare path - %cmp = lsir.cmpi i32 slt %c0_i32, %v0 : i32, !amdgcn.vgpr<0> - cf.cond_br %cmp, ^bb1, ^bb2 + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 slt %vcc0, %c0_i32, %v0 : !amdgcn.vcc<0>, i32, !amdgcn.vgpr<0> + lsir.cond_br %vcc0 : !amdgcn.vcc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -521,15 +439,15 @@ amdgcn.module @test_vopc_mod target = isa = { // and predicate is flipped: slt(v0, 32) -> gt(32, v0). // CHECK-LABEL: kernel @test_vopc_operand_swap -// CHECK: cmpi v_cmp_gt_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) // CHECK: cbranch s_cbranch_vccz %{{.*}} ^bb2 fallthrough(^bb1) : !amdgcn.vcc amdgcn.module @test_vopc_swap_mod target = isa = { amdgcn.kernel @test_vopc_operand_swap { %c32_i32 = arith.constant 32 : i32 %v0 = alloca : !amdgcn.vgpr<0> // lhs=VGPR, rhs=imm -> swap + flip: slt -> gt - %cmp = lsir.cmpi i32 slt %v0, %c32_i32 : !amdgcn.vgpr<0>, i32 - cf.cond_br %cmp, ^bb1, ^bb2 + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 slt %vcc0, %v0, %c32_i32 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, i32 + lsir.cond_br %vcc0 : !amdgcn.vcc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -542,14 +460,14 @@ amdgcn.module @test_vopc_swap_mod target = isa = { // VOPC with two VGPR operands: no swap needed, rhs already a VGPR. 
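// An illustrative contrast (operand names are placeholders): VOPC takes the
// VGPR in src1, so a VGPR lhs with an immediate rhs is swapped and the
// predicate flipped when lowering, as in the test above:
//   lsir.cmpi i32 slt %vcc, %v0, %c32 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, i32
//   ->  amdgcn.cmpi v_cmp_gt_i32 outs %vcc ins %c32, %v0 : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr<0>)
// With two VGPR operands src1 is already a VGPR, so the compare lowers unchanged: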
// CHECK-LABEL: kernel @test_vopc_two_vgprs -// CHECK: cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) // CHECK: cbranch s_cbranch_vccz %{{.*}} ^bb2 fallthrough(^bb1) : !amdgcn.vcc amdgcn.module @test_vopc_vv_mod target = isa = { amdgcn.kernel @test_vopc_two_vgprs { %v0 = alloca : !amdgcn.vgpr<0> %v1 = alloca : !amdgcn.vgpr<1> - %cmp = lsir.cmpi i32 eq %v0, %v1 : !amdgcn.vgpr<0>, !amdgcn.vgpr<1> - cf.cond_br %cmp, ^bb1, ^bb2 + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 eq %vcc0, %v0, %v1 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, !amdgcn.vgpr<1> + lsir.cond_br %vcc0 : !amdgcn.vcc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -565,7 +483,6 @@ amdgcn.module @test_vopc_vv_mod target = isa = { // CHECK-LABEL: kernel @test_vopc_select // CHECK: %[[VCC:.*]] = alloca : !amdgcn.vcc -// CHECK: cmpi v_cmp_eq_i32 outs %[[VCC]] ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) // CHECK: vop2 v_cndmask_b32 outs %{{.*}} ins %{{.*}}, %{{.*}} src2 = %[[VCC]] // CHECK: end_kernel amdgcn.module @test_vopc_select_mod target = isa = { @@ -574,9 +491,10 @@ amdgcn.module @test_vopc_select_mod target = isa = { %v1 = alloca : !amdgcn.vgpr<1> %v2 = alloca : !amdgcn.vgpr<2> %v3 = alloca : !amdgcn.vgpr<3> - %cmp = lsir.cmpi i32 eq %v0, %v1 : !amdgcn.vgpr<0>, !amdgcn.vgpr<1> + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 eq %vcc0, %v0, %v1 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, !amdgcn.vgpr<1> // true_value=%v2, false_value=%v3, dst=%v3 (v_cndmask reads VCC) - lsir.select %v3, %cmp, %v2, %v3 : !amdgcn.vgpr<3>, i1, !amdgcn.vgpr<2>, !amdgcn.vgpr<3> + lsir.select %v3, %vcc0, %v2, %v3 : !amdgcn.vgpr<3>, !amdgcn.vcc<0>, !amdgcn.vgpr<2>, !amdgcn.vgpr<3> end_kernel } } diff --git a/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir b/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir index 5292611e2..07fafd447 100644 --- a/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir +++ b/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir @@ -7,7 +7,7 @@ // CHECK: lsir.cmpi // CHECK: %[[OUT:.*]] = alloca : !amdgcn.sgpr // CHECK: %[[MOV:.*]] = sop1 s_mov_b32 outs %[[OUT]] ins %{{.*}} : !amdgcn.sgpr, i32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[MOV]], %{{.*}} : !amdgcn.sgpr, i1, !amdgcn.sgpr, i32 +// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[MOV]], %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, i32 amdgcn.module @dual_literal_mod target = isa = { amdgcn.kernel @dual_literal_select { %c0 = arith.constant 0 : i32 @@ -15,8 +15,9 @@ amdgcn.module @dual_literal_mod target = isa = { %c1632 = arith.constant 1632 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %c544, %c1632 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %c544, %c1632 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -29,7 +30,7 @@ amdgcn.module @dual_literal_mod target = isa = { // CHECK-LABEL: kernel @one_inline_select // CHECK-NOT: sop1 s_mov_b32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 amdgcn.module @one_inline_mod target = isa = { amdgcn.kernel @one_inline_select { %c0 = arith.constant 0 : i32 @@ -37,8 +38,9 @@ amdgcn.module @one_inline_mod target = isa = { 
%c200 = arith.constant 200 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %c10, %c200 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %c10, %c200 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -51,7 +53,7 @@ amdgcn.module @one_inline_mod target = isa = { // CHECK-LABEL: kernel @both_inline_select // CHECK-NOT: sop1 s_mov_b32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 amdgcn.module @both_inline_mod target = isa = { amdgcn.kernel @both_inline_select { %c0 = arith.constant 0 : i32 @@ -59,8 +61,9 @@ amdgcn.module @both_inline_mod target = isa = { %c20 = arith.constant 20 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %c10, %c20 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %c10, %c20 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -74,7 +77,7 @@ amdgcn.module @both_inline_mod target = isa = { // CHECK-LABEL: kernel @non_constant_select // CHECK: %[[A:.*]] = sop1 s_mov_b32 // CHECK: %[[B:.*]] = sop1 s_mov_b32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[A]], %[[B]] : !amdgcn.sgpr, i1, !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[A]], %[[B]] : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr amdgcn.module @non_constant_mod target = isa = { amdgcn.kernel @non_constant_select { %c0 = arith.constant 0 : i32 @@ -83,8 +86,9 @@ amdgcn.module @non_constant_mod target = isa = { %s2 = alloca : !amdgcn.sgpr %a = sop1 s_mov_b32 outs %s1 ins %c0 : !amdgcn.sgpr, i32 %b = sop1 s_mov_b32 outs %s2 ins %c0 : !amdgcn.sgpr, i32 - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s0, %cmp, %a, %b : !amdgcn.sgpr, i1, !amdgcn.sgpr, !amdgcn.sgpr + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s0, %cmp, %a, %b : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -97,7 +101,7 @@ amdgcn.module @non_constant_mod target = isa = { // CHECK-LABEL: kernel @boundary_inline_select // CHECK-NOT: sop1 s_mov_b32 -// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 amdgcn.module @boundary_inline_mod target = isa = { amdgcn.kernel @boundary_inline_select { %c0 = arith.constant 0 : i32 @@ -105,8 +109,9 @@ amdgcn.module @boundary_inline_mod target = isa = { %c64 = arith.constant 64 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %cn16, %c64 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %cn16, %c64 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -121,7 +126,7 @@ amdgcn.module @boundary_inline_mod target = isa = { // CHECK: 
lsir.cmpi // CHECK: %[[OUT:.*]] = alloca : !amdgcn.sgpr // CHECK: sop1 s_mov_b32 outs %[[OUT]] -// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, i1, !amdgcn.sgpr, i32 +// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, i32 amdgcn.module @boundary_non_inline_mod target = isa = { amdgcn.kernel @boundary_non_inline_select { %c0 = arith.constant 0 : i32 @@ -129,8 +134,9 @@ amdgcn.module @boundary_non_inline_mod target = isa = { %c65 = arith.constant 65 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %cn17, %c65 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %cn17, %c65 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } diff --git a/test/Dialect/AMDGCN/cmp-ops.mlir b/test/Dialect/AMDGCN/cmp-ops.mlir index 1c4b6a249..1c0d97a9a 100644 --- a/test/Dialect/AMDGCN/cmp-ops.mlir +++ b/test/Dialect/AMDGCN/cmp-ops.mlir @@ -1,66 +1,66 @@ // RUN: aster-opt %s --verify-roundtrip -func.func @cmpi(%scc: !amdgcn.scc, %vcc: !amdgcn.vcc, %a: i32, %b: i32, +func.func @cmpi(%scc: !amdgcn.scc<0>, %vcc: !amdgcn.vcc<0>, %a: i32, %b: i32, %v1: !amdgcn.vgpr, %dst: !amdgcn.sgpr<[? + 2]>, %dstAlloc: !amdgcn.sgpr<[0 : 2]>) { - amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %a, %b : outs(!amdgcn.scc) ins(i32, i32) - amdgcn.cmpi v_cmp_eq_i32 outs %vcc ins %a, %v1 : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr) + amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %a, %b : outs(!amdgcn.scc<0>) ins(i32, i32) + amdgcn.cmpi v_cmp_eq_i32 outs %vcc ins %a, %v1 : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr) %0 = amdgcn.cmpi v_cmp_eq_i32_e64 outs %dst ins %a, %v1 : dps(!amdgcn.sgpr<[? 
+ 2]>) ins(i32, !amdgcn.vgpr) amdgcn.cmpi v_cmp_eq_i32_e64 outs %dstAlloc ins %a, %v1 : outs(!amdgcn.sgpr<[0 : 2]>) ins(i32, !amdgcn.vgpr) return } -func.func @sopc_signed_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc) { +func.func @sopc_signed_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc<0>) { // s_cmp_eq_i32 - SOPC compare equal (signed 32-bit) amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lg_i32 - SOPC compare not equal (signed 32-bit) amdgcn.cmpi s_cmp_lg_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_gt_i32 - SOPC compare greater than (signed 32-bit) amdgcn.cmpi s_cmp_gt_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_ge_i32 - SOPC compare greater than or equal (signed 32-bit) amdgcn.cmpi s_cmp_ge_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lt_i32 - SOPC compare less than (signed 32-bit) amdgcn.cmpi s_cmp_lt_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_le_i32 - SOPC compare less than or equal (signed 32-bit) amdgcn.cmpi s_cmp_le_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) return } -func.func @sopc_unsigned_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc) { +func.func @sopc_unsigned_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc<0>) { // s_cmp_eq_u32 - SOPC compare equal (unsigned 32-bit) amdgcn.cmpi s_cmp_eq_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lg_u32 - SOPC compare not equal (unsigned 32-bit) amdgcn.cmpi s_cmp_lg_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_gt_u32 - SOPC compare greater than (unsigned 32-bit) amdgcn.cmpi s_cmp_gt_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_ge_u32 - SOPC compare greater than or equal (unsigned 32-bit) amdgcn.cmpi s_cmp_ge_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lt_u32 - SOPC compare less than (unsigned 32-bit) amdgcn.cmpi s_cmp_lt_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_le_u32 - SOPC compare less than or equal (unsigned 32-bit) amdgcn.cmpi s_cmp_le_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) return } diff --git a/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir b/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir index 440ae7276..d9e4bf987 100644 --- a/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir +++ b/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir @@ -7,7 +7,8 @@ 
module attributes {dlti.dl_spec = #dlti.dl_spec< // CHECK-LABEL: func.func @test_minui( // CHECK-SAME: %[[A:.*]]: !amdgcn.vgpr, %[[B:.*]]: !amdgcn.vgpr) -> !amdgcn.vgpr // CHECK: %[[DST:.*]] = lsir.alloca : !amdgcn.vgpr -// CHECK: %[[CMP:.*]] = lsir.cmpi i32 ult %[[A]], %[[B]] +// CHECK: %[[CMP_DST:.*]] = lsir.alloca : !amdgcn.vcc +// CHECK: %[[CMP:.*]] = lsir.cmpi i32 ult %[[CMP_DST]], %[[A]], %[[B]] // CHECK: lsir.select %[[DST]], %[[CMP]], %[[A]], %[[B]] func.func @test_minui(%a: i32, %b: i32) -> i32 attributes {abi = (!amdgcn.vgpr, !amdgcn.vgpr) -> !amdgcn.vgpr} { @@ -28,7 +29,8 @@ func.func @test_maxui(%a: i32, %b: i32) -> i32 // CHECK-LABEL: func.func @test_minsi( // CHECK-SAME: %[[A:.*]]: !amdgcn.vgpr, %[[B:.*]]: !amdgcn.vgpr) -> !amdgcn.vgpr // CHECK: %[[DST:.*]] = lsir.alloca : !amdgcn.vgpr -// CHECK: %[[CMP:.*]] = lsir.cmpi i32 slt %[[A]], %[[B]] +// CHECK: %[[CMP_DST:.*]] = lsir.alloca : !amdgcn.vcc +// CHECK: %[[CMP:.*]] = lsir.cmpi i32 slt %[[CMP_DST]], %[[A]], %[[B]] // CHECK: lsir.select %[[DST]], %[[CMP]], %[[A]], %[[B]] func.func @test_minsi(%a: i32, %b: i32) -> i32 attributes {abi = (!amdgcn.vgpr, !amdgcn.vgpr) -> !amdgcn.vgpr} { diff --git a/test/Dialect/LSIR/Transforms/codegen-cf.mlir b/test/Dialect/LSIR/Transforms/codegen-cf.mlir index 2b5fe1200..df7eaaf39 100644 --- a/test/Dialect/LSIR/Transforms/codegen-cf.mlir +++ b/test/Dialect/LSIR/Transforms/codegen-cf.mlir @@ -15,16 +15,18 @@ // CHECK: %[[LOAD:.*]] = load_arg 1 // CHECK: amdgcn.sopp.s_waitcnt // CHECK: split_register_range -// CHECK: %[[CMP_INIT:.*]] = lsir.cmpi i32 sgt %{{.*}}, %[[C0]] : !amdgcn.sgpr, i32 +// CHECK: %[[CMP_DST_INIT:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_INIT:.*]] = lsir.cmpi i32 sgt %[[CMP_DST_INIT]], %{{.*}}, %[[C0]] : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: %[[ALLOCA_INIT:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[MOV_INIT:.*]] = lsir.mov %[[ALLOCA_INIT]], %[[C0]] -// CHECK: cf.cond_br %[[CMP_INIT]], ^bb1(%[[MOV_INIT]] : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %[[CMP_INIT]] : !amdgcn.scc, ^bb1(%[[MOV_INIT]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb1(%[[LOOP_ARG:.*]]: !amdgcn.sgpr): // CHECK: test_inst ins %[[LOOP_ARG]] // CHECK: %[[ALLOCA_LOOP:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[LOOP_ADDI:.*]] = lsir.addi i32 %[[ALLOCA_LOOP]], %[[LOOP_ARG]], %[[C1]] -// CHECK: %[[CMP_LOOP:.*]] = lsir.cmpi i32 slt %[[LOOP_ADDI]], %{{.*}} : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.cond_br %[[CMP_LOOP]], ^bb1(%[[LOOP_ADDI]] : !amdgcn.sgpr), ^bb2 +// CHECK: %[[CMP_DST_LOOP:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_LOOP:.*]] = lsir.cmpi i32 slt %[[CMP_DST_LOOP]], %[[LOOP_ADDI]], %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: lsir.cond_br %[[CMP_LOOP]] : !amdgcn.scc, ^bb1(%[[LOOP_ADDI]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb2: // CHECK: end_kernel @@ -63,10 +65,11 @@ amdgcn.module @test_uniform_loop target = isa = { // CHECK: alloca // CHECK: %[[LOAD_RESULT:.*]], %{{.*}} = load s_load_dword // CHECK: amdgcn.sopp.s_waitcnt -// CHECK: %[[CMP_INIT2:.*]] = lsir.cmpi i32 sgt %[[LOAD_RESULT]], %[[C0]] : !amdgcn.sgpr, i32 +// CHECK: %[[CMP_DST_INIT2:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_INIT2:.*]] = lsir.cmpi i32 sgt %[[CMP_DST_INIT2]], %[[LOAD_RESULT]], %[[C0]] : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: %[[ALLOCA_INIT2:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[MOV_INIT2:.*]] = lsir.mov %[[ALLOCA_INIT2]], %[[C0]] -// CHECK: cf.cond_br %[[CMP_INIT2]], ^bb1(%[[MOV_INIT2]] : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %[[CMP_INIT2]] : !amdgcn.scc, 
^bb1(%[[MOV_INIT2]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb1(%[[LOOP_ARG2:.*]]: !amdgcn.sgpr): // CHECK: %[[ALLOCA_SHLI:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[LOOP_SHLI:.*]] = lsir.shli i32 %[[ALLOCA_SHLI]], %[[LOOP_ARG2]], %[[C2]] @@ -75,8 +78,9 @@ amdgcn.module @test_uniform_loop target = isa = { // CHECK: store global_store_dword // CHECK: %[[ALLOCA_ADDI:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[LOOP_ADDI2:.*]] = lsir.addi i32 %[[ALLOCA_ADDI]], %[[LOOP_ARG2]], %[[C1]] -// CHECK: %[[CMP_LOOP2:.*]] = lsir.cmpi i32 slt %[[LOOP_ADDI2]], %[[LOAD_RESULT]] : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.cond_br %[[CMP_LOOP2]], ^bb1(%[[LOOP_ADDI2]] : !amdgcn.sgpr), ^bb2 +// CHECK: %[[CMP_DST_LOOP2:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_LOOP2:.*]] = lsir.cmpi i32 slt %[[CMP_DST_LOOP2]], %[[LOOP_ADDI2]], %[[LOAD_RESULT]] : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: lsir.cond_br %[[CMP_LOOP2]] : !amdgcn.scc, ^bb1(%[[LOOP_ADDI2]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb2: // CHECK: end_kernel @@ -111,18 +115,19 @@ amdgcn.module @test_uniform_loop_with_load target = isa = { // ----- //===----------------------------------------------------------------------===// -// Test arith.cmpi + arith.select -> lsir.cmpi + lsir.select(i1) +// Test arith.cmpi + arith.select -> lsir.cmpi + lsir.select // Verifies that: -// 1. arith.cmpi is converted to lsir.cmpi returning i1 -// 2. arith.select with i1 condition is converted to lsir.select with i1 -// 3. No unrealized_conversion_cast is inserted for the i1 condition +// 1. arith.cmpi is converted to lsir.cmpi (DPS, returns SCC/VCC register) +// 2. arith.select with cmpi condition is converted to lsir.select with SCC +// 3. No unrealized_conversion_cast is inserted for the condition //===----------------------------------------------------------------------===// // CHECK-LABEL: amdgcn.module @test_select_i1 // CHECK: kernel @test_select_i1 -// CHECK: %[[CMP:.*]] = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: %{{.*}} = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP:.*]] = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: %[[ALLOCA:.*]] = lsir.alloca : !amdgcn.sgpr -// CHECK: lsir.select %[[ALLOCA]], %[[CMP]], %{{.*}}, %{{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select %[[ALLOCA]], %[[CMP]], %{{.*}}, %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 // CHECK-NOT: unrealized_conversion_cast amdgcn.module @test_select_i1 target = isa = { @@ -148,10 +153,11 @@ amdgcn.module @test_select_i1 target = isa = { // CHECK-LABEL: func.func @test_token_in_args( // CHECK-SAME: %[[ARG0:.*]]: !amdgcn.vgpr, %[[ARG1:.*]]: !amdgcn.read_token) { // CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 -// CHECK: %[[CMPI_0:.*]] = lsir.cmpi i32 sgt %[[ARG0]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 -// CHECK: cf.cond_br %[[CMPI_0]], ^bb1(%[[ARG1]] : !amdgcn.read_token), ^bb2 +// CHECK: %{{.*}} = lsir.alloca : !amdgcn.vcc +// CHECK: %[[CMPI_0:.*]] = lsir.cmpi i32 sgt %{{.*}}, %[[ARG0]], %[[CONSTANT_0]] : !amdgcn.vcc, !amdgcn.vgpr, i32 +// CHECK: lsir.cond_br %[[CMPI_0]] : !amdgcn.vcc, ^bb1(%[[ARG1]] : !amdgcn.read_token), ^bb2 // CHECK: ^bb1(%[[VAL_0:.*]]: !amdgcn.read_token): -// CHECK: cf.cond_br %[[CMPI_0]], ^bb1(%[[VAL_0]] : !amdgcn.read_token), ^bb2 +// CHECK: lsir.cond_br %[[CMPI_0]] : !amdgcn.vcc, ^bb1(%[[VAL_0]] : !amdgcn.read_token), ^bb2 // CHECK: ^bb2: // CHECK: return // CHECK: } diff --git a/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir b/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir index 
aafc0a76f..ce937351c 100644 --- a/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir +++ b/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir @@ -2,13 +2,13 @@ // CHECK-LABEL: amdgcn.module @test // CHECK: func.func @loop_func -// CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 // CHECK: ^bb1(%{{.*}}: !amdgcn.sgpr): -// CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 // CHECK: ^bb2: amdgcn.module @test target = isa = { - func.func @loop_func(%arg0: i32, %n: i32) { + func.func @loop_func(%arg0: i32, %n: i32) attributes {gpu.kernel} { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cmp_init = arith.cmpi slt, %c0, %n : i32 diff --git a/test/Dialect/LSIR/ops.mlir b/test/Dialect/LSIR/ops.mlir index ac93b8234..4be0fc583 100644 --- a/test/Dialect/LSIR/ops.mlir +++ b/test/Dialect/LSIR/ops.mlir @@ -65,54 +65,54 @@ func.func @test_shrui(%dst: !amdgcn.vgpr, %value: !amdgcn.vgpr, %amount: !amdgcn return %0 : !amdgcn.vgpr } -func.func @test_cmpi_eq(%lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> i1 { - %0 = lsir.cmpi i32 eq %lhs, %rhs : !amdgcn.sgpr, !amdgcn.sgpr - return %0 : i1 +func.func @test_cmpi_eq(%dst: !amdgcn.scc, %lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> !amdgcn.scc { + %0 = lsir.cmpi i32 eq %dst, %lhs, %rhs : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %0 : !amdgcn.scc } -func.func @test_cmpi_ne(%lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> i1 { - %0 = lsir.cmpi i32 ne %lhs, %rhs : !amdgcn.sgpr, !amdgcn.sgpr - return %0 : i1 +func.func @test_cmpi_ne(%dst: !amdgcn.scc, %lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> !amdgcn.scc { + %0 = lsir.cmpi i32 ne %dst, %lhs, %rhs : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %0 : !amdgcn.scc } -func.func @test_cmpi_slt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 slt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_slt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 slt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_sle(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 sle %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_sle(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 sle %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_sgt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 sgt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_sgt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 sgt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_sge(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 sge %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_sge(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 sge %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_ult(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 ult %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_ult(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> 
!amdgcn.vcc { + %0 = lsir.cmpi i32 ult %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_ule(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 ule %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_ule(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 ule %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_ugt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 ugt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_ugt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 ugt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_uge(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 uge %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_uge(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 uge %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } func.func @test_extsi(%dst: !amdgcn.sgpr, %value: !amdgcn.sgpr) -> !amdgcn.sgpr { @@ -211,14 +211,14 @@ func.func @test_xori(%dst: !amdgcn.sgpr, %lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) return %0 : !amdgcn.sgpr } -func.func @test_cmpf_olt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpf f32 olt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpf_olt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpf f32 olt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpf_oeq(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpf f32 oeq %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpf_oeq(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpf f32 oeq %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } func.func @test_extf(%dst: !amdgcn.vgpr, %value: !amdgcn.vgpr) -> !amdgcn.vgpr { @@ -360,14 +360,14 @@ func.func @test_select_reg_condition(%dst: !amdgcn.vgpr, %cond: !amdgcn.vgpr, %t return %0 : !amdgcn.vgpr } -func.func @test_select_i1_condition(%dst: !amdgcn.sgpr, %cond: i1, %tv: !amdgcn.sgpr, %fv: !amdgcn.sgpr) -> !amdgcn.sgpr { - %0 = lsir.select %dst, %cond, %tv, %fv : !amdgcn.sgpr, i1, !amdgcn.sgpr, !amdgcn.sgpr +func.func @test_select_scc_condition(%dst: !amdgcn.sgpr, %cond: !amdgcn.scc, %tv: !amdgcn.sgpr, %fv: !amdgcn.sgpr) -> !amdgcn.sgpr { + %0 = lsir.select %dst, %cond, %tv, %fv : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr return %0 : !amdgcn.sgpr } -func.func @test_select_i1_imm_operands(%dst: !amdgcn.sgpr, %cond: i1) -> !amdgcn.sgpr { +func.func @test_select_scc_imm_operands(%dst: !amdgcn.sgpr, %cond: !amdgcn.scc) -> !amdgcn.sgpr { %c42 = arith.constant 42 : i32 %c99 = arith.constant 99 : i32 - %0 = lsir.select %dst, %cond, %c42, %c99 : !amdgcn.sgpr, i1, i32, i32 + %0 = lsir.select %dst, %cond, %c42, %c99 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 return %0 : !amdgcn.sgpr } diff --git a/test/Target/ASM/cbranch.mlir b/test/Target/ASM/cbranch.mlir index 9ec8d81e5..af7bfa659 100644 --- a/test/Target/ASM/cbranch.mlir +++ b/test/Target/ASM/cbranch.mlir @@ -31,11 +31,11 @@ amdgcn.module @mod target = #amdgcn.target isa = 
#amdgcn.isa { ^entry: %s2 = amdgcn.alloca : !amdgcn.sgpr<2> %s3 = amdgcn.alloca : !amdgcn.sgpr<3> - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> amdgcn.cmpi s_cmp_gt_u32 outs %scc ins %s2, %s3 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<2>, !amdgcn.sgpr<3>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<2>, !amdgcn.sgpr<3>) amdgcn.cbranch s_cbranch_scc1 %scc ^loop fallthrough (^exit) - : !amdgcn.scc + : !amdgcn.scc<0> ^exit: amdgcn.end_kernel ^loop: @@ -46,11 +46,11 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^entry: %s0 = amdgcn.alloca : !amdgcn.sgpr<0> %s1 = amdgcn.alloca : !amdgcn.sgpr<1> - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %s0, %s1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) amdgcn.cbranch s_cbranch_scc0 %scc ^true_path fallthrough (^false_path) - : !amdgcn.scc + : !amdgcn.scc<0> ^false_path: amdgcn.end_kernel ^true_path: diff --git a/test/Target/ASM/g2s-load-lds.mlir b/test/Target/ASM/g2s-load-lds.mlir index 01a0586af..992adee29 100644 --- a/test/Target/ASM/g2s-load-lds.mlir +++ b/test/Target/ASM/g2s-load-lds.mlir @@ -19,9 +19,9 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_g2s_dword { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c0 = arith.constant 0 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0<0>, i32 // Buffer descriptor (s[0:3]) and scalar offset (s4) %s0 = amdgcn.alloca : !amdgcn.sgpr<0> @@ -35,7 +35,7 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa %tok = amdgcn.load_lds buffer_load_dword_lds m0 %m0 addr %rsrc offset u(%soff) + d(%voff) + c(%c0) - : ins(!amdgcn.m0, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) + : ins(!amdgcn.m0<0>, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) -> !amdgcn.write_token amdgcn.sopp.s_waitcnt #amdgcn.inst vmcnt = 0 amdgcn.end_kernel @@ -43,10 +43,10 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_g2s_dwordx4 { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c0 = arith.constant 0 : i32 %c64 = arith.constant 64 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0<0>, i32 %s0 = amdgcn.alloca : !amdgcn.sgpr<0> %s1 = amdgcn.alloca : !amdgcn.sgpr<1> @@ -59,7 +59,7 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa %tok = amdgcn.load_lds buffer_load_dwordx4_lds m0 %m0 addr %rsrc offset u(%soff) + d(%voff) + c(%c64) - : ins(!amdgcn.m0, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) + : ins(!amdgcn.m0<0>, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) -> !amdgcn.write_token amdgcn.sopp.s_waitcnt #amdgcn.inst vmcnt = 0 amdgcn.end_kernel diff --git a/test/Target/ASM/loops.mlir b/test/Target/ASM/loops.mlir index e1747f8d1..8f5af5bea 100644 --- a/test/Target/ASM/loops.mlir +++ b/test/Target/ASM/loops.mlir @@ -44,17 +44,17 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^entry: %c5 = arith.constant 5 : i32 %c4 = arith.constant 4 : i32 - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> %s0 = amdgcn.alloca : !amdgcn.sgpr<0> %s1 = amdgcn.alloca : !amdgcn.sgpr<1> amdgcn.sop1 s_mov_b32 outs %s0 ins %c5 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 
s_mov_b32 outs %s1 ins %c4 : !amdgcn.sgpr<1>, i32 amdgcn.cmpi s_cmp_le_i32 outs %scc ins %s0, %s1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) amdgcn.cbranch s_cbranch_scc1 %scc ^then fallthrough (^else) - : !amdgcn.scc + : !amdgcn.scc<0> ^else: amdgcn.end_kernel ^then: @@ -68,7 +68,7 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^entry: %c10 = arith.constant 10 : i32 %c9 = arith.constant 9 : i32 - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> %s2 = amdgcn.alloca : !amdgcn.sgpr<2> %s3 = amdgcn.alloca : !amdgcn.sgpr<3> @@ -78,9 +78,9 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^loop_header: amdgcn.cmpi s_cmp_lt_i32 outs %scc ins %s3, %s2 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<3>, !amdgcn.sgpr<2>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<3>, !amdgcn.sgpr<2>) amdgcn.cbranch s_cbranch_scc0 %scc ^exit fallthrough (^loop_body) - : !amdgcn.scc + : !amdgcn.scc<0> ^loop_body: amdgcn.branch s_branch ^loop_header ^exit: diff --git a/test/Target/ASM/s-mov-m0.mlir b/test/Target/ASM/s-mov-m0.mlir index 0fbc86ef5..b6a2b045c 100644 --- a/test/Target/ASM/s-mov-m0.mlir +++ b/test/Target/ASM/s-mov-m0.mlir @@ -16,17 +16,17 @@ amdgcn.module @m0_mod target = #amdgcn.target isa = #amdgcn.isa { amdgcn.kernel @test_s_mov_m0_imm { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c1024 = arith.constant 1024 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c1024 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c1024 : !amdgcn.m0<0>, i32 amdgcn.end_kernel } amdgcn.kernel @test_s_mov_m0_sgpr { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %s0 = amdgcn.alloca : !amdgcn.sgpr<0> - amdgcn.sop1 s_mov_b32 outs %m0 ins %s0 : !amdgcn.m0, !amdgcn.sgpr<0> + amdgcn.sop1 s_mov_b32 outs %m0 ins %s0 : !amdgcn.m0<0>, !amdgcn.sgpr<0> amdgcn.end_kernel } } diff --git a/test/Target/ASM/vopc-branch.mlir b/test/Target/ASM/vopc-branch.mlir index 233e53aa2..70b9b56e9 100644 --- a/test/Target/ASM/vopc-branch.mlir +++ b/test/Target/ASM/vopc-branch.mlir @@ -33,12 +33,12 @@ amdgcn.module @vopc_branch_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_vcmp_lt_i32_vccnz { ^entry: %v0 = amdgcn.alloca : !amdgcn.vgpr<0> - %vcc = amdgcn.alloca : !amdgcn.vcc + %vcc = amdgcn.alloca : !amdgcn.vcc<0> %c0 = arith.constant 0 : i32 amdgcn.cmpi v_cmp_lt_i32 outs %vcc ins %c0, %v0 - : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) + : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr<0>) amdgcn.cbranch s_cbranch_vccz %vcc ^taken fallthrough(^fallthru) - : !amdgcn.vcc + : !amdgcn.vcc<0> ^fallthru: amdgcn.end_kernel ^taken: @@ -50,11 +50,11 @@ amdgcn.module @vopc_branch_mod target = #amdgcn.target isa = #amdgcn.isa ^entry: %v0 = amdgcn.alloca : !amdgcn.vgpr<0> %v1 = amdgcn.alloca : !amdgcn.vgpr<1> - %vcc = amdgcn.alloca : !amdgcn.vcc + %vcc = amdgcn.alloca : !amdgcn.vcc<0> amdgcn.cmpi v_cmp_eq_i32 outs %vcc ins %v0, %v1 - : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) + : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) amdgcn.cbranch s_cbranch_vccz %vcc ^taken fallthrough(^fallthru) - : !amdgcn.vcc + : !amdgcn.vcc<0> ^fallthru: amdgcn.end_kernel ^taken: @@ -65,12 +65,12 @@ amdgcn.module @vopc_branch_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_vcmp_gt_swap { ^entry: %v0 = amdgcn.alloca : !amdgcn.vgpr<0> - %vcc = amdgcn.alloca : !amdgcn.vcc + %vcc = amdgcn.alloca : !amdgcn.vcc<0> %c32 = 
arith.constant 32 : i32 amdgcn.cmpi v_cmp_gt_i32 outs %vcc ins %c32, %v0 - : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) + : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr<0>) amdgcn.cbranch s_cbranch_vccz %vcc ^taken fallthrough(^fallthru) - : !amdgcn.vcc + : !amdgcn.vcc<0> ^fallthru: amdgcn.end_kernel ^taken: diff --git a/test/integration/g2s-load-lds-e2e.mlir b/test/integration/g2s-load-lds-e2e.mlir index 843665cbe..3728e9124 100644 --- a/test/integration/g2s-load-lds-e2e.mlir +++ b/test/integration/g2s-load-lds-e2e.mlir @@ -102,9 +102,9 @@ amdgcn.module @g2s_e2e_mod target = #amdgcn.target isa = #amdgcn.isa %c44 = arith.constant 44 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c44 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c44 : !amdgcn.m0<0>, i32 // 1 NOP required after SALU writes M0 before G2S (CDNA4 hazard) amdgcn.sopp.sopp #amdgcn.inst , imm = 10 @@ -115,7 +115,7 @@ amdgcn.module @g2s_e2e_mod target = #amdgcn.target isa = #amdgcn.isa LDS[44 + tid*4] %tok_g2s = amdgcn.load_lds buffer_load_dword_lds m0 %m0 addr %src_rsrc offset u(%soffset) + d(%voffset) + c(%c0) - : ins(!amdgcn.m0, !amdgcn.sgpr<[? + 4]>, !amdgcn.sgpr, !amdgcn.vgpr, i32) + : ins(!amdgcn.m0<0>, !amdgcn.sgpr<[? + 4]>, !amdgcn.sgpr, !amdgcn.vgpr, i32) -> !amdgcn.write_token // Wait for G2S to complete (vmcnt tracks buffer loads) diff --git a/test/integration/sreg-roundtrip-e2e.mlir b/test/integration/sreg-roundtrip-e2e.mlir index 25e509d29..e18ac562b 100644 --- a/test/integration/sreg-roundtrip-e2e.mlir +++ b/test/integration/sreg-roundtrip-e2e.mlir @@ -31,15 +31,15 @@ amdgcn.module @m0_roundtrip_mod target = #amdgcn.target isa = #amdgcn.is // Write constant 42 to M0 via s_mov_b32 // M0 is pre-allocated (fixed physical register), so write has no SSA result. - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c42 = arith.constant 42 : i32 amdgcn.sop1 s_mov_b32 outs %m0 ins %c42 - : !amdgcn.m0, i32 + : !amdgcn.m0<0>, i32 // Read M0 back into an SGPR via s_mov_b32 %s_dest = amdgcn.alloca : !amdgcn.sgpr %s_val = amdgcn.sop1 s_mov_b32 outs %s_dest ins %m0 - : !amdgcn.sgpr, !amdgcn.m0 + : !amdgcn.sgpr, !amdgcn.m0<0> // Broadcast scalar to all VGPR lanes via v_mov_b32_e32 %v_dest = amdgcn.alloca : !amdgcn.vgpr diff --git a/test/integration/vopc-branch-e2e.mlir b/test/integration/vopc-branch-e2e.mlir index 50afb2395..a44baa9f9 100644 --- a/test/integration/vopc-branch-e2e.mlir +++ b/test/integration/vopc-branch-e2e.mlir @@ -43,9 +43,10 @@ amdgcn.module @vopc_select_mod target = #amdgcn.target isa = #amdgcn.isa %v_false = amdgcn.vop1.vop1 #amdgcn.inst %v_false_alloc, %c99 : (!amdgcn.vgpr, i32) -> !amdgcn.vgpr %v_out_alloc = amdgcn.alloca : !amdgcn.vgpr - %cmp = lsir.cmpi i32 slt %tid, %c100 : !amdgcn.vgpr, i32 + %vcc0 = lsir.alloca : !amdgcn.vcc + %cmp = lsir.cmpi i32 slt %vcc0, %tid, %c100 : !amdgcn.vcc, !amdgcn.vgpr, i32 %selected = lsir.select %v_out_alloc, %cmp, %v_true, %v_false - : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + : !amdgcn.vgpr, !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr %tok = amdgcn.store global_store_dword data %selected addr %out_ptr offset d(%voffset) + c(%c0) : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? 
+ 2]>, !amdgcn.vgpr, i32) @@ -74,9 +75,10 @@ amdgcn.module @vopc_select_mod target = #amdgcn.target isa = #amdgcn.isa %v_false = amdgcn.vop1.vop1 #amdgcn.inst %v_false_alloc, %c99 : (!amdgcn.vgpr, i32) -> !amdgcn.vgpr %v_out_alloc = amdgcn.alloca : !amdgcn.vgpr - %cmp = lsir.cmpi i32 slt %tid, %c0 : !amdgcn.vgpr, i32 + %vcc1 = lsir.alloca : !amdgcn.vcc + %cmp = lsir.cmpi i32 slt %vcc1, %tid, %c0 : !amdgcn.vcc, !amdgcn.vgpr, i32 %selected = lsir.select %v_out_alloc, %cmp, %v_true, %v_false - : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + : !amdgcn.vgpr, !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr %tok = amdgcn.store global_store_dword data %selected addr %out_ptr offset d(%voffset) + c(%c0) : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? + 2]>, !amdgcn.vgpr, i32) @@ -107,9 +109,10 @@ amdgcn.module @vopc_select_mod target = #amdgcn.target isa = #amdgcn.isa %v_false = amdgcn.vop1.vop1 #amdgcn.inst %v_false_alloc, %c99 : (!amdgcn.vgpr, i32) -> !amdgcn.vgpr %v_out_alloc = amdgcn.alloca : !amdgcn.vgpr - %cmp = lsir.cmpi i32 slt %tid, %c32 : !amdgcn.vgpr, i32 + %vcc2 = lsir.alloca : !amdgcn.vcc + %cmp = lsir.cmpi i32 slt %vcc2, %tid, %c32 : !amdgcn.vcc, !amdgcn.vgpr, i32 %selected = lsir.select %v_out_alloc, %cmp, %v_true, %v_false - : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + : !amdgcn.vgpr, !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr %tok = amdgcn.store global_store_dword data %selected addr %out_ptr offset d(%voffset) + c(%c0) : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? + 2]>, !amdgcn.vgpr, i32)