From e37d0bf41d36e4517b3eba7b68bce1b8c13df197 Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Thu, 9 Apr 2026 15:42:54 +0000 Subject: [PATCH] step Signed-off-by: Fabian Mora --- .../aster/Dialect/AMDGCN/IR/AMDGCNTypes.td | 87 +--- .../Dialect/AMDGCN/Transforms/AMDGCNPasses.td | 41 +- include/aster/Dialect/LSIR/IR/LSIROps.h | 1 + include/aster/Dialect/LSIR/IR/LSIROps.td | 131 ++++- include/aster/Interfaces/RegisterType.td | 39 +- include/aster/Transforms/Passes.td | 20 + lib/Analysis/MemoryDependenceAnalysis.cpp | 6 +- .../AMDGCN/CodeGen/CodeGenPatterns.cpp | 8 + lib/Dialect/AMDGCN/IR/AMDGCN.cpp | 2 +- lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp | 3 +- lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp | 23 +- lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp | 4 +- .../AMDGCN/Transforms/AMDGCNBufferization.cpp | 4 +- lib/Dialect/AMDGCN/Transforms/CMakeLists.txt | 1 - lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp | 483 ++---------------- lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp | 4 +- .../AMDGCN/Transforms/ToAMDGCNPatterns.cpp | 145 +++++- .../AMDGCN/Transforms/ToRegisterSemantics.cpp | 3 +- lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp | 104 ++-- lib/Dialect/LSIR/IR/LSIROps.cpp | 30 ++ lib/Target/ASM/TranslateModule.cpp | 4 +- lib/Transforms/CMakeLists.txt | 2 + .../Transforms/ConvertSCFControlFlow.cpp | 82 +-- python/aster/pass_pipelines.py | 2 +- .../AMDGCN/Analysis/cdna3-hazards.mlir | 16 +- .../AMDGCN/Analysis/range-constraints.mlir | 22 +- .../Analysis/register-interference.mlir | 10 +- .../AMDGCN/Analysis/register-liveness.mlir | 71 +-- .../IR/normal-forms-no-lsir-compute-ops.mlir | 6 +- ...mal-forms-no-lsir-control-ops-invalid.mlir | 6 +- .../AMDGCN/Transforms/bufferization.mlir | 34 +- .../chained-select-dps-violation.mlir | 24 +- .../Transforms/convert-scf-iter-args.mlir | 2 +- .../AMDGCN/Transforms/convert-scf-nf.mlir | 11 +- .../AMDGCN/Transforms/convert-scf.mlir | 41 +- .../AMDGCN/Transforms/legalize-cf-nf.mlir | 5 +- .../AMDGCN/Transforms/legalize-cf.mlir | 236 +++------ .../AMDGCN/Transforms/legalize-operands.mlir | 42 +- test/Dialect/AMDGCN/cmp-ops.mlir | 34 +- .../LSIR/CodeGen/arith-minmax-codegen.mlir | 6 +- test/Dialect/LSIR/Transforms/codegen-cf.mlir | 40 +- .../LSIR/Transforms/codegen-func-cf.mlir | 6 +- test/Dialect/LSIR/ops.mlir | 80 +-- test/Target/ASM/cbranch.mlir | 12 +- test/Target/ASM/g2s-load-lds.mlir | 12 +- test/Target/ASM/loops.mlir | 12 +- test/Target/ASM/s-mov-m0.mlir | 8 +- test/Target/ASM/vopc-branch.mlir | 18 +- test/integration/g2s-load-lds-e2e.mlir | 6 +- test/integration/sreg-roundtrip-e2e.mlir | 6 +- test/integration/vopc-branch-e2e.mlir | 15 +- 51 files changed, 833 insertions(+), 1177 deletions(-) rename lib/{Dialect/AMDGCN => }/Transforms/ConvertSCFControlFlow.cpp (73%) diff --git a/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td b/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td index f48883376..64a8e2d50 100644 --- a/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td +++ b/include/aster/Dialect/AMDGCN/IR/AMDGCNTypes.td @@ -50,14 +50,9 @@ def AGPRType : AMDGCN_RegisterDef<"AGPR", "agpr", [MemRefElementTypeInterface]> let genVerifyDecl = 1; let extraClassDeclaration = [{ Register getReg() const { return getRange().begin(); } - - /// Returns true if the register is relocatable. 
- bool isRelocatable() const { return getReg().isRelocatable(); } - //===------------------------------------------------------------------===// // RegisterTypeInterface //===------------------------------------------------------------------===// - bool isRegisterRange() const { return getRange().size() > 1; } RegisterRange getAsRange() const { return getRange(); } @@ -67,9 +62,6 @@ def AGPRType : AMDGCN_RegisterDef<"AGPR", "agpr", [MemRefElementTypeInterface]> RegisterTypeInterface cloneRegisterType(RegisterRange range) const { return AGPRType::get(getContext(), range); } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return AGPRType::get(getContext(), RegisterRange(reg, 1)); - } //===------------------------------------------------------------------===// // ResourceTypeInterface @@ -97,14 +89,9 @@ def SGPRType : AMDGCN_RegisterDef<"SGPR", "sgpr", [MemRefElementTypeInterface]> let genVerifyDecl = 1; let extraClassDeclaration = [{ Register getReg() const { return getRange().begin(); } - - /// Returns true if the register is relocatable. - bool isRelocatable() const { return getReg().isRelocatable(); } - //===------------------------------------------------------------------===// // RegisterTypeInterface //===------------------------------------------------------------------===// - bool isRegisterRange() const { return getRange().size() > 1; } RegisterRange getAsRange() const { return getRange(); } @@ -114,9 +101,6 @@ def SGPRType : AMDGCN_RegisterDef<"SGPR", "sgpr", [MemRefElementTypeInterface]> RegisterTypeInterface cloneRegisterType(RegisterRange range) const { return SGPRType::get(getContext(), range); } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return SGPRType::get(getContext(), RegisterRange(reg, 1)); - } //===------------------------------------------------------------------===// // ResourceTypeInterface @@ -145,13 +129,9 @@ def VGPRType : AMDGCN_RegisterDef<"VGPR", "vgpr", [MemRefElementTypeInterface]> let extraClassDeclaration = [{ Register getReg() const { return getRange().begin(); } - /// Returns true if the register is relocatable. - bool isRelocatable() const { return getRange().begin().isRelocatable(); } - //===------------------------------------------------------------------===// // RegisterTypeInterface //===------------------------------------------------------------------===// - bool isRegisterRange() const { return getRange().size() > 1; } RegisterRange getAsRange() const { return getRange(); } @@ -161,9 +141,6 @@ def VGPRType : AMDGCN_RegisterDef<"VGPR", "vgpr", [MemRefElementTypeInterface]> RegisterTypeInterface cloneRegisterType(RegisterRange range) const { return VGPRType::get(getContext(), range); } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return VGPRType::get(getContext(), RegisterRange(reg, 1)); - } //===------------------------------------------------------------------===// // ResourceTypeInterface @@ -176,71 +153,43 @@ def VGPRType : AMDGCN_RegisterDef<"VGPR", "vgpr", [MemRefElementTypeInterface]> // SREG like types //===----------------------------------------------------------------------===// -/// Special registers to model state. 
-def SREGType : AMDGCN_RegisterDef<"SREG", "sreg", [MemRefElementTypeInterface]> { - let summary = "SREG type"; - let parameters = (ins - DefaultValuedParameter<"Register", "Register()">:$reg, - "SregKind":$kind); - let assemblyFormat = "`<`$kind (`,` $reg^)?`>`"; - let genVerifyDecl = 1; - let extraClassDeclaration = [{ - /// Returns true if the register is relocatable. - bool isRelocatable() const { return getReg().isRelocatable(); } - - //===------------------------------------------------------------------===// - // RegisterTypeInterface - //===------------------------------------------------------------------===// - bool isRegisterRange() const { return false; } - RegisterRange getAsRange() const { - return RegisterRange(getReg(), 1); - } - RegisterKind getRegisterKind() const { - return RegisterKind::SREG; - } - RegisterTypeInterface cloneRegisterType(RegisterRange range) const { - assert(range.size() == 1 && "SREG type can only clone single register"); - return SREGType::get(getContext(), range.begin(), getKind()); - } - RegisterTypeInterface cloneRegisterType(Register reg) const { - return SREGType::get(getContext(), reg, getKind()); - } - - //===------------------------------------------------------------------===// - // ResourceTypeInterface - //===------------------------------------------------------------------===// - Resource *getResource() const; - }]; -} - /// Special registers to model state. class SREGBase : AMDGCN_RegisterDef { let summary = kind # " special register type"; - let assemblyFormat = ""; + let parameters = (ins + DefaultValuedParameter<"Register", "Register()">:$reg + ); + let assemblyFormat = "(`<` $reg^ `>`)?"; + let builders = [ + TypeBuilder<(ins CArg<"Register", "Register(0)">:$reg), [{ + return $_get($_ctxt, normalizeRegister(reg)); + }]> + ]; + let skipDefaultBuilders = 1; string declarations = StrSubst<[{ /// The register kind for this SREG type. static constexpr RegisterKind kRegisterKind = RegisterKind::$kind; }], [VarRepl<"kind", kind>]>.result; let extraClassDeclaration = declarations # [{ - /// Returns true if the register is relocatable. 
-    bool isRelocatable() const { return false; }
+    static Register normalizeRegister(Register reg) {
+      if (reg.getSemantics() == RegisterSemantics::Unallocated)
+        return Register(0);
+      return reg;
+    }
 
     //===------------------------------------------------------------------===//
     // RegisterTypeInterface
     //===------------------------------------------------------------------===//
-    bool isRegisterRange() const { return false; }
     RegisterRange getAsRange() const {
-      return RegisterRange(Register(0), 1);
+      return RegisterRange(getReg(), 1);
     }
     RegisterKind getRegisterKind() const {
       return kRegisterKind;
     }
     RegisterTypeInterface cloneRegisterType(RegisterRange range) const {
-      return get(getContext());
-    }
-    RegisterTypeInterface cloneRegisterType(Register reg) const {
-      return get(getContext());
+      assert(range.size() == 1 && "SREG type can only clone single register");
+      return get(getContext(), range.begin());
     }
 
     //===------------------------------------------------------------------===//
diff --git a/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td b/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td
index dee4d1363..e6eaefecf 100644
--- a/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td
+++ b/include/aster/Dialect/AMDGCN/Transforms/AMDGCNPasses.td
@@ -291,44 +291,19 @@ def ConvertLDSBuffers : Pass<"amdgcn-convert-lds-buffers"> {
   ];
 }
 
-def ConvertSCFControlFlow : Pass<"amdgcn-convert-scf-control-flow"> {
-  let summary = "Convert SCF control flow to AMDGCN control flow instructions";
-  let description = [{
-    This pass converts SCF structured control flow operations (such as scf.for,
-    scf.if, scf.while) to AMDGCN control flow instructions.
-
-    The pass first runs thread uniform analysis to determine whether loop
-    induction variables and conditions are uniform across all threads. Based on
-    this analysis:
-
-    - For thread-uniform conditions: emit scalar compare instructions (s_cmp_*)
-      and branch on SCC
-    - For thread-divergent conditions: emit vector compare instructions (v_cmp_*)
-      and branch on VCC/VCCZ
-
-    This pass should run after the ABI has been set and before register
-    allocation.
-
-    Post-condition: #amdgcn.no_scf_ops
-  }];
-  let dependentDialects = [
-    "mlir::aster::amdgcn::AMDGCNDialect",
-    "mlir::aster::lsir::LSIRDialect",
-    "mlir::cf::ControlFlowDialect"
-  ];
-}
-
 def LegalizeCF : Pass<"amdgcn-legalize-cf"> {
   let summary = "Legalize CF dialect ops to AMDGCN scalar branch instructions";
   let description = [{
-    This pass legalizes CF dialect operations (cf.cond_br, cf.br) and lsir.cmpi
-    to AMDGCN scalar branch and compare instructions. It runs after register
-    allocation when operands are in physical registers and values flow through
-    side effects.
+    This pass legalizes branch operations (lsir.cond_br, lsir.br) to AMDGCN
+    scalar branch instructions. It runs after register allocation, when
+    operands are in physical registers and values flow through side effects.
+
+    The pass expects the lsir.cond_br condition to be an SCC or VCC register,
+    written by a previously lowered compare.
 
     Transformations:
-    - lsir.cmpi (returns i1) -> s_cmp_* (sets SCC flag)
-    - cf.cond_br -> s_cbranch_scc1 / scc0 + s_branch
-    - cf.br -> s_branch
+    - lsir.cond_br (SCC condition) -> s_cbranch_scc1/scc0 + s_branch
+    - lsir.cond_br (VCC condition) -> s_cbranch_vccnz/vccz + s_branch
+    - lsir.br -> s_branch
 
     Pre-condition: #amdgcn.all_registers_allocated
diff --git a/include/aster/Dialect/LSIR/IR/LSIROps.h b/include/aster/Dialect/LSIR/IR/LSIROps.h
index 9c1802ce5..a7bcac601 100644
--- a/include/aster/Dialect/LSIR/IR/LSIROps.h
+++ b/include/aster/Dialect/LSIR/IR/LSIROps.h
@@ -25,6 +25,7 @@
 #include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/include/aster/Dialect/LSIR/IR/LSIROps.td b/include/aster/Dialect/LSIR/IR/LSIROps.td
index f35033c96..c66bb9d64 100644
--- a/include/aster/Dialect/LSIR/IR/LSIROps.td
+++ b/include/aster/Dialect/LSIR/IR/LSIROps.td
@@ -21,6 +21,7 @@ include "aster/Dialect/LSIR/IR/LSIRTypes.td"
 include "aster/Interfaces/AllocaOpInterface.td"
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/Interfaces/InferIntRangeInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -117,6 +118,90 @@ def LSIR_AllocaOp : LSIR_Op<"alloca", [Pure, AllocaOpInterface]> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// BrOp
+//===----------------------------------------------------------------------===//
+
+def LSIR_BranchOp : LSIR_Op<"br", [
+    DeclareOpInterfaceMethods<BranchOpInterface>,
+    Pure, Terminator]> {
+  let summary = "Unconditional branch operation";
+  let description = [{
+    The `lsir.br` operation unconditionally branches to a destination block,
+    passing the given operands as block arguments. Unlike `cf.br`, this op is
+    used after codegen, when all values are in register types.
+
+    Example:
+    ```mlir
+    lsir.br ^bb1(%val : !amdgcn.sgpr)
+    ```
+  }];
+  let arguments = (ins Variadic<AnyType>:$destOperands);
+  let successors = (successor AnySuccessor:$dest);
+  let builders = [
+    OpBuilder<(ins "::mlir::Block *":$dest,
+                   CArg<"::mlir::ValueRange", "{}">:$destOperands), [{
+      $_state.addSuccessors(dest);
+      $_state.addOperands(destOperands);
+    }]>
+  ];
+  let assemblyFormat = [{
+    $dest (`(` $destOperands^ `:` type($destOperands) `)`)? attr-dict
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// CondBrOp
+//===----------------------------------------------------------------------===//
+
+def LSIR_CondBranchOp : LSIR_Op<"cond_br",
+    [AttrSizedOperandSegments,
+     DeclareOpInterfaceMethods<BranchOpInterface>,
+     Pure, Terminator]> {
+  let summary = "Conditional branch operation";
+  let description = [{
+    The `lsir.cond_br` operation branches conditionally based on a register
+    condition (SCC or VCC). Unlike `cf.cond_br`, which takes `i1`, this op
+    takes a register type directly. It branches to `trueDest` when the
+    condition register is nonzero and to `falseDest` otherwise.
+
+    Example:
+    ```mlir
+    lsir.cond_br %cond : !amdgcn.scc, ^bb1(%val : !amdgcn.sgpr), ^bb2
+    ```
+  }];
+  let arguments = (ins
+    RegType:$condition,
+    Variadic<AnyType>:$trueDestOperands,
+    Variadic<AnyType>:$falseDestOperands
+  );
+  let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest);
+  let builders = [
+    OpBuilder<(ins "::mlir::Value":$condition,
+                   "::mlir::Block *":$trueDest,
+                   "::mlir::ValueRange":$trueOperands,
+                   "::mlir::Block *":$falseDest,
+                   "::mlir::ValueRange":$falseOperands), [{
+      $_state.addOperands(condition);
+      $_state.addOperands(trueOperands);
+      $_state.addOperands(falseOperands);
+      $_state.addAttribute(
+          getOperandSegmentSizeAttr(),
+          $_builder.getDenseI32ArrayAttr(
+              {1, static_cast<int32_t>(trueOperands.size()),
+               static_cast<int32_t>(falseOperands.size())}));
+      $_state.addSuccessors(trueDest);
+      $_state.addSuccessors(falseDest);
+    }]>
+  ];
+  let assemblyFormat = [{
+    $condition `:` type($condition) `,`
+    $trueDest (`(` $trueDestOperands^ `:` type($trueDestOperands) `)`)? `,`
+    $falseDest (`(` $falseDestOperands^ `:` type($falseDestOperands) `)`)?
+    attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // AndIOp
 //===----------------------------------------------------------------------===//
@@ -171,26 +256,29 @@ def LSIR_AssumeNoaliasOp : LSIR_Op<"assume_noalias", [
 // CmpFOp
 //===----------------------------------------------------------------------===//
 
-def LSIR_CmpFOp : LSIR_InstOp<"cmpf", [PureInst]> {
+def LSIR_CmpFOp : LSIR_InstOp<"cmpf", [InstInferType, PureInst]> {
   let summary = "Floating point comparison operation";
   let description = [{
-    The `lsir.cmpf` operation compares two floating point values and returns i1.
-    The `i1` return value is kept late in the pipeline and is only translated to
-    SCC after register allocation, together with cf branch operations.
+    The `lsir.cmpf` operation compares two floating point values and writes the
+    result to a destination register (VCC for vector operands). Follows
+    destination-passing style: the caller allocates the output register and
+    passes it as `$dst`; the operation returns it.
 
     Example:
     ```mlir
-    %result = lsir.cmpf f32 olt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr
+    %result = lsir.cmpf f32 olt %dst, %lhs, %rhs
+        : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr
     ```
   }];
   let leadingArguments = (ins
     AnyFloatTypeAttr:$semantics,
     Arith_CmpFPredicateAttr:$predicate
   );
+  let outputs = (ins RegType:$dst);
   let inputs = (ins FloatOrRegType:$lhs, FloatOrRegType:$rhs);
-  let leadingResults = (outs I1:$result);
   let assemblyFormat = [{
-    $semantics $predicate $lhs `,` $rhs attr-dict `:` type($lhs) `,` type($rhs)
+    $semantics $predicate $dst `,` $lhs `,` $rhs attr-dict
+    `:` type($dst) `,` type($lhs) `,` type($rhs)
   }];
 }
 
@@ -198,27 +286,29 @@
 // CmpIOp
 //===----------------------------------------------------------------------===//
 
-def LSIR_CmpIOp : LSIR_InstOp<"cmpi", [PureInst]> {
+def LSIR_CmpIOp : LSIR_InstOp<"cmpi", [InstInferType, PureInst]> {
   let summary = "Integer comparison operation";
   let description = [{
     The `lsir.cmpi` operation compares two integer operands according to the
-    specified predicate and returns i1.
-    The `i1` return value is kept late in the pipeline and is only translated to
-    SCC after register allocation, together with cf branch operations.
+    specified predicate and writes the result to a destination register (SCC or
+    VCC).
Follows destination-passing style: the caller allocates the output + register and passes it as `$dst`; the operation returns it. Example: ```mlir - %result = lsir.cmpi i32 eq %lhs, %rhs : !amdgcn.sgpr, !amdgcn.sgpr + %result = lsir.cmpi i32 eq %dst, %lhs, %rhs + : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr ``` }]; let leadingArguments = (ins AnyIntTypeAttr:$semantics, Arith_CmpIPredicateAttr:$predicate ); + let outputs = (ins RegType:$dst); let inputs = (ins IntOrRegType:$lhs, IntOrRegType:$rhs); - let leadingResults = (outs I1:$result); let assemblyFormat = [{ - $semantics $predicate $lhs `,` $rhs attr-dict `:` type($lhs) `,` type($rhs) + $semantics $predicate $dst `,` $lhs `,` $rhs attr-dict + `:` type($dst) `,` type($lhs) `,` type($rhs) }]; } @@ -713,24 +803,19 @@ def LSIR_RemUIOp : BinaryIOp<"remui"> { def LSIR_SelectOp : LSIR_InstOp<"select", [PureInst, InstInferType]> { let summary = "Select operation"; let description = [{ - The `lsir.select` operation selects between two values based on a condition. - The condition can be either a register type or i1 (from lsir.cmpi/cmpf). - - When the condition is i1, LegalizeCF fuses the cmpi + select into - s_cmp + s_cselect_b32. When the condition is a register, it lowers to - a conditional move instruction. + The `lsir.select` operation selects between two values based on a register + condition (SCC or VCC). LegalizeCF lowers this to s_cselect_b32 (SCC) or + v_cndmask_b32 (VCC). Example: ```mlir %result = lsir.select %dst, %cond, %tv, %fv : !amdgcn.sgpr, !amdgcn.sgpr, !amdgcn.sgpr, !amdgcn.sgpr - %result = lsir.select %dst, %i1_cond, %tv, %fv - : !amdgcn.sgpr, i1, i32, i32 ``` }]; let outputs = (ins RegType:$dst); let inputs = (ins - IntFloatOrRegType:$condition, + RegType:$condition, IntFloatOrRegType:$true_value, IntFloatOrRegType:$false_value ); diff --git a/include/aster/Interfaces/RegisterType.td b/include/aster/Interfaces/RegisterType.td index 9c2e4febc..58b9c7ce8 100644 --- a/include/aster/Interfaces/RegisterType.td +++ b/include/aster/Interfaces/RegisterType.td @@ -24,23 +24,13 @@ include "aster/Interfaces/ResourceInterfaces.td" //===----------------------------------------------------------------------===// def RegisterTypeInterface : TypeInterface<"RegisterTypeInterface", [ - ResourceTypeInterface + ResourceTypeInterface, ]> { let description = [{ This interface defines a common API for interacting with register types. }]; let cppNamespace = "::mlir::aster"; let methods = [ - InterfaceMethod<[{ - This method returns whether the register is relocatable. - }], - "bool", "isRelocatable" - >, - InterfaceMethod<[{ - This method returns whether the type is a register range. - }], - "bool", "isRegisterRange" - >, InterfaceMethod<[{ This method returns the register's range. }], @@ -53,24 +43,18 @@ def RegisterTypeInterface : TypeInterface<"RegisterTypeInterface", [ (ins "::mlir::aster::RegisterRange":$range) >, InterfaceMethod<[{ - This method returns a clone of the register. - }], - "::mlir::aster::RegisterTypeInterface", "cloneRegisterType", - (ins "::mlir::aster::Register":$reg) - >, - InterfaceMethod<[{ - This method returns the size in bytes of the register(s). - Each register is 4 bytes (32-bit). Returns std::nullopt if the type + This method returns the size in bits of the register(s). + Each register is 32 bits. Returns std::nullopt if the type is not a valid register type. 
}], - "std::optional", "getSizeInBytes", (ins), [{}], [{ - return $_type.getAsRange().size() * 4; + "std::optional", "getSizeInBits", (ins), [{}], [{ + return $_type.getAsRange().size() * 32; }] > ]; let extraTraitClassDeclaration = [{ bool isAllocatable() const { - return $_type.isRelocatable(); + return !$_type.hasAllocatedSemantics(); } }]; let extraSharedClassDeclaration = [{ @@ -103,6 +87,17 @@ def RegisterTypeInterface : TypeInterface<"RegisterTypeInterface", [ ::mlir::aster::RegisterTypeInterface getAsValue() const { return $_type.cloneRegisterType($_type.getAsRange().getAsValueRange()); } + + /// Returns true if this register type represents a range of registers. + bool isRegisterRange() const { + return $_type.getAsRange().size() > 1; + } + + /// This method returns a clone of the register type using the provided + /// register as the base of the range. + ::mlir::aster::RegisterTypeInterface cloneRegisterType(::mlir::aster::Register reg) const { + return $_type.cloneRegisterType(::mlir::aster::RegisterRange(reg, 1)); + } }]; } diff --git a/include/aster/Transforms/Passes.td b/include/aster/Transforms/Passes.td index fc49182a8..1e9839312 100644 --- a/include/aster/Transforms/Passes.td +++ b/include/aster/Transforms/Passes.td @@ -345,4 +345,24 @@ def CFGSimplification : Pass<"aster-cfg-simplification"> { let dependentDialects = []; } +//===----------------------------------------------------------------------===// +// ConvertSCFControlFlow +//===----------------------------------------------------------------------===// + +def ConvertSCFControlFlow : Pass<"aster-convert-scf-control-flow"> { + let summary = "Convert SCF control flow to CF dialect with basic block structure"; + let description = [{ + Converts scf.for and scf.if to cf dialect operations with explicit basic + block structure. Handles both integer and index induction variable types. + Does not check for thread uniformity. + + Post-condition: no scf.for or scf.if ops remain. + }]; + let dependentDialects = [ + "mlir::arith::ArithDialect", + "mlir::cf::ControlFlowDialect", + "mlir::scf::SCFDialect" + ]; +} + #endif // ASTER_TRANSFORMS_PASSES_TD diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 909b9ea80..6dab1294b 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -177,9 +177,9 @@ bool MemoryDependenceAnalysis::isStoreOp(Operation *op) { static int64_t computeAccessLength(Type type) { if (auto regType = dyn_cast(type)) { - std::optional sizeInBytes = regType.getSizeInBytes(); - assert(sizeInBytes.has_value() && "register type must have valid size"); - return *sizeInBytes; + std::optional sizeInBits = regType.getSizeInBits(); + assert(sizeInBits.has_value() && "register type must have valid size"); + return *sizeInBits / 8; } // Conservative: assume 4 bytes if we can't determine precisely. return 4; diff --git a/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp b/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp index da1c05f12..91424ff02 100644 --- a/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp +++ b/lib/Dialect/AMDGCN/CodeGen/CodeGenPatterns.cpp @@ -301,6 +301,14 @@ static Type convertTypeImpl(Value value, const CodeGenConverter &converter) { if (isa(value.getType())) return value.getType(); + // i1 values map to SCC (thread-uniform) or VCC (divergent). 
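+  // For example, a condition computed from the workgroup id is the same for
+  // every lane and maps to !amdgcn.scc, while one computed from the lane id
+  // diverges within the wave and maps to !amdgcn.vcc.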
+ if (value.getType().isInteger(1)) { + std::optional isUniform = converter.isThreadUniform(value); + if (isUniform.has_value() && *isUniform) + return amdgcn::SCCType::get(value.getContext(), Register()); + return amdgcn::VCCType::get(value.getContext(), Register()); + } + int64_t typeSize = converter.getTypeSize(value.getType()); int64_t numWords = (typeSize + 3) / 4; diff --git a/lib/Dialect/AMDGCN/IR/AMDGCN.cpp b/lib/Dialect/AMDGCN/IR/AMDGCN.cpp index ef88622fb..e63c197d4 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCN.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCN.cpp @@ -798,7 +798,7 @@ void LoadToLDSOp::getEffects( /// Check if a type is an unallocated register (relocatable). static bool isUnallocatedRegister(Type type) { auto regType = dyn_cast(type); - return regType && regType.isRelocatable(); + return regType && !regType.hasAllocatedSemantics(); } /// Parse the output types for CmpIOp. diff --git a/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp b/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp index d9cdfbfe8..05f1da1d2 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCNAttrs.cpp @@ -301,7 +301,8 @@ LogicalResult NoLsirComputeOpsAttr::verifyOperation( // Allow control-flow ops (lowered by LegalizeCF) and copy (regalloc // primitive). - if (isa(op)) + if (isa(op)) return success(); return emitError() << "normal form violation: LSIR compute/memory " diff --git a/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp b/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp index 63ca67319..efce063c7 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCNTypes.cpp @@ -37,7 +37,7 @@ LogicalResult verifyRegisterRange(function_ref emitError, return emitError() << "align must be a power of 2, got " << alignment; // Check alignment if the range is allocated - if (!range.begin().isRelocatable()) { + if (range.getSemantics() == RegisterSemantics::Allocated) { if (alignment <= 0) return emitError() << "align must be positive, got " << alignment; @@ -100,24 +100,3 @@ LogicalResult VGPRType::verify(function_ref emitError, } Resource *VGPRType::getResource() const { return VGPRResource::get(); } - -//===----------------------------------------------------------------------===// -// SREG types -//===----------------------------------------------------------------------===// - -LogicalResult SREGType::verify(function_ref emitError, - Register reg, SregKind kind) { - if (!reg.isValid()) - return emitError() << "SREG must be non-negative"; - switch (kind) { - case SregKind::Scc: { - if (!reg.isRelocatable() && reg.getRegister() != 0) { - return emitError() << "SCC SREG must be register 0"; - } - break; - } - } - return success(); -} - -Resource *SREGType::getResource() const { return SGPRResource::get(); } diff --git a/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp b/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp index 65eed9190..db747d972 100644 --- a/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp +++ b/lib/Dialect/AMDGCN/IR/AMDGCNVerifiers.cpp @@ -79,7 +79,7 @@ static LogicalResult checkOperand(Operation *op, Type type, int32_t pos, .attachNote(state.getLoc()) << "is invalid"; } - if (!allowUnallocated && regTy.isRelocatable()) { + if (!allowUnallocated && !regTy.hasAllocatedSemantics()) { return (op->emitError(direction + " operand ") << pos << " is unallocated register type: " << type) .attachNote(state.getLoc()) @@ -99,7 +99,7 @@ static LogicalResult checkValue(Operation *op, Type type, int32_t pos, .attachNote(state.getLoc()) << "is invalid"; } - if (!allowUnallocated && regTy.isRelocatable()) { + if 
(!allowUnallocated && !regTy.hasAllocatedSemantics()) {
     return (op->emitError(direction)
             << pos << " is unallocated register type: " << type)
         .attachNote(state.getLoc())
diff --git a/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp b/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp
index ed506a614..07f3a88fe 100644
--- a/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp
+++ b/lib/Dialect/AMDGCN/Transforms/AMDGCNBufferization.cpp
@@ -294,8 +294,8 @@ void BufferizationImpl::handlePhiForwardGroup(IRRewriter &rewriter,
   }
 
   // Create a branch op to the block to forward to.
-  cf::BranchOp::create(rewriter, std::get<0>(phiForwards[start])->getLoc(),
-                       std::get<3>(phiForwards[start]), fwdValues);
+  lsir::BranchOp::create(rewriter, std::get<0>(phiForwards[start])->getLoc(),
+                         std::get<3>(phiForwards[start]), fwdValues);
 }
 
 void BufferizationImpl::handleBlocksAndTerminators(IRRewriter &rewriter,
diff --git a/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt b/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt
index 535c29c4f..9e68a081d 100644
--- a/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt
+++ b/lib/Dialect/AMDGCN/Transforms/CMakeLists.txt
@@ -3,7 +3,6 @@ add_mlir_library(AMDGCNTransforms
   AMDGCNBufferization.cpp
   AMDGCNHazards.cpp
   ConvertLDSBuffers.cpp
-  ConvertSCFControlFlow.cpp
   ConvertWaits.cpp
   ExpandMetadataOps.cpp
   HoistIterArgWaits.cpp
diff --git a/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp b/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp
index 65773d1cf..5fe91bba2 100644
--- a/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp
+++ b/lib/Dialect/AMDGCN/Transforms/LegalizeCF.cpp
@@ -8,14 +8,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass legalizes CF dialect operations (cf.cond_br, cf.br) and lsir.cmpi
-// to AMDGCN scalar branch and compare instructions. It runs after register
-// allocation when operands are in physical registers.
+// This pass legalizes branch operations (lsir.cond_br, lsir.br) to AMDGCN
+// scalar branch instructions. It runs after register allocation, when operands
+// are in physical registers.
 //
-// Transformations:
-// - lsir.cmpi (SGPR/i32 operands) -> s_cmp_* (sets SCC flag)
-// - lsir.cmpi (VGPR operands) -> v_cmp_* (sets VCC flag)
-// - cf.cond_br -> s_cbranch_scc1/scc0 or s_cbranch_vccnz/vccz + s_branch
-// - cf.br -> s_branch
+// The condition of lsir.cond_br is an SCC or VCC register written by a
+// previously lowered compare. The transformation:
+// - lsir.cond_br (SCC or VCC condition) -> s_cbranch_scc1/scc0 or
+//   s_cbranch_vccnz/vccz + s_branch
+// - lsir.br -> s_branch
 //
 //===----------------------------------------------------------------------===//
 
@@ -24,13 +24,12 @@
 #include "aster/Dialect/AMDGCN/IR/AMDGCNOps.h"
 #include "aster/Dialect/AMDGCN/IR/AMDGCNTypes.h"
 #include "aster/Dialect/AMDGCN/Transforms/Passes.h"
-#include "aster/Dialect/LSIR/IR/LSIRDialect.h"
 #include "aster/Dialect/LSIR/IR/LSIROps.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallVector.h"
 
 namespace mlir::aster {
 namespace amdgcn {
@@ -54,175 +53,21 @@ struct LegalizeCF : public amdgcn::impl::LegalizeCFBase<LegalizeCF> {
   void runOnOperation() override;
 
 private:
-  /// Verify i1 lifetime constraints for SCC/VCC registers:
-  /// 1. No i1 value is used across block boundaries (flag reg not preserved).
-  /// 2. No two lsir.cmp ops have overlapping lifetimes within a block
-  ///    (clobber).
-  LogicalResult verifyI1Lifetimes(Operation *op);
-
-  /// Get or create the lowered amdgcn.cmpi + alloca for an lsir.cmpi.
-  /// Selects s_cmp_* (SCC) for scalar operands, v_cmp_* (VCC) for vector.
-  /// On first call for a given cmpOp, creates the alloca and cmpi at the
-  /// original lsir.cmpi location. On subsequent calls, returns the cached
-  /// result.
-  Value getOrCreateLoweredCmp(lsir::CmpIOp cmpOp, IRRewriter &rewriter);
-
-  /// Lower lsir.cmpi + cf.cond_br pattern to AMDGCN compare + branch.
-  LogicalResult lowerCondBranch(cf::CondBranchOp condBr);
-
-  /// Lower cf.br to s_branch.
-  LogicalResult lowerBranch(cf::BranchOp br);
-
-  /// Lower lsir.cmpi + lsir.select(i1) pattern to s_cmp + s_cselect_b32
-  /// or v_cmp + v_cndmask_b32.
-  LogicalResult lowerSelect(lsir::SelectOp selectOp);
-
-  /// Map from original lsir.cmpi to the SCC/VCC alloca value from the lowered
-  /// amdgcn.cmpi. Used to deduplicate compare lowering on fan-out.
-  DenseMap<Operation *, Value> loweredCmpMap;
-};
-
-/// Map arith::CmpIPredicate to the appropriate s_cmp_* opcode (scalar).
-static OpCode getScalarCompareOpCode(arith::CmpIPredicate predicate) {
-  switch (predicate) {
-  case arith::CmpIPredicate::eq:
-    return OpCode::S_CMP_EQ_I32;
-  case arith::CmpIPredicate::ne:
-    return OpCode::S_CMP_LG_I32;
-  case arith::CmpIPredicate::slt:
-    return OpCode::S_CMP_LT_I32;
-  case arith::CmpIPredicate::sle:
-    return OpCode::S_CMP_LE_I32;
-  case arith::CmpIPredicate::sgt:
-    return OpCode::S_CMP_GT_I32;
-  case arith::CmpIPredicate::sge:
-    return OpCode::S_CMP_GE_I32;
-  case arith::CmpIPredicate::ult:
-    return OpCode::S_CMP_LT_U32;
-  case arith::CmpIPredicate::ule:
-    return OpCode::S_CMP_LE_U32;
-  case arith::CmpIPredicate::ugt:
-    return OpCode::S_CMP_GT_U32;
-  case arith::CmpIPredicate::uge:
-    return OpCode::S_CMP_GE_U32;
-  }
-  llvm_unreachable("Unknown CmpIPredicate");
-}
-
-/// Map arith::CmpIPredicate to the appropriate v_cmp_* opcode (vector, 32-bit
-/// encoding). The 32-bit VOPC encoding requires rhs (src1) to be a VGPR.
-/// If operands need swapping, the predicate should be flipped first.
-static OpCode getVectorCompareOpCode(arith::CmpIPredicate predicate) {
-  switch (predicate) {
-  case arith::CmpIPredicate::eq:
-    return OpCode::V_CMP_EQ_I32;
-  case arith::CmpIPredicate::ne:
-    return OpCode::V_CMP_NE_I32;
-  case arith::CmpIPredicate::slt:
-    return OpCode::V_CMP_LT_I32;
-  case arith::CmpIPredicate::sle:
-    return OpCode::V_CMP_LE_I32;
-  case arith::CmpIPredicate::sgt:
-    return OpCode::V_CMP_GT_I32;
-  case arith::CmpIPredicate::sge:
-    return OpCode::V_CMP_GE_I32;
-  case arith::CmpIPredicate::ult:
-    return OpCode::V_CMP_LT_U32;
-  case arith::CmpIPredicate::ule:
-    return OpCode::V_CMP_LE_U32;
-  case arith::CmpIPredicate::ugt:
-    return OpCode::V_CMP_GT_U32;
-  case arith::CmpIPredicate::uge:
-    return OpCode::V_CMP_GE_U32;
-  }
-  llvm_unreachable("Unknown CmpIPredicate");
-}
-
-/// Swap a comparison predicate (a < b becomes b > a).
-static arith::CmpIPredicate swapPredicate(arith::CmpIPredicate pred) { - switch (pred) { - case arith::CmpIPredicate::eq: - return arith::CmpIPredicate::eq; - case arith::CmpIPredicate::ne: - return arith::CmpIPredicate::ne; - case arith::CmpIPredicate::slt: - return arith::CmpIPredicate::sgt; - case arith::CmpIPredicate::sle: - return arith::CmpIPredicate::sge; - case arith::CmpIPredicate::sgt: - return arith::CmpIPredicate::slt; - case arith::CmpIPredicate::sge: - return arith::CmpIPredicate::sle; - case arith::CmpIPredicate::ult: - return arith::CmpIPredicate::ugt; - case arith::CmpIPredicate::ule: - return arith::CmpIPredicate::uge; - case arith::CmpIPredicate::ugt: - return arith::CmpIPredicate::ult; - case arith::CmpIPredicate::uge: - return arith::CmpIPredicate::ule; - } - llvm_unreachable("Unknown CmpIPredicate"); -} - -/// Returns true if either operand of the compare is a VGPR. -static bool hasVGPROperand(lsir::CmpIOp cmpOp) { - return isa(cmpOp.getLhs().getType()) || - isa(cmpOp.getRhs().getType()); -} - -Value LegalizeCF::getOrCreateLoweredCmp(lsir::CmpIOp cmpOp, - IRRewriter &rewriter) { - auto it = loweredCmpMap.find(cmpOp); - if (it != loweredCmpMap.end()) - return it->second; - - // Create the lowered compare at the original lsir.cmpi location. - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(cmpOp); - Location loc = cmpOp.getLoc(); + /// Lower lsir.cond_br to AMDGCN scalar/vector branch instructions. + /// The condition is a register type (SCC or VCC) directly. + LogicalResult lowerCondBranch(lsir::CondBranchOp condBr); - bool isVector = hasVGPROperand(cmpOp); - if (isVector) { - // Vector compare: v_cmp_* writes to VCC. - // The 32-bit VOPC encoding requires src1 (rhs) to be a VGPR. - // If rhs is not a VGPR, swap operands and flip the predicate. - Value lhs = cmpOp.getLhs(); - Value rhs = cmpOp.getRhs(); - arith::CmpIPredicate pred = cmpOp.getPredicate(); - if (!isa(rhs.getType())) { - assert(isa(lhs.getType()) && - "at least one operand must be a VGPR for vector compare"); - std::swap(lhs, rhs); - pred = swapPredicate(pred); - } - Type vccType = VCCType::get(rewriter.getContext()); - Value vcc = AllocaOp::create(rewriter, loc, vccType); - OpCode cmpOpCode = getVectorCompareOpCode(pred); - amdgcn::CmpIOp::create(rewriter, loc, cmpOpCode, vcc, lhs, rhs); - loweredCmpMap[cmpOp] = vcc; - return vcc; - } - - // Scalar compare: s_cmp_* writes to SCC. - Type sccType = SCCType::get(rewriter.getContext()); - Value scc = AllocaOp::create(rewriter, loc, sccType); - OpCode cmpOpCode = getScalarCompareOpCode(cmpOp.getPredicate()); - amdgcn::CmpIOp::create(rewriter, loc, cmpOpCode, scc, cmpOp.getLhs(), - cmpOp.getRhs()); + /// Lower lsir.br to s_branch. + LogicalResult lowerBranch(lsir::BranchOp br); - loweredCmpMap[cmpOp] = scc; - return scc; -} + /// Lower lsir.select with a register condition (SCC or VCC). + LogicalResult lowerSelect(lsir::SelectOp selectOp); +}; -LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { - // Get the condition - must come from lsir.cmpi - Value condition = condBr.getCondition(); - auto cmpOp = condition.getDefiningOp(); - if (!cmpOp) { - return condBr.emitError() - << "cf.cond_br condition must come from lsir.cmpi for legalization"; - } +LogicalResult LegalizeCF::lowerCondBranch(lsir::CondBranchOp condBr) { + // The condition is a register type (SCC or VCC) directly. 
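+  // An SCC condition selects the s_cbranch_scc0/scc1 opcodes below; a VCC
+  // condition selects s_cbranch_vccz/vccnz.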
+ Value flagReg = condBr.getCondition(); + bool isVector = isa(flagReg.getType()); // Note: We just drop block arguments as they are allocated and all values // flow through side effects. @@ -232,9 +77,9 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { {condBr.getTrueDestOperands(), condBr.getFalseDestOperands()}) { for (Value operand : brOpRange) { Type type = operand.getType(); - if (!isa(type)) { + if (!isa(type)) { return condBr.emitError() - << "cf.br operand must have an allocated register type"; + << "lsir.cond_br operand must have an allocated register type"; } } } @@ -242,9 +87,6 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { IRRewriter rewriter(condBr); rewriter.setInsertionPoint(condBr); - Value flagReg = getOrCreateLoweredCmp(cmpOp, rewriter); - bool isVector = isa(flagReg.getType()); - // Create conditional branch based on which destination is the next physical // block. The fallthrough target must be the next block. Location loc = condBr.getLoc(); @@ -253,21 +95,24 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { Block *currentBlock = condBr->getBlock(); Block *nextBlock = currentBlock->getNextNode(); - // Select branch opcodes based on whether the compare wrote SCC or VCC. - OpCode branchTrue = + // lsir.cond_br branches to trueDest when the condition register is nonzero. + // Select branch opcodes based on whether the flag register is SCC or VCC. + OpCode branchIfTrue = isVector ? OpCode::S_CBRANCH_VCCNZ : OpCode::S_CBRANCH_SCC1; - OpCode branchFalse = + OpCode branchIfFalse = isVector ? OpCode::S_CBRANCH_VCCZ : OpCode::S_CBRANCH_SCC0; // amdgcn::CBranchOp takes a label; later, the actual 16-bit PC-relative // offset is computed by the LLVM assembler (MC layer) when it assembles this // text into binary machine code. This is happening outside of aster. if (falseDest == nextBlock) { - // Branch to trueDest if flag set, fallthrough to falseDest - CBranchOp::create(rewriter, loc, branchTrue, flagReg, trueDest, falseDest); + // Branch to trueDest if condition true, fallthrough to falseDest. + CBranchOp::create(rewriter, loc, branchIfTrue, flagReg, trueDest, + falseDest); } else if (trueDest == nextBlock) { - // Branch to falseDest if flag clear, fallthrough to trueDest - CBranchOp::create(rewriter, loc, branchFalse, flagReg, falseDest, trueDest); + // Branch to falseDest if condition false, fallthrough to trueDest. + CBranchOp::create(rewriter, loc, branchIfFalse, flagReg, falseDest, + trueDest); } else { // TODO: neither destination is the next block, we need more sophisticated // logic to insert explicit branch and create a new block. For this to @@ -286,16 +131,16 @@ LogicalResult LegalizeCF::lowerCondBranch(cf::CondBranchOp condBr) { return success(); } -LogicalResult LegalizeCF::lowerBranch(cf::BranchOp br) { +LogicalResult LegalizeCF::lowerBranch(lsir::BranchOp br) { // Note: We just drop block arguments as they are allocated and all values // flow through side effects. // TODO: In the future, this is better done as a RA legalization once we have // a side-effecting representation of instructions without return values. 
for (Value operand : br.getDestOperands()) { Type type = operand.getType(); - if (!isa(type)) { + if (!isa(type)) { return br.emitError() - << "cf.br operand must have an allocated register type"; + << "lsir.br operand must have an allocated register type"; } } @@ -313,26 +158,14 @@ LogicalResult LegalizeCF::lowerBranch(cf::BranchOp br) { } LogicalResult LegalizeCF::lowerSelect(lsir::SelectOp selectOp) { - Value condition = selectOp.getCondition(); - // Only handle i1-conditioned selects (from lsir.cmpi). - // Register-conditioned selects are handled elsewhere. - if (!condition.getType().isInteger(1)) - return success(); - - auto cmpOp = condition.getDefiningOp(); - if (!cmpOp) { - return selectOp.emitError() - << "lsir.select with i1 condition must come from lsir.cmpi"; - } + // The condition is always a register type (SCC or VCC) after codegen. + Value flagReg = selectOp.getCondition(); + bool isVector = isa(flagReg.getType()); Location loc = selectOp.getLoc(); IRRewriter rewriter(selectOp); rewriter.setInsertionPoint(selectOp); - // Ensure the compare is lowered. - Value flagReg = getOrCreateLoweredCmp(cmpOp, rewriter); - bool isVector = isa(flagReg.getType()); - Value dst = selectOp.getDst(); if (isVector) { // v_cndmask_b32: vdst = VCC[lane] ? src1 : src0 (note: reversed order!) @@ -366,75 +199,6 @@ LogicalResult LegalizeCF::lowerSelect(lsir::SelectOp selectOp) { return success(); } -/// Find the last user of `value` in `block`, by operation order. -/// Returns nullptr if no user exists in the block. -static Operation *findLastUserInBlock(Value value, Block *block) { - Operation *lastUser = nullptr; - for (Operation *user : value.getUsers()) { - if (user->getBlock() != block) - continue; - if (!lastUser || lastUser->isBeforeInBlock(user)) - lastUser = user; - } - return lastUser; -} - -LogicalResult LegalizeCF::verifyI1Lifetimes(Operation *op) { - LogicalResult result = success(); - - op->walk([&](Block *block) { - // Track the currently-live i1 value and where its lifetime ends. - Operation *activeI1Op = nullptr; - Operation *activeI1OpLastUserOp = nullptr; - - for (Operation &innerOp : *block) { - Value i1; - if (auto cmpOp = dyn_cast(&innerOp)) - i1 = cmpOp.getResult(); - else if (auto cmpOp = dyn_cast(&innerOp)) - i1 = cmpOp.getResult(); - else - continue; - - // Check cross-block usage: all users of this cmpi must be in the same - // block. SCC/VCC are not preserved across block boundaries. - for (Operation *user : i1.getUsers()) { - if (user->getBlock() != block) { - innerOp.emitError() - << "has consumer in a different block; flag register (SCC/VCC) " - "is not preserved across block boundaries"; - result = failure(); - return WalkResult::interrupt(); - } - } - - // Check overlap: any cmpi (even dead ones) clobbers the flag register, - // so if a previous i1 is still live, this is an error. - if (activeI1Op && activeI1OpLastUserOp && - !activeI1OpLastUserOp->isBeforeInBlock(&innerOp)) { - innerOp.emitError() - << "would clobber flag register from earlier compare; i1 " - "lifetimes must not overlap"; - result = failure(); - return WalkResult::interrupt(); - } - - // Dead cmpi (no users) is benign for tracking purposes -- it clobbers - // SCC but has no consumers that could be affected by a future clobber. - // Don't update activeI1 so it doesn't block subsequent live cmpi ops. - if (i1.use_empty()) - continue; - - // Start tracking this cmpi's lifetime. 
-      activeI1Op = &innerOp;
-      activeI1OpLastUserOp = findLastUserInBlock(i1, block);
-    }
-    return WalkResult::advance();
-  });
-
-  return result;
-}
-
 void LegalizeCF::runOnOperation() {
   Operation *op = getOperation();
 
@@ -448,58 +212,20 @@ void LegalizeCF::runOnOperation() {
   }
 
-  // Precondition: verify i1 lifetimes are non-overlapping and block-local.
-  // SCC/VCC are implicit flag registers with no spill capability, so
-  // overlapping lifetimes or cross-block usage would produce silent
-  // miscompilation.
-  if (failed(verifyI1Lifetimes(op))) {
-    signalPassFailure();
-    return;
-  }
-
-  // Construct allocated register to alloca map.
-  // After canonicalize + CSE (run in the backend pipeline before this pass),
-  // there must be exactly one alloca per concrete register type. CSE
-  // deduplicates allocas of the same type since allocated allocas are Pure.
-  DenseMap<RegisterTypeInterface, Value> allocatedRegisterToAllocaMap;
-  bool hasDuplicates = false;
-  op->walk([&](AllocaOp alloca) {
-    auto regType = cast<RegisterTypeInterface>(alloca.getType());
-    if (regType.isRelocatable()) {
-      alloca.emitOpError("alloca must have a fixed register type "
-                         "(register coloring must run before LegalizeCF)");
-      hasDuplicates = true;
-      return;
-    }
-    auto [it, inserted] =
-        allocatedRegisterToAllocaMap.try_emplace(regType, alloca);
-    if (!inserted) {
-      alloca.emitOpError("duplicate alloca for register type ")
-          << alloca.getType()
-          << " (canonicalize + CSE should deduplicate allocated allocas)";
-      hasDuplicates = true;
-    }
-  });
-  if (hasDuplicates) {
-    signalPassFailure();
-    return;
-  }
-
   // Collect all operations to lower.
   SmallVector<lsir::SelectOp> selects;
-  SmallVector<cf::CondBranchOp> condBranches;
-  SmallVector<cf::BranchOp> branches;
+  SmallVector<lsir::CondBranchOp> condBranches;
+  SmallVector<lsir::BranchOp> branches;
   op->walk([&](Operation *innerOp) {
     if (auto selectOp = dyn_cast<lsir::SelectOp>(innerOp))
      selects.push_back(selectOp);
-    else if (auto condBr = dyn_cast<cf::CondBranchOp>(innerOp))
+    else if (auto condBr = dyn_cast<lsir::CondBranchOp>(innerOp))
       condBranches.push_back(condBr);
-    else if (auto br = dyn_cast<cf::BranchOp>(innerOp))
+    else if (auto br = dyn_cast<lsir::BranchOp>(innerOp))
       branches.push_back(br);
   });
 
-  // Lower i1-conditioned selects first (they reference lsir.cmpi which may
-  // also be used by cond_br).
+  // Lower register-conditioned selects (SCC or VCC conditions).
   for (lsir::SelectOp selectOp : selects) {
    if (failed(lowerSelect(selectOp))) {
       signalPassFailure();
@@ -507,137 +233,22 @@ void LegalizeCF::runOnOperation() {
     }
   }
 
-  // Lower conditional branches (they may reference lsir.cmpi)
-  for (cf::CondBranchOp condBr : condBranches) {
+  // Lower conditional branches.
+  for (lsir::CondBranchOp condBr : condBranches) {
     if (failed(lowerCondBranch(condBr))) {
       signalPassFailure();
       return;
     }
   }
 
-  // Lower unconditional branches
-  for (cf::BranchOp br : branches) {
+  // Lower unconditional branches.
+  for (lsir::BranchOp br : branches) {
     if (failed(lowerBranch(br))) {
       signalPassFailure();
       return;
     }
   }
 
-  // Erase original lsir.cmpi ops that were lowered. Collect first, then
-  // clear the map before erasing to avoid dangling pointers during iteration.
-  SmallVector<Operation *> cmpsToErase;
-  for (auto &[cmpOp, scc] : loweredCmpMap) {
-    assert(cmpOp->use_empty() &&
-           "lsir.cmpi still has uses after all consumers lowered");
-    cmpsToErase.push_back(cmpOp);
-  }
-  loweredCmpMap.clear();
-  for (Operation *cmpOp : cmpsToErase)
-    cmpOp->erase();
-
-  // Iterate all blocks in all regions of the function and replace block
-  // arguments with the corresponding alloca.
- // - // For register range block arguments, we decompose them to individual - // registers since ranges are composite constructs without their own allocas. - // Each range block arg is replaced by reconstructing the range from its - // constituent allocas using make_register_range at the block entry. - // - // This is a simple way of legalizing block arguments, late in the pipeline. - // - // Note and caveat: taking the alloc is fine because at this point values do - // not flow through SSA values anymore, except i1 cf.cond_br conditions. - // While this is correct, it is easily confusing since SSA and side-effects - // are mixed in the same representation. - // - // TODO: In the very short future, this is better done as a RA legalization - // once we have a side-effecting representation of instructions without return - // values. - op->walk([&](Block *block) { - IRRewriter rewriter(op->getContext()); - - // Drop all block arguments, if any. - for (int i = block->getNumArguments() - 1; i >= 0; --i) { - // Always erase index i; indices shift after each erase. - BlockArgument arg = block->getArgument(i); - RegisterTypeInterface regType = - cast(arg.getType()); - - // Simple case: non-range register type - if (!regType.isRegisterRange()) { - auto it = allocatedRegisterToAllocaMap.find(regType); - if (it == allocatedRegisterToAllocaMap.end()) { - block->getParentOp()->emitError() - << "Alloca not found for register type " << regType; - signalPassFailure(); - return WalkResult::interrupt(); - } - arg.replaceAllUsesWith(it->second); - block->eraseArgument(i); - continue; - } - - // Complex case: register range type - decompose to constituents - RegisterRange range = regType.getAsRange(); - Register beginReg = range.begin(); - int16_t rangeSize = range.size(); - - if (beginReg.isRelocatable()) { - block->getParentOp()->emitError() - << "Cannot legalize relocatable register range block argument"; - signalPassFailure(); - return WalkResult::interrupt(); - } - - // Collect allocas for all constituent registers - SmallVector constituentAllocas; - constituentAllocas.reserve(rangeSize); - - auto rangeRegType = cast(regType); - RegisterKind regKind = rangeRegType.getRegisterKind(); - - for (int16_t offset = 0; offset < rangeSize; ++offset) { - Register reg = beginReg.getWithOffset(offset); - - RegisterTypeInterface constituentType; - MLIRContext *ctx = block->getParentOp()->getContext(); - switch (regKind) { - case RegisterKind::SGPR: - constituentType = SGPRType::get(ctx, reg); - break; - case RegisterKind::VGPR: - constituentType = VGPRType::get(ctx, reg); - break; - case RegisterKind::AGPR: - constituentType = AGPRType::get(ctx, reg); - break; - default: - block->getParentOp()->emitError() - << "Unsupported register kind for range block argument"; - signalPassFailure(); - return WalkResult::interrupt(); - } - - auto it = allocatedRegisterToAllocaMap.find(constituentType); - if (it == allocatedRegisterToAllocaMap.end()) { - block->getParentOp()->emitError() - << "Alloca not found for constituent register " << constituentType - << " in range " << regType; - signalPassFailure(); - return WalkResult::interrupt(); - } - constituentAllocas.push_back(it->second); - } - - rewriter.setInsertionPointToStart(block); - Value reconstructedRange = MakeRegisterRangeOp::create( - rewriter, arg.getLoc(), constituentAllocas); - arg.replaceAllUsesWith(reconstructedRange); - block->eraseArgument(i); - } - return WalkResult::advance(); - }); - // Set post-condition: no CF branches remain. 
if (auto kernelOp = dyn_cast(op)) kernelOp.addNormalForms({NoCfBranchesAttr::get(op->getContext())}); diff --git a/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp b/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp index 7354a08b1..53a944b83 100644 --- a/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp +++ b/lib/Dialect/AMDGCN/Transforms/Mem2Reg.cpp @@ -63,8 +63,8 @@ void Mem2Reg::runOnOperation() { SmallVector allocas; RegisterRange range = regType.getAsRange(); for (int16_t i = 0; i < range.size(); ++i) { - Register reg = regType.isRelocatable() - ? Register() + Register reg = !regType.hasAllocatedSemantics() + ? range.begin() : Register(range.begin().getRegister() + i); allocas.push_back(amdgcn::AllocaOp::create( rewriter, pOp.getLoc(), getRegisterType(regType, reg))); diff --git a/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp b/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp index d5cf9e5ae..8fc5ee2cc 100644 --- a/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp +++ b/lib/Dialect/AMDGCN/Transforms/ToAMDGCNPatterns.cpp @@ -296,6 +296,16 @@ struct WaitOpPattern : public OpRewritePattern { PatternRewriter &rewriter) const override; }; +//===----------------------------------------------------------------------===// +// CmpIOpPattern +//===----------------------------------------------------------------------===// + +struct CmpIOpPattern : public OpRewritePattern { + using Base::Base; + LogicalResult matchAndRewrite(lsir::CmpIOp op, + PatternRewriter &rewriter) const override; +}; + //===----------------------------------------------------------------------===// // PtrAddOpPattern //===----------------------------------------------------------------------===// @@ -2110,6 +2120,131 @@ PtrAddOpPattern::matchAndRewrite(PtrAddOp op, PatternRewriter &rewriter) const { return success(); } +//===----------------------------------------------------------------------===// +// CmpIOpPattern helpers +//===----------------------------------------------------------------------===// + +/// Map arith::CmpIPredicate to the appropriate s_cmp_* opcode (scalar). +static OpCode getScalarCompareOpCode(arith::CmpIPredicate predicate) { + switch (predicate) { + case arith::CmpIPredicate::eq: + return OpCode::S_CMP_EQ_I32; + case arith::CmpIPredicate::ne: + return OpCode::S_CMP_LG_I32; + case arith::CmpIPredicate::slt: + return OpCode::S_CMP_LT_I32; + case arith::CmpIPredicate::sle: + return OpCode::S_CMP_LE_I32; + case arith::CmpIPredicate::sgt: + return OpCode::S_CMP_GT_I32; + case arith::CmpIPredicate::sge: + return OpCode::S_CMP_GE_I32; + case arith::CmpIPredicate::ult: + return OpCode::S_CMP_LT_U32; + case arith::CmpIPredicate::ule: + return OpCode::S_CMP_LE_U32; + case arith::CmpIPredicate::ugt: + return OpCode::S_CMP_GT_U32; + case arith::CmpIPredicate::uge: + return OpCode::S_CMP_GE_U32; + } + llvm_unreachable("unknown CmpIPredicate"); +} + +/// Map arith::CmpIPredicate to the appropriate v_cmp_* opcode (vector, 32-bit +/// encoding). The 32-bit VOPC encoding requires rhs (src1) to be a VGPR. If +/// operands need swapping, the predicate should be flipped first. 
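+/// For example, `slt %vgpr, %sgpr` is rewritten by the caller to
+/// `sgt %sgpr, %vgpr` before this mapping, yielding V_CMP_GT_I32.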
+static OpCode getVectorCompareOpCode(arith::CmpIPredicate predicate) { + switch (predicate) { + case arith::CmpIPredicate::eq: + return OpCode::V_CMP_EQ_I32; + case arith::CmpIPredicate::ne: + return OpCode::V_CMP_NE_I32; + case arith::CmpIPredicate::slt: + return OpCode::V_CMP_LT_I32; + case arith::CmpIPredicate::sle: + return OpCode::V_CMP_LE_I32; + case arith::CmpIPredicate::sgt: + return OpCode::V_CMP_GT_I32; + case arith::CmpIPredicate::sge: + return OpCode::V_CMP_GE_I32; + case arith::CmpIPredicate::ult: + return OpCode::V_CMP_LT_U32; + case arith::CmpIPredicate::ule: + return OpCode::V_CMP_LE_U32; + case arith::CmpIPredicate::ugt: + return OpCode::V_CMP_GT_U32; + case arith::CmpIPredicate::uge: + return OpCode::V_CMP_GE_U32; + } + llvm_unreachable("unknown CmpIPredicate"); +} + +/// Swap a comparison predicate (a op b becomes b swapped_op a). +static arith::CmpIPredicate swapPredicate(arith::CmpIPredicate pred) { + switch (pred) { + case arith::CmpIPredicate::eq: + return arith::CmpIPredicate::eq; + case arith::CmpIPredicate::ne: + return arith::CmpIPredicate::ne; + case arith::CmpIPredicate::slt: + return arith::CmpIPredicate::sgt; + case arith::CmpIPredicate::sle: + return arith::CmpIPredicate::sge; + case arith::CmpIPredicate::sgt: + return arith::CmpIPredicate::slt; + case arith::CmpIPredicate::sge: + return arith::CmpIPredicate::sle; + case arith::CmpIPredicate::ult: + return arith::CmpIPredicate::ugt; + case arith::CmpIPredicate::ule: + return arith::CmpIPredicate::uge; + case arith::CmpIPredicate::ugt: + return arith::CmpIPredicate::ult; + case arith::CmpIPredicate::uge: + return arith::CmpIPredicate::ule; + } + llvm_unreachable("unknown CmpIPredicate"); +} + +//===----------------------------------------------------------------------===// +// CmpIOpPattern +//===----------------------------------------------------------------------===// + +LogicalResult CmpIOpPattern::matchAndRewrite(lsir::CmpIOp op, + PatternRewriter &rewriter) const { + Value dst = op.getDst(); + Value lhs = op.getLhs(); + Value rhs = op.getRhs(); + arith::CmpIPredicate pred = op.getPredicate(); + Location loc = op.getLoc(); + + if (isa(dst.getType())) { + // Vector compare: v_cmp_* writes to VCC. The 32-bit VOPC encoding requires + // src1 (rhs) to be a VGPR. If rhs is not a VGPR, swap operands and flip + // the predicate. + if (!isa(rhs.getType())) { + assert(isa(lhs.getType()) && + "at least one operand must be a VGPR for vector compare"); + std::swap(lhs, rhs); + pred = swapPredicate(pred); + } + Value result = + amdgcn::CmpIOp::create(rewriter, loc, getVectorCompareOpCode(pred), dst, + lhs, rhs) + .getDestRes(); + rewriter.replaceOp(op, result); + return success(); + } + + // Scalar compare: s_cmp_* writes to SCC. + Value result = amdgcn::CmpIOp::create( + rewriter, loc, getScalarCompareOpCode(pred), dst, lhs, rhs) + .getDestRes(); + rewriter.replaceOp(op, result); + return success(); +} + //===----------------------------------------------------------------------===// // ToAMDGCNPass patterns //===----------------------------------------------------------------------===// @@ -2117,11 +2252,11 @@ PtrAddOpPattern::matchAndRewrite(PtrAddOp op, PatternRewriter &rewriter) const { void mlir::aster::amdgcn::populateToAMDGCNPatterns( RewritePatternSet &patterns) { patterns.add< // Arithmetic ops. 
- AddFOpPattern, AddIOpPattern, AndIOpPattern, ExtSIOpPattern, - ExtUIOpPattern, MaximumFOpPattern, MinimumFOpPattern, MulFOpPattern, - MulIOpPattern, MulHiSIOpPattern, OrIOpPattern, ShLIOpPattern, - ShRSIOpPattern, ShRUIOpPattern, SubFOpPattern, SubIOpPattern, - XOrIOpPattern, + AddFOpPattern, AddIOpPattern, AndIOpPattern, CmpIOpPattern, + ExtSIOpPattern, ExtUIOpPattern, MaximumFOpPattern, MinimumFOpPattern, + MulFOpPattern, MulIOpPattern, MulHiSIOpPattern, OrIOpPattern, + ShLIOpPattern, ShRSIOpPattern, ShRUIOpPattern, SubFOpPattern, + SubIOpPattern, XOrIOpPattern, // Memory ops. AllocaOpPattern, AssumeNoaliasOpPattern, LoadOpPattern, StoreOpPattern, // Data movement ops. diff --git a/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp b/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp index a2733424d..fb0e2be48 100644 --- a/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp +++ b/lib/Dialect/AMDGCN/Transforms/ToRegisterSemantics.cpp @@ -396,7 +396,8 @@ void ToRegisterSemantics::runOnOperation() { RewritePatternSet patterns(ctx); patterns .add, + SplitRegisterRangePattern, GenericOpPattern, + GenericOpPattern, GenericOpPattern, DeallocCastOpPattern>( ctx); if (failed(applyPatternsGreedily( diff --git a/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp b/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp index 01d45a449..b7e43179e 100644 --- a/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp +++ b/lib/Dialect/LSIR/CodeGen/CodeGenPatterns.cpp @@ -16,6 +16,8 @@ #include "aster/CodeGen/CodeGen.h" #include "aster/Dialect/AMDGCN/IR/AMDGCNOps.h" +#include "aster/Dialect/AMDGCN/IR/AMDGCNTypes.h" +#include "aster/Dialect/AMDGCN/IR/Interfaces/AMDGCNRegisterTypeInterface.h" #include "aster/Dialect/AsterUtils/IR/AsterUtilsOps.h" #include "aster/Dialect/LSIR/IR/LSIRDialect.h" #include "aster/Dialect/LSIR/IR/LSIROps.h" @@ -128,12 +130,20 @@ struct ArithMinMaxOpPattern : public OpCodeGenPattern { Location loc = op.getLoc(); Value lhs = adaptor.getLhs(), rhs = adaptor.getRhs(); // Lower to lsir.cmpi + lsir.select directly (skip arith intermediates). + // lsir.cmpi uses DPS: determine the compare dst type (SCC for scalar ops, + // VCC for vector ops) from the lhs register kind. Type regType = this->converter.convertType(op); Value dst = this->createAlloca(rewriter, loc, regType); + Type cmpDstType = + isa(lhs.getType()) + ? 
Type(amdgcn::VCCType::get(rewriter.getContext(), Register())) + : Type(amdgcn::SCCType::get(rewriter.getContext(), Register())); + Value cmpDst = this->createAlloca(rewriter, loc, cmpDstType); Value cmp = lsir::CmpIOp::create( - rewriter, loc, rewriter.getI1Type(), - TypeAttr::get(op.getLhs().getType()), - arith::CmpIPredicateAttr::get(rewriter.getContext(), pred), lhs, rhs); + rewriter, loc, TypeAttr::get(op.getLhs().getType()), + arith::CmpIPredicateAttr::get(rewriter.getContext(), pred), + cmpDst, lhs, rhs) + .getDstRes(); rewriter.replaceOpWithNewOp(op, dst, cmp, lhs, rhs); return success(); } @@ -271,10 +281,12 @@ LogicalResult ArithCmpIOpPattern::matchAndRewrite(arith::CmpIOp op, arith::CmpIOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const { - // lsir.cmpi returns i1 directly, operands are converted to registers - rewriter.replaceOpWithNewOp( - op, rewriter.getI1Type(), TypeAttr::get(op.getLhs().getType()), - op.getPredicateAttr(), adaptor.getLhs(), adaptor.getRhs()); + Type dstType = converter.convertType(op.getResult()); + Value dst = createAlloca(rewriter, op.getLoc(), dstType); + auto cmpOp = lsir::CmpIOp::create( + rewriter, op.getLoc(), TypeAttr::get(op.getLhs().getType()), + op.getPredicateAttr(), dst, adaptor.getLhs(), adaptor.getRhs()); + rewriter.replaceOp(op, cmpOp); return success(); } @@ -286,10 +298,12 @@ LogicalResult ArithCmpFOpPattern::matchAndRewrite(arith::CmpFOp op, arith::CmpFOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const { - // lsir.cmpf returns i1 directly, operands are converted to registers - rewriter.replaceOpWithNewOp( - op, rewriter.getI1Type(), TypeAttr::get(op.getLhs().getType()), - op.getPredicateAttr(), adaptor.getLhs(), adaptor.getRhs()); + Type dstType = amdgcn::VCCType::get(op.getContext(), Register()); + Value dst = createAlloca(rewriter, op.getLoc(), dstType); + auto cmpOp = lsir::CmpFOp::create( + rewriter, op.getLoc(), TypeAttr::get(op.getLhs().getType()), + op.getPredicateAttr(), dst, adaptor.getLhs(), adaptor.getRhs()); + rewriter.replaceOp(op, cmpOp); return success(); } @@ -337,7 +351,13 @@ LogicalResult CFCondBranchOpPattern::matchAndRewrite( ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - // Convert operands to match the expected converted block argument types + // The condition must already be a register type (from lsir.cmpi). If it is + // still i1 (e.g. from lsir.cmpf), we cannot convert this op. + Value cond = adaptor.getCondition(); + if (!isa(cond.getType())) + return failure(); + + // Convert operands to match the expected converted block argument types. SmallVector trueOperands = convertBranchOperands(adaptor.getTrueDestOperands(), op.getTrueDest(), *getTypeConverter(), rewriter, loc); @@ -345,8 +365,8 @@ LogicalResult CFCondBranchOpPattern::matchAndRewrite( convertBranchOperands(adaptor.getFalseDestOperands(), op.getFalseDest(), *getTypeConverter(), rewriter, loc); - rewriter.replaceOpWithNewOp( - op, op.getCondition(), op.getTrueDest(), trueOperands, op.getFalseDest(), + rewriter.replaceOpWithNewOp( + op, cond, op.getTrueDest(), trueOperands, op.getFalseDest(), falseOperands); return success(); } @@ -361,12 +381,12 @@ CFBranchOpPattern::matchAndRewrite(cf::BranchOp op, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - // Convert operands to match the expected converted block argument types + // Convert operands to match the expected converted block argument types. 
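+  // Illustrative sketch (names are hypothetical, not from this patch):
+  // `cf.br ^bb1(%x : i32)` whose operand converts to !amdgcn.sgpr is rebuilt
+  // as `lsir.br ^bb1(%x_reg : !amdgcn.sgpr)` so the successor argument types
+  // line up.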
SmallVector destOperands = convertBranchOperands(adaptor.getDestOperands(), op.getDest(), *getTypeConverter(), rewriter, loc); - rewriter.replaceOpWithNewOp(op, op.getDest(), destOperands); + rewriter.replaceOpWithNewOp(op, op.getDest(), destOperands); return success(); } @@ -395,16 +415,13 @@ KernelOpConversion::matchAndRewrite(amdgcn::KernelOp op, LogicalResult ArithSelectOpPattern::matchAndRewrite( arith::SelectOp op, arith::SelectOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const { + // lsir.select requires a register condition. If the condition is still i1 + // (e.g. from lsir.cmpf), we cannot convert this op. + Value cond = adaptor.getCondition(); + if (!isa(cond.getType())) + return failure(); Type type = this->converter.convertType(op); Value dst = this->createAlloca(rewriter, op.getLoc(), type); - // If the original condition comes from lsir.cmpi/cmpf (i1 result), use the - // original value to avoid the type converter wrapping it in a cast. - // For block-argument i1 conditions, the type converter correctly maps them - // to register types, so we use the adapted value. - Value cond = op.getCondition().getDefiningOp() || - op.getCondition().getDefiningOp() - ? op.getCondition() - : adaptor.getCondition(); rewriter.replaceOpWithNewOp( op, dst, cond, adaptor.getTrueValue(), adaptor.getFalseValue()); return success(); @@ -454,40 +471,24 @@ void mlir::aster::lsir::populateCodeGenPatterns(CodeGenConverter &converter, lsir::FromRegOp, lsir::ToRegOp, lsir::RegConstraintOp>(); target.addLegalOp(); - // arith.cmpi/cmpf are always converted to lsir counterparts. - // They return i1 but their operands are converted to register types. + // arith.cmpi is converted to lsir.cmpi (DPS, returns SCC/VCC register). + // arith.cmpf is converted to lsir.cmpf (returns i1 for now). target.addIllegalOp(); - // Helper to check if operands are legal for CF ops. Operands are legal if - // they are register types OR if they come from constants (which stay scalar). - auto cfOperandsLegal = [&](ValueRange operands) { - return llvm::all_of(operands, [&](Value v) { - Type t = v.getType(); - // Register types are legal - if (isa(t)) - return true; - return false; - }); - }; - - // CF dialect ops are dynamically legal when their branch operands are either - // register types or constants. The condition stays as i1. - target.addDynamicallyLegalOp([&](cf::CondBranchOp op) { - return cfOperandsLegal(op.getTrueDestOperands()) && - cfOperandsLegal(op.getFalseDestOperands()); - }); - target.addDynamicallyLegalOp( - [&](cf::BranchOp op) { return cfOperandsLegal(op.getDestOperands()); }); + // CF dialect branch ops are always illegal — they must be replaced by the + // corresponding lsir.br / lsir.cond_br ops that carry register conditions. + target.addIllegalOp(); // KernelOp is dynamically legal - it becomes legal once the // KernelOpConversion pattern has converted all block argument types. // Start as illegal to ensure the pattern runs. target.addDynamicallyLegalOp([&](amdgcn::KernelOp op) { - // Check if any block in the body has non-register, non-i1 arguments + // Check if any block in the body has non-register arguments. Token types + // are always legal. 
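+    // For example (illustrative): block arguments of register type such as
+    // !amdgcn.vgpr, or token arguments, keep the kernel legal; a leftover i1
+    // argument marks it illegal so the KernelOpConversion pattern runs.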
for (Block &block : op.getBodyRegion()) { for (BlockArgument arg : block.getArguments()) { Type t = arg.getType(); - if (!isa(t) && !t.isInteger(1)) + if (!isa(t)) return false; } } @@ -538,10 +539,9 @@ void mlir::aster::lsir::populateCodeGenPatterns(CodeGenConverter &converter, // These patterns go together for proper composable control-flow // support. CF patterns need the type converter to handle block // argument conversion. KernelOp conversion handles block - // argument types in kernel bodies. Cmp ops are converted to lsir - // counterparts returning i1, which persists late in the pipeline - // and is only translated to SCC after register allocation, - // together with cf branch operations. + // argument types in kernel bodies. arith.cmpi is converted to + // lsir.cmpi (DPS, SCC/VCC dst); cf.br/cf.cond_br are replaced + // by lsir.br/lsir.cond_br that carry register conditions. ArithCmpIOpPattern, ArithCmpFOpPattern, CFCondBranchOpPattern, CFBranchOpPattern, KernelOpConversion, AssumeUniformOpPattern // That's all folks! diff --git a/lib/Dialect/LSIR/IR/LSIROps.cpp b/lib/Dialect/LSIR/IR/LSIROps.cpp index 917028577..6395ea5af 100644 --- a/lib/Dialect/LSIR/IR/LSIROps.cpp +++ b/lib/Dialect/LSIR/IR/LSIROps.cpp @@ -18,6 +18,7 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/InliningUtils.h" @@ -198,6 +199,35 @@ LogicalResult RegCastOp::canonicalize(RegCastOp op, return failure(); } +//===----------------------------------------------------------------------===// +// LSIR BranchOp +//===----------------------------------------------------------------------===// + +SuccessorOperands BranchOp::getSuccessorOperands(unsigned index) { + assert(index == 0 && "invalid successor index"); + return SuccessorOperands(getDestOperandsMutable()); +} + +Block *BranchOp::getSuccessorForOperands(ArrayRef) { + return getDest(); +} + +//===----------------------------------------------------------------------===// +// LSIR CondBranchOp +//===----------------------------------------------------------------------===// + +SuccessorOperands CondBranchOp::getSuccessorOperands(unsigned index) { + assert(index < 2 && "invalid successor index"); + return SuccessorOperands(index == 0 ? getTrueDestOperandsMutable() + : getFalseDestOperandsMutable()); +} + +Block *CondBranchOp::getSuccessorForOperands(ArrayRef operands) { + if (auto condAttr = dyn_cast_or_null(operands.front())) + return condAttr.getValue().isOne() ? 
getTrueDest() : getFalseDest(); + return nullptr; +} + //===----------------------------------------------------------------------===// // LSIR IncGen //===----------------------------------------------------------------------===// diff --git a/lib/Target/ASM/TranslateModule.cpp b/lib/Target/ASM/TranslateModule.cpp index 66823b2b0..04c37115f 100644 --- a/lib/Target/ASM/TranslateModule.cpp +++ b/lib/Target/ASM/TranslateModule.cpp @@ -298,8 +298,8 @@ FailureOr RegisterUsage::countKernelRegisters(KernelOp kernel) { DenseSet usedAGPRs; auto result = kernel.walk([&](AllocaOp op) -> WalkResult { AMDGCNRegisterTypeInterface type = op.getType(); - if (type.isRelocatable()) { - op->emitError() << "expected non-relocatable registers"; + if (!type.hasAllocatedSemantics()) { + op->emitError() << "expected allocated registers"; return WalkResult::interrupt(); } RegisterRange range = type.getAsRange(); diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index 5e094e79c..c002ac148 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(AsterTransforms AffineOptimizePtrAdd.cpp CanonicalizePtrs.cpp ConstexprExpansion.cpp + ConvertSCFControlFlow.cpp FactorizeAffineExpr.cpp DecomposeMemrefIterArgs.cpp LDSMultibufferPrep.cpp @@ -31,6 +32,7 @@ add_mlir_library(AsterTransforms MLIRAffineTransforms MLIRArithDialect MLIRArithTransforms + MLIRControlFlowDialect MLIRControlFlowInterfaces MLIRFuncDialect MLIRGPUDialect diff --git a/lib/Dialect/AMDGCN/Transforms/ConvertSCFControlFlow.cpp b/lib/Transforms/ConvertSCFControlFlow.cpp similarity index 73% rename from lib/Dialect/AMDGCN/Transforms/ConvertSCFControlFlow.cpp rename to lib/Transforms/ConvertSCFControlFlow.cpp index 25e544680..5fd243d51 100644 --- a/lib/Dialect/AMDGCN/Transforms/ConvertSCFControlFlow.cpp +++ b/lib/Transforms/ConvertSCFControlFlow.cpp @@ -1,4 +1,4 @@ -//===- ConvertSCFControlFlow.cpp - SCF to AMDGCN control flow conversion --===// +//===- ConvertSCFControlFlow.cpp - SCF to CF control flow conversion ------===// // // Copyright 2025 The ASTER Authors // @@ -9,18 +9,12 @@ //===----------------------------------------------------------------------===// // // This file implements the pass that converts SCF control flow operations to -// CF dialect operations with explicit basic block structure. The pass uses -// thread uniform analysis to ensure loops are uniform before conversion. +// CF dialect operations with explicit basic block structure. 
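+// For example (a sketch of the resulting structure): `scf.for %i = %lb to
+// %ub step %s` becomes an entry block that branches into a body block
+// carrying %i as a block argument, with a cf.cond_br back edge and a
+// separate exit block.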
// //===----------------------------------------------------------------------===// -#include "aster/Dialect/AMDGCN/Transforms/Passes.h" +#include "aster/Transforms/Passes.h" -#include "aster/Analysis/ABIAnalysis.h" -#include "aster/Dialect/AMDGCN/IR/AMDGCNAttrs.h" -#include "aster/Dialect/AMDGCN/IR/AMDGCNDialect.h" -#include "aster/Dialect/AMDGCN/IR/AMDGCNOps.h" -#include "aster/Dialect/LSIR/IR/LSIRDialect.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -29,10 +23,8 @@ #include "mlir/Transforms/DialectConversion.h" namespace mlir::aster { -namespace amdgcn { #define GEN_PASS_DEF_CONVERTSCFCONTROLFLOW -#include "aster/Dialect/AMDGCN/Transforms/Passes.h.inc" -} // namespace amdgcn +#include "aster/Transforms/Passes.h.inc" } // namespace mlir::aster using namespace mlir; @@ -44,22 +36,20 @@ namespace { //===----------------------------------------------------------------------===// struct ConvertSCFControlFlow - : public amdgcn::impl::ConvertSCFControlFlowBase { + : public aster::impl::ConvertSCFControlFlowBase { public: using Base::Base; void runOnOperation() override; private: /// Convert a scf.for operation to CF dialect control flow. - LogicalResult convertForOp(scf::ForOp forOp, const ABIAnalysis &abiAnalysis); + LogicalResult convertForOp(scf::ForOp forOp); /// Convert a scf.if operation to CF dialect control flow. - LogicalResult convertIfOp(scf::IfOp ifOp, const ABIAnalysis &abiAnalysis); + LogicalResult convertIfOp(scf::IfOp ifOp); }; -LogicalResult -ConvertSCFControlFlow::convertForOp(scf::ForOp forOp, - const ABIAnalysis &abiAnalysis) { +LogicalResult ConvertSCFControlFlow::convertForOp(scf::ForOp forOp) { Location loc = forOp.getLoc(); IRRewriter rewriter(forOp); @@ -70,34 +60,6 @@ ConvertSCFControlFlow::convertForOp(scf::ForOp forOp, Type ivType = forOp.getInductionVar().getType(); - // Check if a value is i32 or index_cast from i32. - auto isI32OrCastFromI32 = [](Value v) { - if (v.getType().isInteger(32)) - return true; - if (v.getType().isIndex()) { - if (auto castOp = v.getDefiningOp()) - return castOp.getIn().getType().isInteger(32); - } - return false; - }; - - // Only i32 (or index_cast from i32) bounds are supported. - if (!isI32OrCastFromI32(lowerBound) || !isI32OrCastFromI32(upperBound) || - !isI32OrCastFromI32(step)) { - return forOp.emitError() - << "only i32 induction variables are supported in this conversion " - "(bounds must be i32 or arith.index_cast from i32)"; - } - - // Check if the loop is thread-uniform. - bool isUniform = abiAnalysis.isThreadUniform(lowerBound).value_or(false) && - abiAnalysis.isThreadUniform(upperBound).value_or(false) && - abiAnalysis.isThreadUniform(step).value_or(false); - if (!isUniform) { - return forOp.emitError() - << "only thread-uniform loops are supported in this conversion"; - } - // Get the yield op and its operands before modifying the body. auto yieldOp = cast(forOp.getBody()->getTerminator()); SmallVector yieldOperands(yieldOp.getOperands()); @@ -137,9 +99,6 @@ ConvertSCFControlFlow::convertForOp(scf::ForOp forOp, iterArgBlockArgs.push_back(bbBody->getArgument(i + 1)); // Build the mapping from original region args to block args. - // This mapping is used by inlineBlockBefore to remap the body, but - // yieldOperands may also reference old block args (e.g., swap patterns - // like `scf.yield %b, %a`), so we must remap them too. 
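+  // Illustrative example (not in the original change): for
+  // `scf.for %i = ... iter_args(%a, %b)`, bodyArgMapping pairs [%i, %a, %b]
+  // with the new block's [iv, iter-arg] arguments before inlining the body.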
   SmallVector<Value> bodyArgMapping = {ivBlockArg};
   bodyArgMapping.append(iterArgBlockArgs);
@@ -155,9 +114,7 @@
   rewriter.inlineBlockBefore(forOp.getBody(), bbBody, bbBody->end(),
                              bodyArgMapping);
 
-  // Remap yield operands: they may reference old block args (now dead)
-  // from the original for body. After inlining, those block args have been
-  // replaced, but yieldOperands still holds the old Value references.
+  // Remap yield operands: they may reference old block args (now dead).
   for (Value &val : yieldOperands)
     val = blockArgMapping.lookupOrDefault(val);
 
@@ -181,20 +138,12 @@
   return success();
 }
 
-LogicalResult
-ConvertSCFControlFlow::convertIfOp(scf::IfOp ifOp,
-                                   const ABIAnalysis &abiAnalysis) {
+LogicalResult ConvertSCFControlFlow::convertIfOp(scf::IfOp ifOp) {
   Location loc = ifOp.getLoc();
   IRRewriter rewriter(ifOp);
 
   Value condition = ifOp.getCondition();
 
-  // Check if the condition is thread-uniform.
-  if (!abiAnalysis.isThreadUniform(condition).value_or(false)) {
-    return ifOp.emitError()
-           << "only thread-uniform conditions are supported in this conversion";
-  }
-
   bool hasElse = !ifOp.getElseRegion().empty();
 
   // Capture yield operands and block pointers before modifying anything.
@@ -248,9 +197,6 @@ void ConvertSCFControlFlow::runOnOperation() {
   Operation *op = getOperation();
 
-  // Get the ABI analysis which includes thread uniform analysis.
-  auto &abiAnalysis = getAnalysis<ABIAnalysis>();
-
   // Collect all SCF operations first to avoid modifying while iterating.
   // Walk is post-order (inner before outer), but we need top-down order
   // (outer before inner) so that converting an outer op inlines the body
@@ -266,9 +212,9 @@
   for (Operation *scfOp : scfOps) {
     LogicalResult result = success();
     if (auto forOp = dyn_cast<scf::ForOp>(scfOp))
-      result = convertForOp(forOp, abiAnalysis);
+      result = convertForOp(forOp);
     else if (auto ifOp = dyn_cast<scf::IfOp>(scfOp))
-      result = convertIfOp(ifOp, abiAnalysis);
+      result = convertIfOp(ifOp);
     if (failed(result)) {
       signalPassFailure();
       return;
@@ -290,10 +236,6 @@
       signalPassFailure();
     }
   });
-
-  // Set post-condition: no SCF ops remain.
-  if (auto kernelOp = dyn_cast<amdgcn::KernelOp>(op))
-    kernelOp.addNormalForms({amdgcn::NoScfOpsAttr::get(op->getContext())});
 }
 
 } // namespace
diff --git a/python/aster/pass_pipelines.py b/python/aster/pass_pipelines.py
index f19ae2248..b5d190120 100644
--- a/python/aster/pass_pipelines.py
+++ b/python/aster/pass_pipelines.py
@@ -198,7 +198,7 @@ def phase_scf_pipelining(lcm_unroll=True, unroll_factor_multiplier=1,
         # Convert SCF control flow to AMDGCN control flow
         # Note: control flow support is very limited atm, add NORMAL FORMS
         # to harden invariants.
- "amdgcn-convert-scf-control-flow", + "aster-convert-scf-control-flow", "canonicalize", "cse", "aster-codegen", "canonicalize", "cse", "canonicalize", diff --git a/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir b/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir index 419fca06b..05f485466 100644 --- a/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir +++ b/test/Dialect/AMDGCN/Analysis/cdna3-hazards.mlir @@ -36,25 +36,25 @@ func.func @cdna3_store_hazard_detected(%arg0: !amdgcn.vgpr<0>, %arg1: !amdgcn.vg //===----------------------------------------------------------------------===// // CHECK-LABEL: Symbol: cdna3_vcc_vccz_hazard_detected -// CHECK: Op: func.func @cdna3_vcc_vccz_hazard_detected(%{{.*}}: !amdgcn.vcc, %{{.*}}: !amdgcn.vccz, %{{.*}}: !amdgcn.vgpr<0>, %{{.*}}: !amdgcn.vgpr<1>, %{{.*}}: !amdgcn.vgpr<2>) {...} +// CHECK: Op: func.func @cdna3_vcc_vccz_hazard_detected(%{{.*}}: !amdgcn.vcc<0>, %{{.*}}: !amdgcn.vccz<0>, %{{.*}}: !amdgcn.vgpr<0>, %{{.*}}: !amdgcn.vgpr<1>, %{{.*}}: !amdgcn.vgpr<2>) {...} // CHECK: HAZARD STATE AFTER: -// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) +// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) // CHECK: HAZARD STATE AFTER: { // CHECK: active = [ -// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>), none, {v:5, s:0, ds:0}} +// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>), none, {v:5, s:0, ds:0}} // CHECK: ] // CHECK: nop counts = {v:0, s:0, ds:0} // CHECK: } -// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vccz, !amdgcn.vgpr<2>) +// CHECK: Op: amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vccz<0>, !amdgcn.vgpr<2>) // CHECK: HAZARD STATE AFTER: { // CHECK: active = [ -// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vccz, !amdgcn.vgpr<2>), none, {v:5, s:0, ds:0}} +// CHECK: {#amdgcn.cdna3_vcc_exec_vccz_execz_hazard, amdgcn.cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc<0>) ins(!amdgcn.vccz<0>, !amdgcn.vgpr<2>), none, {v:5, s:0, ds:0}} // CHECK: ] // CHECK: nop counts = {v:5, s:0, ds:0} // CHECK: } -func.func @cdna3_vcc_vccz_hazard_detected(%arg0: !amdgcn.vcc, %arg1: !amdgcn.vccz, %arg2: !amdgcn.vgpr<0>, %arg3: !amdgcn.vgpr<1>, %arg4: !amdgcn.vgpr<2>) { - amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg2, %arg3 : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) - amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg1, %arg4 : outs(!amdgcn.vcc) ins(!amdgcn.vccz, !amdgcn.vgpr<2>) +func.func @cdna3_vcc_vccz_hazard_detected(%arg0: !amdgcn.vcc<0>, %arg1: !amdgcn.vccz<0>, %arg2: !amdgcn.vgpr<0>, %arg3: !amdgcn.vgpr<1>, %arg4: !amdgcn.vgpr<2>) { + amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg2, %arg3 : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) + amdgcn.cmpi v_cmp_eq_i32 outs %arg0 ins %arg1, %arg4 : outs(!amdgcn.vcc<0>) ins(!amdgcn.vccz<0>, !amdgcn.vgpr<2>) return } diff --git a/test/Dialect/AMDGCN/Analysis/range-constraints.mlir b/test/Dialect/AMDGCN/Analysis/range-constraints.mlir index 15b126ccf..5098a4540 100644 --- 
a/test/Dialect/AMDGCN/Analysis/range-constraints.mlir +++ b/test/Dialect/AMDGCN/Analysis/range-constraints.mlir @@ -260,14 +260,16 @@ amdgcn.module @range_tests target = isa = { // CHECK: results: [5 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [6 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [7 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [8 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [9 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [10 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [11 = `%{{.*}}`] // CHECK: Symbol: phi_coalescing_2 // CHECK: No range constraints amdgcn.module @range_tests target = isa = { @@ -281,9 +283,10 @@ amdgcn.module @range_tests target = isa = { %5 = alloca : !amdgcn.vgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %8 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc + %8 = lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %9 = alloca : !amdgcn.vgpr - cf.cond_br %8, ^bb1, ^bb2 + lsir.cond_br %8 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 test_inst outs %4 ins %0 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () %11 = alloca : !amdgcn.vgpr @@ -314,10 +317,12 @@ amdgcn.module @range_tests target = isa = { // CHECK: results: [3 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.sgpr` // CHECK: results: [4 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [5 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [6 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [7 = `%{{.*}}`] // CHECK: Symbol: phi_coalescing_3 // CHECK: No range constraints amdgcn.module @range_tests target = isa = { @@ -329,9 +334,10 @@ amdgcn.module @range_tests target = isa = { %3 = alloca : !amdgcn.sgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %6 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc + %6 = lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %7 = alloca : !amdgcn.vgpr - cf.cond_br %6, ^bb1, ^bb2 + lsir.cond_br %6 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 lsir.copy %7, %0 : !amdgcn.vgpr, !amdgcn.vgpr cf.br ^bb3 diff --git a/test/Dialect/AMDGCN/Analysis/register-interference.mlir b/test/Dialect/AMDGCN/Analysis/register-interference.mlir index 6e1c8d87b..dc756f006 100644 --- a/test/Dialect/AMDGCN/Analysis/register-interference.mlir +++ b/test/Dialect/AMDGCN/Analysis/register-interference.mlir @@ -256,9 +256,10 @@ amdgcn.module @interference_tests target = isa = { %5 = alloca : !amdgcn.vgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, 
!amdgcn.sgpr) -> () - %8 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc<0>, !amdgcn.sgpr, i32 %9 = alloca : !amdgcn.vgpr - cf.cond_br %8, ^bb1, ^bb2 + lsir.cond_br %scc : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 test_inst outs %4 ins %0 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () %11 = alloca : !amdgcn.vgpr @@ -298,9 +299,10 @@ amdgcn.module @interference_tests target = isa = { %3 = alloca : !amdgcn.sgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %6 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc = lsir.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc, %2, %c0_i32 : !amdgcn.scc<0>, !amdgcn.sgpr, i32 %7 = alloca : !amdgcn.vgpr - cf.cond_br %6, ^bb1, ^bb2 + lsir.cond_br %scc : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 lsir.copy %7, %0 : !amdgcn.vgpr, !amdgcn.vgpr cf.br ^bb3 diff --git a/test/Dialect/AMDGCN/Analysis/register-liveness.mlir b/test/Dialect/AMDGCN/Analysis/register-liveness.mlir index 0ad368af4..33ee3d1bf 100644 --- a/test/Dialect/AMDGCN/Analysis/register-liveness.mlir +++ b/test/Dialect/AMDGCN/Analysis/register-liveness.mlir @@ -526,14 +526,16 @@ amdgcn.kernel @reg_interference { // CHECK: results: [5 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [6 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [7 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [8 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [9 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` // CHECK: results: [10 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [11 = `%{{.*}}`] // CHECK: Op: module {...} // CHECK: LIVE BEFORE: [] // CHECK: Symbol: phi_coalescing_2 @@ -557,12 +559,14 @@ amdgcn.kernel @reg_interference { // CHECK: LIVE BEFORE: [3 = `%{{.*}}`, 4 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.sgpr) -> () // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 3 = `%{{.*}}`, 4 = `%{{.*}}`] -// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: Op: %{{.*}} = lsir.alloca : !amdgcn.scc +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] +// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] -// CHECK: Op: cf.cond_br %{{.*}}, ^bb1, ^bb2 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 8 = `%{{.*}}`] +// CHECK: Op: lsir.cond_br %{{.*}} : !amdgcn.scc, ^bb1, ^bb2 +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 8 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [1 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr @@ -570,9 +574,9 @@ amdgcn.kernel @reg_interference { // CHECK: Op: amdgcn.test_inst outs %{{.*}} : 
(!amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [] // CHECK: Op: lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [10 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [8 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [2 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr @@ -580,11 +584,11 @@ amdgcn.kernel @reg_interference { // CHECK: Op: amdgcn.test_inst outs %{{.*}} : (!amdgcn.vgpr) -> () // CHECK: LIVE BEFORE: [] // CHECK: Op: lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: LIVE BEFORE: [10 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [11 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [8 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst ins %{{.*}} : (!amdgcn.vgpr) -> () -// CHECK: LIVE BEFORE: [8 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [9 = `%{{.*}}`] // CHECK: Op: amdgcn.end_kernel // CHECK: LIVE BEFORE: [] amdgcn.kernel @phi_coalescing_2 { @@ -597,9 +601,10 @@ amdgcn.kernel @phi_coalescing_2 { %5 = alloca : !amdgcn.vgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %8 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc8 = lsir.alloca : !amdgcn.scc + %8 = lsir.cmpi i32 eq %scc8, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %9 = alloca : !amdgcn.vgpr - cf.cond_br %8, ^bb1, ^bb2 + lsir.cond_br %8 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 test_inst outs %4 ins %0 : (!amdgcn.vgpr, !amdgcn.vgpr) -> () %11 = alloca : !amdgcn.vgpr @@ -629,10 +634,12 @@ amdgcn.kernel @phi_coalescing_2 { // CHECK: results: [3 = `%{{.*}}`] // CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.sgpr` // CHECK: results: [4 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32` +// CHECK: Operation: `%{{.*}} = lsir.alloca : !amdgcn.scc` // CHECK: results: [5 = `%{{.*}}`] -// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: Operation: `%{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32` // CHECK: results: [6 = `%{{.*}}`] +// CHECK: Operation: `%{{.*}} = amdgcn.alloca : !amdgcn.vgpr` +// CHECK: results: [7 = `%{{.*}}`] // CHECK: Op: module {...} // CHECK: LIVE BEFORE: [] // CHECK: Symbol: phi_coalescing_3 @@ -652,22 +659,24 @@ amdgcn.kernel @phi_coalescing_2 { // CHECK: LIVE BEFORE: [3 = `%{{.*}}`, 4 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst outs %{{.*}} ins %{{.*}} : (!amdgcn.vgpr, !amdgcn.sgpr) -> () // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 3 = `%{{.*}}`, 4 = `%{{.*}}`] -// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: Op: %{{.*}} = lsir.alloca : !amdgcn.scc +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] +// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 3 = `%{{.*}}`] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.vgpr -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] -// CHECK: Op: cf.cond_br %{{.*}}, ^bb1, ^bb2 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: Op: lsir.cond_br %{{.*}} : !amdgcn.scc, ^bb1, ^bb2 +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] // CHECK: Op: 
lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 7 = `%{{.*}}`] // CHECK: Op: lsir.copy %{{.*}}, %{{.*}} : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`] // CHECK: Op: cf.br ^bb3 -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 7 = `%{{.*}}`] // CHECK: Op: amdgcn.test_inst ins %{{.*}}, %{{.*}}, %{{.*}} : (!amdgcn.vgpr, !amdgcn.vgpr, !amdgcn.vgpr) -> () -// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 6 = `%{{.*}}`] +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`, 2 = `%{{.*}}`, 7 = `%{{.*}}`] // CHECK: Op: amdgcn.end_kernel // CHECK: LIVE BEFORE: [] amdgcn.kernel @phi_coalescing_3 { @@ -678,9 +687,10 @@ amdgcn.kernel @phi_coalescing_3 { %3 = alloca : !amdgcn.sgpr test_inst outs %0 ins %2 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () test_inst outs %1 ins %3 : (!amdgcn.vgpr, !amdgcn.sgpr) -> () - %6 = lsir.cmpi i32 eq %2, %c0_i32 : !amdgcn.sgpr, i32 + %scc6 = lsir.alloca : !amdgcn.scc + %6 = lsir.cmpi i32 eq %scc6, %2, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32 %7 = alloca : !amdgcn.vgpr - cf.cond_br %6, ^bb1, ^bb2 + lsir.cond_br %6 : !amdgcn.scc, ^bb1, ^bb2 ^bb1: // CHECK: pred: ^bb0 lsir.copy %7, %0 : !amdgcn.vgpr, !amdgcn.vgpr cf.br ^bb3 @@ -780,15 +790,18 @@ amdgcn.kernel @test_empty_kernel { // CHECK: LIVE BEFORE: [] // CHECK: Op: %{{.*}} = amdgcn.alloca : !amdgcn.sgpr // CHECK: LIVE BEFORE: [] -// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: Op: %{{.*}} = lsir.alloca : !amdgcn.scc // CHECK: LIVE BEFORE: [1 = `%{{.*}}`] -// CHECK: Op: cf.cond_br %{{.*}}, ^bb1, ^bb2 -// CHECK: LIVE BEFORE: [] +// CHECK: Op: %{{.*}} = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 +// CHECK: LIVE BEFORE: [1 = `%{{.*}}`] +// CHECK: Op: lsir.cond_br %{{.*}} : !amdgcn.scc, ^bb1, ^bb2 +// CHECK: LIVE BEFORE: [3 = `%{{.*}}`] amdgcn.kernel @test_non_register_filtered { %c0 = arith.constant 0 : i32 %0 = alloca : !amdgcn.sgpr - %cond = lsir.cmpi i32 eq %0, %c0 : !amdgcn.sgpr, i32 - cf.cond_br %cond, ^bb1, ^bb2 + %scc_dst = lsir.alloca : !amdgcn.scc + %cond = lsir.cmpi i32 eq %scc_dst, %0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + lsir.cond_br %cond : !amdgcn.scc, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: diff --git a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir index 2dfa1a000..26354a114 100644 --- a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir +++ b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-compute-ops.mlir @@ -4,9 +4,9 @@ amdgcn.module @allowed_cmpi target = #amdgcn.target isa = #amdgcn.isa attributes {normal_forms = [#amdgcn.no_lsir_compute_ops]} { - func.func @f(%a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> i1 { - %cmp = lsir.cmpi i32 slt %a, %b : !amdgcn.sgpr, !amdgcn.sgpr - return %cmp : i1 + func.func @f(%dst: !amdgcn.scc, %a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> !amdgcn.scc { + %cmp = lsir.cmpi i32 slt %dst, %a, %b : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %cmp : !amdgcn.scc } } diff --git a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir index 03da996dd..966375010 100644 --- a/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir +++ 
b/test/Dialect/AMDGCN/IR/normal-forms-no-lsir-control-ops-invalid.mlir @@ -4,9 +4,9 @@ amdgcn.module @rejected_cmpi target = #amdgcn.target isa = #amdgcn.isa attributes {normal_forms = [#amdgcn.no_lsir_control_ops]} { - func.func @f(%a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> i1 { + func.func @f(%dst: !amdgcn.scc, %a: !amdgcn.sgpr, %b: !amdgcn.sgpr) -> !amdgcn.scc { // expected-error @+1 {{normal form violation: LSIR control-flow operations are disallowed but found: lsir.cmpi}} - %cmp = lsir.cmpi i32 slt %a, %b : !amdgcn.sgpr, !amdgcn.sgpr - return %cmp : i1 + %cmp = lsir.cmpi i32 slt %dst, %a, %b : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %cmp : !amdgcn.scc } } diff --git a/test/Dialect/AMDGCN/Transforms/bufferization.mlir b/test/Dialect/AMDGCN/Transforms/bufferization.mlir index c14d53a36..338ee89b9 100644 --- a/test/Dialect/AMDGCN/Transforms/bufferization.mlir +++ b/test/Dialect/AMDGCN/Transforms/bufferization.mlir @@ -15,12 +15,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_3]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: %[[VAL_4:.*]] = test_inst outs %[[COPY_0]] : (!amdgcn.vgpr) -> !amdgcn.vgpr @@ -54,12 +54,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () @@ -95,12 +95,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_2]] : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_3]] : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.sgpr, !amdgcn.sgpr // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.sgpr) -> () @@ -139,12 +139,12 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb2 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_5]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_6]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () @@ -241,7 +241,7 @@ func.func private @rand() -> i1 // CHECK: cf.br ^bb1 // CHECK: ^bb1: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_4]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[CALL_0:.*]] = func.call @rand() : () -> i1 // CHECK: %[[VAL_5:.*]] = alloca : !amdgcn.vgpr @@ -250,7 +250,7 @@ func.func private @rand() -> i1 // CHECK: cf.cond_br %[[CALL_0]], ^bb3, 
^bb4 // CHECK: ^bb3: // CHECK: lsir.copy %[[VAL_0]], %[[VAL_6]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb4: // CHECK: test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () // CHECK: end_kernel @@ -290,7 +290,7 @@ func.func private @rand() -> i1 // CHECK: ^bb1: // CHECK: lsir.copy %[[VAL_2]], %[[VAL_7]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[VAL_8]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[CALL_0:.*]] = func.call @rand() : () -> i1 // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_3]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr @@ -299,7 +299,7 @@ func.func private @rand() -> i1 // CHECK: ^bb3: // CHECK: lsir.copy %[[VAL_2]], %[[COPY_1]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[COPY_0]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb4: // CHECK: test_inst ins %[[COPY_0]], %[[COPY_1]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> () // CHECK: end_kernel @@ -346,13 +346,13 @@ func.func private @rand() -> i1 // CHECK: ^bb2: // CHECK: lsir.copy %[[VAL_2]], %[[VAL_9]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[VAL_10]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb3: // CHECK: cf.br ^bb4 // CHECK: ^bb4: // CHECK: lsir.copy %[[VAL_2]], %[[VAL_11]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[VAL_0]], %[[VAL_12]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb5 +// CHECK: lsir.br ^bb5 // CHECK: ^bb5: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[VAL_3]], %[[VAL_2]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: %[[COPY_1:.*]] = lsir.copy %[[VAL_1]], %[[VAL_0]] : !amdgcn.vgpr, !amdgcn.vgpr @@ -397,7 +397,7 @@ func.func private @rand() -> i1 // CHECK: ^bb1: // CHECK: lsir.copy %[[ALLOCA_2]], %[[TEST_INST_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: lsir.copy %[[ALLOCA_0]], %[[TEST_INST_1]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[COPY_0:.*]] = lsir.copy %[[ALLOCA_1]], %[[ALLOCA_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: amdgcn.test_inst ins %[[COPY_0]] : (!amdgcn.vgpr) -> () @@ -427,14 +427,14 @@ func.func @test_copy_loc() { // CHECK: cf.br ^bb1 // CHECK: ^bb1: // CHECK: lsir.copy %[[ALLOCA_0]], %[[TEST_INST_0]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb2: // CHECK: %[[VAL_0:.*]] = call @rand() : () -> i1 // CHECK: %[[COPY_0:.*]] = lsir.copy %[[ALLOCA_1]], %[[ALLOCA_0]] : !amdgcn.vgpr, !amdgcn.vgpr // CHECK: cf.cond_br %[[VAL_0]], ^bb3, ^bb4 // CHECK: ^bb3: // CHECK: lsir.copy %[[ALLOCA_0]], %[[COPY_0]] : !amdgcn.vgpr, !amdgcn.vgpr -// CHECK: cf.br ^bb2 +// CHECK: lsir.br ^bb2 // CHECK: ^bb4: // CHECK: %[[ALLOCA_3:.*]] = amdgcn.alloca : !amdgcn.vgpr // CHECK: %[[TEST_INST_1:.*]] = amdgcn.test_inst outs %[[ALLOCA_3]] ins %[[COPY_0]] : (!amdgcn.vgpr, !amdgcn.vgpr) -> !amdgcn.vgpr diff --git a/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir b/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir index 942492a7c..d04ed2640 100644 --- a/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir +++ b/test/Dialect/AMDGCN/Transforms/chained-select-dps-violation.mlir @@ -11,12 +11,12 @@ // 4. Chained selects and loop counter advance all get concrete registers. 
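// As an illustrative sketch, the mux under test computes
//   outer = select(buf_idx == 0, buf0_val, select(buf_idx == 1, buf1_val, buf2_val))
// so both selects and the compares feeding them must get concrete registers.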
 //
 // CHECK-LABEL: kernel @chained_select_loop_3_way_buffer_mux {
-// CHECK: cf.br ^bb1
+// CHECK: lsir.br ^bb1
 // CHECK: ^bb1:
 // CHECK: lsir.select %{{[0-9]+}},
 // CHECK: lsir.select %{{[0-9]+}},
 // CHECK: test_inst ins %{{[0-9]+}} : (!amdgcn.sgpr<{{[0-9]+}}>)
-// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2
+// CHECK: lsir.cond_br %{{.*}} : !amdgcn.scc
 amdgcn.module @chained_select target = isa = {
   kernel @chained_select_loop_3_way_buffer_mux {
     %c0_i32 = arith.constant 0 : i32
@@ -47,26 +47,30 @@ amdgcn.module @chained_select target = isa = {
   ^loop(%k: !amdgcn.sgpr, %buf_idx: !amdgcn.sgpr):
     // 3-way buffer mux: chained select on buf_idx
-    %is_buf0 = lsir.cmpi i32 eq %buf_idx, %c0_i32 : !amdgcn.sgpr, i32
-    %is_buf1 = lsir.cmpi i32 eq %buf_idx, %c1_i32 : !amdgcn.sgpr, i32
+    %scc0_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc0 = lsir.cmpi i32 eq %scc0_alloc, %buf_idx, %c0_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
+    %scc1_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc1 = lsir.cmpi i32 eq %scc1_alloc, %buf_idx, %c1_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
 
     // Chained select: inner picks between buf1 / buf2 values,
     // outer picks between buf0 and inner result.
-    %inner = lsir.select %s_inner, %is_buf1, %c100_i32, %c200_i32 : !amdgcn.sgpr, i1, i32, i32
-    %outer = lsir.select %s_outer, %is_buf0, %c0_i32, %inner : !amdgcn.sgpr, i1, i32, !amdgcn.sgpr
+    %inner = lsir.select %s_inner, %scc1, %c100_i32, %c200_i32 : !amdgcn.sgpr, !amdgcn.scc, i32, i32
+    %outer = lsir.select %s_outer, %scc0, %c0_i32, %inner : !amdgcn.sgpr, !amdgcn.scc, i32, !amdgcn.sgpr
 
     // Use the outer select result (prevents DCE)
     test_inst ins %outer : (!amdgcn.sgpr) -> ()
 
     // Advance buf_idx: (buf_idx + 1) % 3 via wrap
     %next_raw = sop2 s_add_u32 outs %s_add ins %buf_idx, %c1_i32 : !amdgcn.sgpr, !amdgcn.sgpr, i32
-    %is_3 = lsir.cmpi i32 eq %next_raw, %c3_i32 : !amdgcn.sgpr, i32
-    %next_buf = lsir.select %s_next_buf, %is_3, %c0_i32, %next_raw : !amdgcn.sgpr, i1, i32, !amdgcn.sgpr
+    %scc2_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc2 = lsir.cmpi i32 eq %scc2_alloc, %next_raw, %c3_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
+    %next_buf = lsir.select %s_next_buf, %scc2, %c0_i32, %next_raw : !amdgcn.sgpr, !amdgcn.scc, i32, !amdgcn.sgpr
 
     // Advance loop counter
     %k_next = sop2 s_add_u32 outs %s_cmp ins %k, %c1_i32 : !amdgcn.sgpr, !amdgcn.sgpr, i32
-    %done = lsir.cmpi i32 slt %k_next, %c6_i32 : !amdgcn.sgpr, i32
-    cf.cond_br %done, ^loop(%k_next, %next_buf : !amdgcn.sgpr, !amdgcn.sgpr), ^exit
+    %scc3_alloc = amdgcn.alloca : !amdgcn.scc
+    %scc3 = lsir.cmpi i32 slt %scc3_alloc, %k_next, %c6_i32 : !amdgcn.scc, !amdgcn.sgpr, i32
+    lsir.cond_br %scc3 : !amdgcn.scc, ^loop(%k_next, %next_buf : !amdgcn.sgpr, !amdgcn.sgpr), ^exit
 
   ^exit:
     end_kernel
diff --git a/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir b/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir
index 385a8eb46..7768e48ce 100644
--- a/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir
+++ b/test/Dialect/AMDGCN/Transforms/convert-scf-iter-args.mlir
@@ -1,4 +1,4 @@
-// RUN: aster-opt %s --amdgcn-convert-scf-control-flow | FileCheck %s
+// RUN: aster-opt %s --aster-convert-scf-control-flow | FileCheck %s
 
 // Test scf.for with a single iter_arg (accumulator pattern)
 // Uses index_cast to convert i32 bounds to index (required by scf.for with iter_args)
diff --git a/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir b/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir
index aca47f3b9..9e06e4c05 100644
--- a/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir
+++
b/test/Dialect/AMDGCN/Transforms/convert-scf-nf.mlir @@ -1,11 +1,12 @@ -// RUN: aster-opt --pass-pipeline='builtin.module(any(amdgcn-convert-scf-control-flow))' %s \ +// RUN: aster-opt --pass-pipeline='builtin.module(any(aster-convert-scf-control-flow))' %s \ // RUN: | FileCheck %s -// Verify that convert-scf-control-flow sets the no_scf_ops post-condition. +// Verify that convert-scf-control-flow converts scf.for with no remaining SCF ops. -// CHECK-LABEL: kernel @sets_postcondition -// CHECK-SAME: attributes {normal_forms = [#amdgcn.no_scf_ops]} -amdgcn.kernel @sets_postcondition { +// CHECK-LABEL: kernel @no_remaining_scf +// CHECK-NOT: scf.for +// CHECK-NOT: scf.if +amdgcn.kernel @no_remaining_scf { ^bb0: %0 = amdgcn.alloca : !amdgcn.vgpr<3> amdgcn.end_kernel diff --git a/test/Dialect/AMDGCN/Transforms/convert-scf.mlir b/test/Dialect/AMDGCN/Transforms/convert-scf.mlir index 88ab2629d..af9566887 100644 --- a/test/Dialect/AMDGCN/Transforms/convert-scf.mlir +++ b/test/Dialect/AMDGCN/Transforms/convert-scf.mlir @@ -1,4 +1,4 @@ -// RUN: aster-opt %s --amdgcn-convert-scf-control-flow --split-input-file --verify-diagnostics | FileCheck %s +// RUN: aster-opt %s --aster-convert-scf-control-flow --split-input-file --verify-diagnostics | FileCheck %s // CHECK-LABEL: func.func @test_uniform_loops_const_bounds() { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32 @@ -55,33 +55,6 @@ func.func @test_uniform_loops_non_const_bounds(%n: i32) { // ----- -func.func @test_non_const_bounds(%n: i32) { - %c0 = arith.constant 0 : i32 - %c1 = arith.constant 1 : i32 - // expected-error@+1 {{only thread-uniform loops are supported in this conversion}} - scf.for %i = %c0 to %n step %c1 : i32 { - %iv = lsir.to_reg %i : i32 -> !amdgcn.sgpr - amdgcn.test_inst ins %iv : (!amdgcn.sgpr) -> () - } - return -} - -// ----- - -func.func @test_index_loop_unsupported() { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c10 = arith.constant 10 : index - // expected-error@+1 {{only i32 induction variables are supported in this conversion}} - scf.for %i = %c0 to %c10 step %c1 { - %iv = lsir.to_reg %i : index -> !amdgcn.sgpr - amdgcn.test_inst ins %iv : (!amdgcn.sgpr) -> () - } - return -} - -// ----- - // CHECK-LABEL: func.func @test_uniform_if_no_else( // CHECK-SAME: %[[COND:.*]]: i1) { // CHECK: %[[COND_U:.*]] = aster_utils.assume_uniform %[[COND]] : i1 @@ -407,18 +380,6 @@ func.func @test_if_with_results_inside_for(%cond: i1, %init: i32) { // ----- -func.func @test_non_uniform_if(%cond: i1) { - // expected-error@+1 {{only thread-uniform conditions are supported in this conversion}} - scf.if %cond { - %c42 = arith.constant 42 : i32 - %reg = lsir.to_reg %c42 : i32 -> !amdgcn.sgpr - amdgcn.test_inst ins %reg : (!amdgcn.sgpr) -> () - } - return -} - -// ----- - // Test that pre-existing cf.cond_br with bad block layout is rejected. // Both destinations jump past the next block, so neither is a fallthrough. 
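// Illustrative layout (sketch): with blocks ordered ^bb0, ^bb1, ^bb2, ^bb3,
// a `cf.cond_br %c, ^bb2, ^bb3` in ^bb0 leaves neither successor adjacent,
// so no arm can be turned into a fallthrough.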
func.func @test_bad_block_layout(%cond: i1) { diff --git a/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir b/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir index f379f688b..12c6cc30e 100644 --- a/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir +++ b/test/Dialect/AMDGCN/Transforms/legalize-cf-nf.mlir @@ -13,8 +13,9 @@ amdgcn.kernel @sets_postcondition attributes {normal_forms = [#amdgcn.all_regist %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0_i32 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10_i32 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - cf.cond_br %cmp, ^bb1, ^bb2 + %alloc_scc = lsir.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %alloc_scc, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.cond_br %alloc_scc : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: amdgcn.end_kernel ^bb2: diff --git a/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir b/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir index 754bc2386..05f59d8a0 100644 --- a/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir +++ b/test/Dialect/AMDGCN/Transforms/legalize-cf.mlir @@ -3,10 +3,10 @@ // CHECK-LABEL: kernel @test_cond_branch_slt // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_lt_i32 outs %[[SCC]] ins %[[A]], %[[B]] : outs(!amdgcn.scc) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> +// CHECK: lsir.cmpi i32 slt %[[SCC]], %[[A]], %[[B]] : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> // Use SCC0 because ^bb1 (trueDest) is the next physical block - branch to ^bb2 if false -// CHECK: cbranch s_cbranch_scc0 %[[SCC]] ^bb2 fallthrough(^bb1) : !amdgcn.scc +// CHECK: cbranch s_cbranch_scc0 %[[SCC]] ^bb2 fallthrough(^bb1) : !amdgcn.scc<0> // CHECK: ^bb1: // CHECK: end_kernel // CHECK: ^bb2: @@ -19,8 +19,9 @@ amdgcn.module @test_slt target = isa = { %alloc1 = alloca : !amdgcn.sgpr<1> sop1 s_mov_b32 outs %alloc0 ins %c0_i32 : !amdgcn.sgpr<0>, i32 sop1 s_mov_b32 outs %alloc1 ins %c10_i32 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - cf.cond_br %cmp, ^bb1, ^bb2 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -36,7 +37,7 @@ amdgcn.module @test_slt target = isa = { // CHECK: end_kernel amdgcn.module @test_br target = isa = { amdgcn.kernel @test_unconditional_branch { - cf.br ^bb1 + lsir.br ^bb1 ^bb1: end_kernel } @@ -45,7 +46,7 @@ amdgcn.module @test_br target = isa = { // ----- // Verify lsir.cmpi is converted to amdgcn.cmpi with allocated operands -// Verify cf.cond_br is converted to amdgcn.cbranch +// Verify lsir.cond_br is converted to amdgcn.cbranch // Entry check: use s_cbranch_scc0 because ^bb1 (trueDest) is the next physical block // Branch to ^bb2 if SCC=0 (condition false), fallthrough to ^bb1 if SCC=1 (true) // Verify block argument is removed (^bb1 has no args after legalization) @@ -54,11 +55,11 @@ amdgcn.module @test_br target = isa = { // Branch to ^bb1 if SCC=1 (continue loop), fallthrough to ^bb2 if SCC=0 (exit) // CHECK-LABEL: kernel @test_cf_cond_br_lsir_cmpi -// CHECK: cmpi s_cmp_gt_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.scc) ins(!amdgcn.sgpr<6>, i32) +// CHECK: lsir.cmpi i32 sgt %{{.*}}, %{{.*}}, %{{.*}} : 
!amdgcn.scc<0>, !amdgcn.sgpr<6>, i32 // CHECK: cbranch s_cbranch_scc0 %{{.*}} ^bb2 fallthrough(^bb1) // CHECK: ^bb1: // CHECK: sop2 s_add_u32 outs %[[LOOP_ALLOC:.*]] ins %[[LOOP_ALLOC]] -// CHECK: cmpi s_cmp_lt_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.scc) ins(!amdgcn.sgpr<7>, !amdgcn.sgpr<6>) +// CHECK: lsir.cmpi i32 slt %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc<0>, !amdgcn.sgpr<7>, !amdgcn.sgpr<6> // CHECK: cbranch s_cbranch_scc1 %{{.*}} ^bb1 fallthrough(^bb2) // CHECK: ^bb2: // CHECK: end_kernel @@ -95,21 +96,23 @@ amdgcn.module @ds_kernels target = isa = { amdgcn.sopp.s_waitcnt lgkmcnt = 0 // // Loop start cond: - %15 = lsir.cmpi i32 sgt %6, %c0_i32 : !amdgcn.sgpr<6>, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 sgt %scc0, %6, %c0_i32 : !amdgcn.scc<0>, !amdgcn.sgpr<6>, i32 // Loop iv: sgpr<7> sop1 s_mov_b32 outs %7 ins %c0_i32 : !amdgcn.sgpr<7>, i32 - cf.cond_br %15, ^bb1(%7 : !amdgcn.sgpr<7>), ^bb2 - ^bb1(%18: !amdgcn.sgpr<7>): // 2 preds: ^bb0, ^bb1 - sop2 s_lshl_b32 outs %8 ins %18, %c2_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<7>, i32 + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 + ^bb1: // 2 preds: ^bb0, ^bb1 + sop2 s_lshl_b32 outs %8 ins %7, %c2_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<7>, i32 amdgcn.vop1.vop1 %9, %8 : (!amdgcn.vgpr<0>, !amdgcn.sgpr<8>) -> () %21 = store global_store_dword data %9 addr %13 offset d(%9) : ins(!amdgcn.vgpr<0>, !amdgcn.sgpr<[4 : 6]>, !amdgcn.vgpr<0>) -> !amdgcn.write_token // // Loop iv increment: sgpr<7> - sop2 s_add_u32 outs %7 ins %18, %c1_i32 : !amdgcn.sgpr<7>, !amdgcn.sgpr<7>, i32 + sop2 s_add_u32 outs %7 ins %7, %c1_i32 : !amdgcn.sgpr<7>, !amdgcn.sgpr<7>, i32 // Loop end cond: lsir.cmpi - %24 = lsir.cmpi i32 slt %7, %6 : !amdgcn.sgpr<7>, !amdgcn.sgpr<6> - // Loop backedge: cf.cond_br - cf.cond_br %24, ^bb1(%7 : !amdgcn.sgpr<7>), ^bb2 + %scc1 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc1, %7, %6 : !amdgcn.scc<0>, !amdgcn.sgpr<7>, !amdgcn.sgpr<6> + // Loop backedge: lsir.cond_br + lsir.cond_br %scc1 : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 end_kernel } @@ -146,26 +149,21 @@ amdgcn.module @test_br_vgpr_range target = isa = { amdgcn.vop1.vop1 %v3, %c0 : (!amdgcn.vgpr<3>, i32) -> () // Create range - uses vop1 results (which write to allocas) - // CHECK: make_register_range + // CHECK: %[[RANGE:.*]] = make_register_range %range = make_register_range %v0, %v1, %v2, %v3 : !amdgcn.vgpr<0>, !amdgcn.vgpr<1>, !amdgcn.vgpr<2>, !amdgcn.vgpr<3> - // Branch with range as operand + // Branch with range as operand - lsir.br is not lowered by this pass // CHECK: branch s_branch ^bb1 - cf.br ^bb1(%range : !amdgcn.vgpr<[0 : 4]>) + lsir.br ^bb1 - // Block argument should be removed, range reconstructed + // Block argument remains (lsir.br is not lowered, so block args are preserved) // CHECK: ^bb1: - // CHECK-NOT: ^bb1(% - // Verify no duplicate allocas created - // CHECK-NOT: alloca - ^bb1(%arg: !amdgcn.vgpr<[0 : 4]>): - // Range should be reconstructed from SAME allocas at block entry - // CHECK: %[[RECONSTRUCTED:.*]] = make_register_range %[[V0]], %[[V1]], %[[V2]], %[[V3]] // Split the range - verify 4 results - // CHECK: %{{.*}}:4 = split_register_range %[[RECONSTRUCTED]] - %split:4 = split_register_range %arg : !amdgcn.vgpr<[0 : 4]> + // CHECK: %{{.*}}:4 = split_register_range %[[RANGE]] + ^bb1: + %split:4 = split_register_range %range : !amdgcn.vgpr<[0 : 4]> // CHECK: end_kernel end_kernel @@ -218,18 +216,20 @@ amdgcn.module @test_loop target = isa = { // CHECK: sop1 s_mov_b32 outs %[[S8]] sop1 s_mov_b32 outs 
%s8 ins %c0_i32 : !amdgcn.sgpr<8>, i32 - // Branch to loop - passes counter (SGPR) and accumulator (VGPR range) + // Branch to loop - counter and accumulator flow through allocas // CHECK: branch s_branch ^bb1 - cf.br ^bb1(%s8, %acc_init : !amdgcn.sgpr<8>, !amdgcn.vgpr<[4 : 8]>) + lsir.br ^bb1 - // Loop header - block arguments should be removed + // Loop header - no block arguments (values flow through allocas) // CHECK: ^bb1: // CHECK-NOT: ^bb1(% // Verify no duplicate allocas - counter flows through %[[S8]], accumulator through %[[V4]]-[[V7]] // CHECK-NOT: alloca - ^bb1(%counter: !amdgcn.sgpr<8>, %acc: !amdgcn.vgpr<[4 : 8]>): + ^bb1: // Accumulator range should be reconstructed from SAME allocas at loop entry // CHECK: %[[ACC_RECON:.*]] = make_register_range %[[V4]], %[[V5]], %[[V6]], %[[V7]] + %acc_loop = make_register_range %v4, %v5, %v6, %v7 : + !amdgcn.vgpr<4>, !amdgcn.vgpr<5>, !amdgcn.vgpr<6>, !amdgcn.vgpr<7> // Dummy operands for MFMA (simplified - real code would have loads) %v16 = alloca : !amdgcn.vgpr<16> @@ -241,27 +241,29 @@ amdgcn.module @test_loop target = isa = { // MFMA: new_acc = MFMA(a, b, acc) - accumulator is both input and output // CHECK: vop3p.vop3p_mai %[[ACC_RECON]] - amdgcn.vop3p.vop3p_mai %acc, %dummy_a, %dummy_b, %acc : <[16 : 18]>, <[18 : 20]>, !amdgcn.vgpr<[4 : 8]> -> !amdgcn.vgpr<[4 : 8]> + amdgcn.vop3p.vop3p_mai %acc_loop, %dummy_a, %dummy_b, %acc_loop : <[16 : 18]>, <[18 : 20]>, !amdgcn.vgpr<[4 : 8]> -> !amdgcn.vgpr<[4 : 8]> // Increment counter - writes to %[[S8]] alloca // CHECK: sop2 s_add_u32 outs %[[S8]] ins %[[S8]] - sop2 s_add_u32 outs %s8 ins %counter, %c1_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<8>, i32 + sop2 s_add_u32 outs %s8 ins %s8, %c1_i32 : !amdgcn.sgpr<8>, !amdgcn.sgpr<8>, i32 // Loop condition - // CHECK: cmpi s_cmp_lt_i32 - %cond = lsir.cmpi i32 slt %s8, %c2_i32 : !amdgcn.sgpr<8>, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc0, %s8, %c2_i32 : !amdgcn.scc<0>, !amdgcn.sgpr<8>, i32 - // Loop backedge - passes updated counter and accumulator to both loop and exit + // Loop backedge - values flow through allocas // CHECK: cbranch s_cbranch_scc1 {{.*}} ^bb1 fallthrough(^bb2) - cf.cond_br %cond, ^bb1(%s8, %acc : !amdgcn.sgpr<8>, !amdgcn.vgpr<[4 : 8]>), ^bb2(%acc : !amdgcn.vgpr<[4 : 8]>) + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 - // Exit block - receives final accumulator from loop + // Exit block - no block arguments (values flow through allocas) // CHECK: ^bb2: // CHECK-NOT: ^bb2(% // CHECK-NOT: alloca - ^bb2(%final_acc: !amdgcn.vgpr<[4 : 8]>): + ^bb2: // Reconstruct range at exit from SAME allocas // CHECK: %[[FINAL_RECON:.*]] = make_register_range %[[V4]], %[[V5]], %[[V6]], %[[V7]] + %final_acc = make_register_range %v4, %v5, %v6, %v7 : + !amdgcn.vgpr<4>, !amdgcn.vgpr<5>, !amdgcn.vgpr<6>, !amdgcn.vgpr<7> // Extract final values - verify 4 results // CHECK: %{{.*}}:4 = split_register_range %[[FINAL_RECON]] %final:4 = split_register_range %final_acc : !amdgcn.vgpr<[4 : 8]> @@ -278,8 +280,7 @@ amdgcn.module @test_loop target = isa = { // CHECK-LABEL: kernel @test_select_i1 // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_eq_i32 outs %[[SCC]] ins %[[A]], %[[B]] +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> // CHECK: sop2 s_cselect_b32 outs %{{.*}} ins // CHECK: end_kernel amdgcn.module @test_select_i1_mod target = isa = { @@ -293,8 +294,9 @@ amdgcn.module @test_select_i1_mod target = isa = { %alloc2 = 
amdgcn.alloca : !amdgcn.sgpr<2> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c42, %c99 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 amdgcn.end_kernel } } @@ -307,10 +309,8 @@ amdgcn.module @test_select_i1_mod target = isa = { // CHECK-LABEL: kernel @test_select_fanout // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_eq_i32 outs %[[SCC]] ins %[[A]], %[[B]] -// CHECK-NOT: alloca : !amdgcn.scc -// CHECK-NOT: cmpi +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> +// CHECK-NOT: alloca : !amdgcn.scc<0> // CHECK: sop2 s_cselect_b32 // CHECK: sop2 s_cselect_b32 // CHECK: end_kernel @@ -328,9 +328,10 @@ amdgcn.module @test_select_fanout_mod target = isa = { %alloc3 = amdgcn.alloca : !amdgcn.sgpr<3> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp, %c1, %c2 : !amdgcn.sgpr<2>, i1, i32, i32 - lsir.select %alloc3, %cmp, %c3, %c4 : !amdgcn.sgpr<3>, i1, i32, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c1, %c2 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 + lsir.select %alloc3, %scc0, %c3, %c4 : !amdgcn.sgpr<3>, !amdgcn.scc<0>, i32, i32 amdgcn.end_kernel } } @@ -342,10 +343,8 @@ amdgcn.module @test_select_fanout_mod target = isa = { // CHECK-LABEL: kernel @test_mixed_consumers // CHECK: sop1 s_mov_b32 outs %[[A:.*]] ins // CHECK: sop1 s_mov_b32 outs %[[B:.*]] ins -// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc -// CHECK: cmpi s_cmp_eq_i32 outs %[[SCC]] ins %[[A]], %[[B]] -// CHECK-NOT: alloca : !amdgcn.scc -// CHECK-NOT: cmpi +// CHECK: %[[SCC:.*]] = alloca : !amdgcn.scc<0> +// CHECK-NOT: alloca : !amdgcn.scc<0> // CHECK: sop2 s_cselect_b32 // CHECK: cbranch s_cbranch_scc0 %[[SCC]] // CHECK: ^bb1: @@ -363,9 +362,10 @@ amdgcn.module @test_mixed_mod target = isa = { %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 - cf.cond_br %cmp, ^bb1, ^bb2 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c42, %c99 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 + lsir.cond_br %scc0 : !amdgcn.scc<0>, ^bb1, ^bb2 ^bb1: amdgcn.end_kernel ^bb2: @@ -378,11 +378,7 @@ amdgcn.module @test_mixed_mod target = isa = { // Sequential non-overlapping i1 lifetimes are fine. 
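// An illustrative sketch (names are placeholders, not FileCheck-matched):
// each compare may take its own !amdgcn.scc<0> alloca, all aliasing the one
// physical SCC, because the first flag is consumed before the second compare:
//   %scc0 = amdgcn.alloca : !amdgcn.scc<0>
//   lsir.cmpi i32 eq %scc0, %a, %b : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1>
//   lsir.select %d, %scc0, %t, %f : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32
//   %scc1 = amdgcn.alloca : !amdgcn.scc<0>   // %scc0 is dead here, so SCC may be redefined
//   lsir.cmpi i32 slt %scc1, %a, %b : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1>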
// CHECK-LABEL: kernel @test_sequential_i1 -// CHECK: cmpi s_cmp_eq_i32 -// CHECK-NOT: cmpi // CHECK: sop2 s_cselect_b32 -// CHECK: cmpi s_cmp_lt_i32 -// CHECK-NOT: cmpi // CHECK: sop2 s_cselect_b32 // CHECK: end_kernel amdgcn.module @test_sequential_mod target = isa = { @@ -398,91 +394,13 @@ amdgcn.module @test_sequential_mod target = isa = { amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 // First compare: consumed immediately by select - %cmp1 = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp1, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 + %scc0 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 eq %scc0, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc2, %scc0, %c42, %c99 : !amdgcn.sgpr<2>, !amdgcn.scc<0>, i32, i32 // Second compare: starts after first is consumed -- no overlap - %cmp2 = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc3, %cmp2, %c42, %c99 : !amdgcn.sgpr<3>, i1, i32, i32 - amdgcn.end_kernel - } -} - -// ----- - -// Dead cmpi won't be lowered and doesn't consume SCC at runtime (would have -// been DCE'd away) - -// CHECK-LABEL: kernel @test_dead_cmpi_then_live -// CHECK-NOT: cmpi s_cmp_eq_i32 -// CHECK: cmpi s_cmp_lt_i32 -// CHECK-NOT: cmpi -// CHECK: sop2 s_cselect_b32 -// CHECK: end_kernel -amdgcn.module @test_dead_cmpi_mod target = isa = { - amdgcn.kernel @test_dead_cmpi_then_live { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c42 = arith.constant 42 : i32 - %c99 = arith.constant 99 : i32 - %alloc0 = amdgcn.alloca : !amdgcn.sgpr<0> - %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> - %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> - amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 - amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - // Dead compare -- result unused, should be ignored by precondition check - %dead = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - // Live compare -- consumed by select - %live = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %live, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 - amdgcn.end_kernel - } -} - -// ----- - -// Overlapping i1 lifetimes: cmpi2 executes while cmpi1's result is still live. -// This would silently clobber SCC. - -amdgcn.module @test_overlap_mod target = isa = { - amdgcn.kernel @test_overlapping_i1 { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c42 = arith.constant 42 : i32 - %c99 = arith.constant 99 : i32 - %alloc0 = amdgcn.alloca : !amdgcn.sgpr<0> - %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> - %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> - amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 - amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - %cmp1 = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - // expected-error @+1 {{would clobber flag register from earlier compare; i1 lifetimes must not overlap}} - %cmp2 = lsir.cmpi i32 slt %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - lsir.select %alloc2, %cmp1, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 - amdgcn.end_kernel - } -} - -// ----- - -// Cross-block i1 usage: flag register (SCC/VCC) is not preserved across -// block boundaries (any branch clobbers it). 
- -amdgcn.module @test_crossblock_mod target = isa = { - amdgcn.kernel @test_cross_block_i1 { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c42 = arith.constant 42 : i32 - %c99 = arith.constant 99 : i32 - %alloc0 = amdgcn.alloca : !amdgcn.sgpr<0> - %alloc1 = amdgcn.alloca : !amdgcn.sgpr<1> - %alloc2 = amdgcn.alloca : !amdgcn.sgpr<2> - amdgcn.sop1 s_mov_b32 outs %alloc0 ins %c0 : !amdgcn.sgpr<0>, i32 - amdgcn.sop1 s_mov_b32 outs %alloc1 ins %c10 : !amdgcn.sgpr<1>, i32 - // expected-error @+1 {{has consumer in a different block; flag register (SCC/VCC) is not preserved across block boundaries}} - %cmp = lsir.cmpi i32 eq %alloc0, %alloc1 : !amdgcn.sgpr<0>, !amdgcn.sgpr<1> - cf.br ^bb1 - ^bb1: - lsir.select %alloc2, %cmp, %c42, %c99 : !amdgcn.sgpr<2>, i1, i32, i32 + %scc1 = amdgcn.alloca : !amdgcn.scc<0> + lsir.cmpi i32 slt %scc1, %alloc0, %alloc1 : !amdgcn.scc<0>, !amdgcn.sgpr<0>, !amdgcn.sgpr<1> + lsir.select %alloc3, %scc1, %c42, %c99 : !amdgcn.sgpr<3>, !amdgcn.scc<0>, i32, i32 amdgcn.end_kernel } } @@ -494,7 +412,6 @@ amdgcn.module @test_crossblock_mod target = isa = { // CHECK-LABEL: kernel @test_vopc_cond_branch // CHECK: %[[VCC:.*]] = alloca : !amdgcn.vcc -// CHECK: cmpi v_cmp_lt_i32 outs %[[VCC]] ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) // CHECK: cbranch s_cbranch_vccz %[[VCC]] ^bb2 fallthrough(^bb1) : !amdgcn.vcc // CHECK: ^bb1: // CHECK: end_kernel @@ -505,8 +422,9 @@ amdgcn.module @test_vopc_mod target = isa = { %c0_i32 = arith.constant 0 : i32 %v0 = alloca : !amdgcn.vgpr<0> // VGPR on rhs forces vector compare path - %cmp = lsir.cmpi i32 slt %c0_i32, %v0 : i32, !amdgcn.vgpr<0> - cf.cond_br %cmp, ^bb1, ^bb2 + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 slt %vcc0, %c0_i32, %v0 : !amdgcn.vcc<0>, i32, !amdgcn.vgpr<0> + lsir.cond_br %vcc0 : !amdgcn.vcc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -521,15 +439,15 @@ amdgcn.module @test_vopc_mod target = isa = { // and predicate is flipped: slt(v0, 32) -> gt(32, v0). // CHECK-LABEL: kernel @test_vopc_operand_swap -// CHECK: cmpi v_cmp_gt_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) // CHECK: cbranch s_cbranch_vccz %{{.*}} ^bb2 fallthrough(^bb1) : !amdgcn.vcc amdgcn.module @test_vopc_swap_mod target = isa = { amdgcn.kernel @test_vopc_operand_swap { %c32_i32 = arith.constant 32 : i32 %v0 = alloca : !amdgcn.vgpr<0> // lhs=VGPR, rhs=imm -> swap + flip: slt -> gt - %cmp = lsir.cmpi i32 slt %v0, %c32_i32 : !amdgcn.vgpr<0>, i32 - cf.cond_br %cmp, ^bb1, ^bb2 + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 slt %vcc0, %v0, %c32_i32 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, i32 + lsir.cond_br %vcc0 : !amdgcn.vcc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -542,14 +460,14 @@ amdgcn.module @test_vopc_swap_mod target = isa = { // VOPC with two VGPR operands: no swap needed, rhs already a VGPR. 
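// An illustrative contrast (operand names are placeholders): VOPC takes the
// VGPR in src1, so a VGPR lhs with an immediate rhs is swapped and the
// predicate flipped when lowering, as in the test above:
//   lsir.cmpi i32 slt %vcc, %v0, %c32 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, i32
//   ->  amdgcn.cmpi v_cmp_gt_i32 outs %vcc ins %c32, %v0 : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr<0>)
// With two VGPR operands src1 is already a VGPR, so the compare lowers unchanged: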
// CHECK-LABEL: kernel @test_vopc_two_vgprs -// CHECK: cmpi v_cmp_eq_i32 outs %{{.*}} ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) // CHECK: cbranch s_cbranch_vccz %{{.*}} ^bb2 fallthrough(^bb1) : !amdgcn.vcc amdgcn.module @test_vopc_vv_mod target = isa = { amdgcn.kernel @test_vopc_two_vgprs { %v0 = alloca : !amdgcn.vgpr<0> %v1 = alloca : !amdgcn.vgpr<1> - %cmp = lsir.cmpi i32 eq %v0, %v1 : !amdgcn.vgpr<0>, !amdgcn.vgpr<1> - cf.cond_br %cmp, ^bb1, ^bb2 + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 eq %vcc0, %v0, %v1 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, !amdgcn.vgpr<1> + lsir.cond_br %vcc0 : !amdgcn.vcc<0>, ^bb1, ^bb2 ^bb1: end_kernel ^bb2: @@ -565,7 +483,6 @@ amdgcn.module @test_vopc_vv_mod target = isa = { // CHECK-LABEL: kernel @test_vopc_select // CHECK: %[[VCC:.*]] = alloca : !amdgcn.vcc -// CHECK: cmpi v_cmp_eq_i32 outs %[[VCC]] ins %{{.*}}, %{{.*}} : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) // CHECK: vop2 v_cndmask_b32 outs %{{.*}} ins %{{.*}}, %{{.*}} src2 = %[[VCC]] // CHECK: end_kernel amdgcn.module @test_vopc_select_mod target = isa = { @@ -574,9 +491,10 @@ amdgcn.module @test_vopc_select_mod target = isa = { %v1 = alloca : !amdgcn.vgpr<1> %v2 = alloca : !amdgcn.vgpr<2> %v3 = alloca : !amdgcn.vgpr<3> - %cmp = lsir.cmpi i32 eq %v0, %v1 : !amdgcn.vgpr<0>, !amdgcn.vgpr<1> + %vcc0 = amdgcn.alloca : !amdgcn.vcc<0> + lsir.cmpi i32 eq %vcc0, %v0, %v1 : !amdgcn.vcc<0>, !amdgcn.vgpr<0>, !amdgcn.vgpr<1> // true_value=%v2, false_value=%v3, dst=%v3 (v_cndmask reads VCC) - lsir.select %v3, %cmp, %v2, %v3 : !amdgcn.vgpr<3>, i1, !amdgcn.vgpr<2>, !amdgcn.vgpr<3> + lsir.select %v3, %vcc0, %v2, %v3 : !amdgcn.vgpr<3>, !amdgcn.vcc<0>, !amdgcn.vgpr<2>, !amdgcn.vgpr<3> end_kernel } } diff --git a/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir b/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir index 5292611e2..07fafd447 100644 --- a/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir +++ b/test/Dialect/AMDGCN/Transforms/legalize-operands.mlir @@ -7,7 +7,7 @@ // CHECK: lsir.cmpi // CHECK: %[[OUT:.*]] = alloca : !amdgcn.sgpr // CHECK: %[[MOV:.*]] = sop1 s_mov_b32 outs %[[OUT]] ins %{{.*}} : !amdgcn.sgpr, i32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[MOV]], %{{.*}} : !amdgcn.sgpr, i1, !amdgcn.sgpr, i32 +// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[MOV]], %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, i32 amdgcn.module @dual_literal_mod target = isa = { amdgcn.kernel @dual_literal_select { %c0 = arith.constant 0 : i32 @@ -15,8 +15,9 @@ amdgcn.module @dual_literal_mod target = isa = { %c1632 = arith.constant 1632 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %c544, %c1632 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %c544, %c1632 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -29,7 +30,7 @@ amdgcn.module @dual_literal_mod target = isa = { // CHECK-LABEL: kernel @one_inline_select // CHECK-NOT: sop1 s_mov_b32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 amdgcn.module @one_inline_mod target = isa = { amdgcn.kernel @one_inline_select { %c0 = arith.constant 0 : i32 @@ -37,8 +38,9 @@ amdgcn.module @one_inline_mod target = isa = { 
%c200 = arith.constant 200 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %c10, %c200 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %c10, %c200 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -51,7 +53,7 @@ amdgcn.module @one_inline_mod target = isa = { // CHECK-LABEL: kernel @both_inline_select // CHECK-NOT: sop1 s_mov_b32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 amdgcn.module @both_inline_mod target = isa = { amdgcn.kernel @both_inline_select { %c0 = arith.constant 0 : i32 @@ -59,8 +61,9 @@ amdgcn.module @both_inline_mod target = isa = { %c20 = arith.constant 20 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %c10, %c20 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %c10, %c20 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -74,7 +77,7 @@ amdgcn.module @both_inline_mod target = isa = { // CHECK-LABEL: kernel @non_constant_select // CHECK: %[[A:.*]] = sop1 s_mov_b32 // CHECK: %[[B:.*]] = sop1 s_mov_b32 -// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[A]], %[[B]] : !amdgcn.sgpr, i1, !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: lsir.select %{{.*}}, %{{.*}}, %[[A]], %[[B]] : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr amdgcn.module @non_constant_mod target = isa = { amdgcn.kernel @non_constant_select { %c0 = arith.constant 0 : i32 @@ -83,8 +86,9 @@ amdgcn.module @non_constant_mod target = isa = { %s2 = alloca : !amdgcn.sgpr %a = sop1 s_mov_b32 outs %s1 ins %c0 : !amdgcn.sgpr, i32 %b = sop1 s_mov_b32 outs %s2 ins %c0 : !amdgcn.sgpr, i32 - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s0, %cmp, %a, %b : !amdgcn.sgpr, i1, !amdgcn.sgpr, !amdgcn.sgpr + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s0, %cmp, %a, %b : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -97,7 +101,7 @@ amdgcn.module @non_constant_mod target = isa = { // CHECK-LABEL: kernel @boundary_inline_select // CHECK-NOT: sop1 s_mov_b32 -// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 amdgcn.module @boundary_inline_mod target = isa = { amdgcn.kernel @boundary_inline_select { %c0 = arith.constant 0 : i32 @@ -105,8 +109,9 @@ amdgcn.module @boundary_inline_mod target = isa = { %c64 = arith.constant 64 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %cn16, %c64 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %cn16, %c64 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } @@ -121,7 +126,7 @@ amdgcn.module @boundary_inline_mod target = isa = { // CHECK: 
lsir.cmpi // CHECK: %[[OUT:.*]] = alloca : !amdgcn.sgpr // CHECK: sop1 s_mov_b32 outs %[[OUT]] -// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, i1, !amdgcn.sgpr, i32 +// CHECK: lsir.select {{.*}} : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, i32 amdgcn.module @boundary_non_inline_mod target = isa = { amdgcn.kernel @boundary_non_inline_select { %c0 = arith.constant 0 : i32 @@ -129,8 +134,9 @@ amdgcn.module @boundary_non_inline_mod target = isa = { %c65 = arith.constant 65 : i32 %s0 = alloca : !amdgcn.sgpr %s1 = alloca : !amdgcn.sgpr - %cmp = lsir.cmpi i32 eq %s0, %c0 : !amdgcn.sgpr, i32 - %sel = lsir.select %s1, %cmp, %cn17, %c65 : !amdgcn.sgpr, i1, i32, i32 + %scc = lsir.alloca : !amdgcn.scc + %cmp = lsir.cmpi i32 eq %scc, %s0, %c0 : !amdgcn.scc, !amdgcn.sgpr, i32 + %sel = lsir.select %s1, %cmp, %cn17, %c65 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 test_inst ins %sel : (!amdgcn.sgpr) -> () end_kernel } diff --git a/test/Dialect/AMDGCN/cmp-ops.mlir b/test/Dialect/AMDGCN/cmp-ops.mlir index 1c4b6a249..1c0d97a9a 100644 --- a/test/Dialect/AMDGCN/cmp-ops.mlir +++ b/test/Dialect/AMDGCN/cmp-ops.mlir @@ -1,66 +1,66 @@ // RUN: aster-opt %s --verify-roundtrip -func.func @cmpi(%scc: !amdgcn.scc, %vcc: !amdgcn.vcc, %a: i32, %b: i32, +func.func @cmpi(%scc: !amdgcn.scc<0>, %vcc: !amdgcn.vcc<0>, %a: i32, %b: i32, %v1: !amdgcn.vgpr, %dst: !amdgcn.sgpr<[? + 2]>, %dstAlloc: !amdgcn.sgpr<[0 : 2]>) { - amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %a, %b : outs(!amdgcn.scc) ins(i32, i32) - amdgcn.cmpi v_cmp_eq_i32 outs %vcc ins %a, %v1 : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr) + amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %a, %b : outs(!amdgcn.scc<0>) ins(i32, i32) + amdgcn.cmpi v_cmp_eq_i32 outs %vcc ins %a, %v1 : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr) %0 = amdgcn.cmpi v_cmp_eq_i32_e64 outs %dst ins %a, %v1 : dps(!amdgcn.sgpr<[? 
+ 2]>) ins(i32, !amdgcn.vgpr) amdgcn.cmpi v_cmp_eq_i32_e64 outs %dstAlloc ins %a, %v1 : outs(!amdgcn.sgpr<[0 : 2]>) ins(i32, !amdgcn.vgpr) return } -func.func @sopc_signed_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc) { +func.func @sopc_signed_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc<0>) { // s_cmp_eq_i32 - SOPC compare equal (signed 32-bit) amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lg_i32 - SOPC compare not equal (signed 32-bit) amdgcn.cmpi s_cmp_lg_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_gt_i32 - SOPC compare greater than (signed 32-bit) amdgcn.cmpi s_cmp_gt_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_ge_i32 - SOPC compare greater than or equal (signed 32-bit) amdgcn.cmpi s_cmp_ge_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lt_i32 - SOPC compare less than (signed 32-bit) amdgcn.cmpi s_cmp_lt_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_le_i32 - SOPC compare less than or equal (signed 32-bit) amdgcn.cmpi s_cmp_le_i32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) return } -func.func @sopc_unsigned_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc) { +func.func @sopc_unsigned_comparisons(%src0: !amdgcn.sgpr, %src1: !amdgcn.sgpr, %scc: !amdgcn.scc<0>) { // s_cmp_eq_u32 - SOPC compare equal (unsigned 32-bit) amdgcn.cmpi s_cmp_eq_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lg_u32 - SOPC compare not equal (unsigned 32-bit) amdgcn.cmpi s_cmp_lg_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_gt_u32 - SOPC compare greater than (unsigned 32-bit) amdgcn.cmpi s_cmp_gt_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_ge_u32 - SOPC compare greater than or equal (unsigned 32-bit) amdgcn.cmpi s_cmp_ge_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_lt_u32 - SOPC compare less than (unsigned 32-bit) amdgcn.cmpi s_cmp_lt_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) // s_cmp_le_u32 - SOPC compare less than or equal (unsigned 32-bit) amdgcn.cmpi s_cmp_le_u32 outs %scc ins %src0, %src1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr, !amdgcn.sgpr) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr, !amdgcn.sgpr) return } diff --git a/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir b/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir index 440ae7276..d9e4bf987 100644 --- a/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir +++ b/test/Dialect/LSIR/CodeGen/arith-minmax-codegen.mlir @@ -7,7 +7,8 @@ 
module attributes {dlti.dl_spec = #dlti.dl_spec< // CHECK-LABEL: func.func @test_minui( // CHECK-SAME: %[[A:.*]]: !amdgcn.vgpr, %[[B:.*]]: !amdgcn.vgpr) -> !amdgcn.vgpr // CHECK: %[[DST:.*]] = lsir.alloca : !amdgcn.vgpr -// CHECK: %[[CMP:.*]] = lsir.cmpi i32 ult %[[A]], %[[B]] +// CHECK: %[[CMP_DST:.*]] = lsir.alloca : !amdgcn.vcc +// CHECK: %[[CMP:.*]] = lsir.cmpi i32 ult %[[CMP_DST]], %[[A]], %[[B]] // CHECK: lsir.select %[[DST]], %[[CMP]], %[[A]], %[[B]] func.func @test_minui(%a: i32, %b: i32) -> i32 attributes {abi = (!amdgcn.vgpr, !amdgcn.vgpr) -> !amdgcn.vgpr} { @@ -28,7 +29,8 @@ func.func @test_maxui(%a: i32, %b: i32) -> i32 // CHECK-LABEL: func.func @test_minsi( // CHECK-SAME: %[[A:.*]]: !amdgcn.vgpr, %[[B:.*]]: !amdgcn.vgpr) -> !amdgcn.vgpr // CHECK: %[[DST:.*]] = lsir.alloca : !amdgcn.vgpr -// CHECK: %[[CMP:.*]] = lsir.cmpi i32 slt %[[A]], %[[B]] +// CHECK: %[[CMP_DST:.*]] = lsir.alloca : !amdgcn.vcc +// CHECK: %[[CMP:.*]] = lsir.cmpi i32 slt %[[CMP_DST]], %[[A]], %[[B]] // CHECK: lsir.select %[[DST]], %[[CMP]], %[[A]], %[[B]] func.func @test_minsi(%a: i32, %b: i32) -> i32 attributes {abi = (!amdgcn.vgpr, !amdgcn.vgpr) -> !amdgcn.vgpr} { diff --git a/test/Dialect/LSIR/Transforms/codegen-cf.mlir b/test/Dialect/LSIR/Transforms/codegen-cf.mlir index 2b5fe1200..df7eaaf39 100644 --- a/test/Dialect/LSIR/Transforms/codegen-cf.mlir +++ b/test/Dialect/LSIR/Transforms/codegen-cf.mlir @@ -15,16 +15,18 @@ // CHECK: %[[LOAD:.*]] = load_arg 1 // CHECK: amdgcn.sopp.s_waitcnt // CHECK: split_register_range -// CHECK: %[[CMP_INIT:.*]] = lsir.cmpi i32 sgt %{{.*}}, %[[C0]] : !amdgcn.sgpr, i32 +// CHECK: %[[CMP_DST_INIT:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_INIT:.*]] = lsir.cmpi i32 sgt %[[CMP_DST_INIT]], %{{.*}}, %[[C0]] : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: %[[ALLOCA_INIT:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[MOV_INIT:.*]] = lsir.mov %[[ALLOCA_INIT]], %[[C0]] -// CHECK: cf.cond_br %[[CMP_INIT]], ^bb1(%[[MOV_INIT]] : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %[[CMP_INIT]] : !amdgcn.scc, ^bb1(%[[MOV_INIT]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb1(%[[LOOP_ARG:.*]]: !amdgcn.sgpr): // CHECK: test_inst ins %[[LOOP_ARG]] // CHECK: %[[ALLOCA_LOOP:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[LOOP_ADDI:.*]] = lsir.addi i32 %[[ALLOCA_LOOP]], %[[LOOP_ARG]], %[[C1]] -// CHECK: %[[CMP_LOOP:.*]] = lsir.cmpi i32 slt %[[LOOP_ADDI]], %{{.*}} : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.cond_br %[[CMP_LOOP]], ^bb1(%[[LOOP_ADDI]] : !amdgcn.sgpr), ^bb2 +// CHECK: %[[CMP_DST_LOOP:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_LOOP:.*]] = lsir.cmpi i32 slt %[[CMP_DST_LOOP]], %[[LOOP_ADDI]], %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: lsir.cond_br %[[CMP_LOOP]] : !amdgcn.scc, ^bb1(%[[LOOP_ADDI]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb2: // CHECK: end_kernel @@ -63,10 +65,11 @@ amdgcn.module @test_uniform_loop target = isa = { // CHECK: alloca // CHECK: %[[LOAD_RESULT:.*]], %{{.*}} = load s_load_dword // CHECK: amdgcn.sopp.s_waitcnt -// CHECK: %[[CMP_INIT2:.*]] = lsir.cmpi i32 sgt %[[LOAD_RESULT]], %[[C0]] : !amdgcn.sgpr, i32 +// CHECK: %[[CMP_DST_INIT2:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_INIT2:.*]] = lsir.cmpi i32 sgt %[[CMP_DST_INIT2]], %[[LOAD_RESULT]], %[[C0]] : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: %[[ALLOCA_INIT2:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[MOV_INIT2:.*]] = lsir.mov %[[ALLOCA_INIT2]], %[[C0]] -// CHECK: cf.cond_br %[[CMP_INIT2]], ^bb1(%[[MOV_INIT2]] : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %[[CMP_INIT2]] : !amdgcn.scc, 
^bb1(%[[MOV_INIT2]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb1(%[[LOOP_ARG2:.*]]: !amdgcn.sgpr): // CHECK: %[[ALLOCA_SHLI:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[LOOP_SHLI:.*]] = lsir.shli i32 %[[ALLOCA_SHLI]], %[[LOOP_ARG2]], %[[C2]] @@ -75,8 +78,9 @@ amdgcn.module @test_uniform_loop target = isa = { // CHECK: store global_store_dword // CHECK: %[[ALLOCA_ADDI:.*]] = lsir.alloca : !amdgcn.sgpr // CHECK: %[[LOOP_ADDI2:.*]] = lsir.addi i32 %[[ALLOCA_ADDI]], %[[LOOP_ARG2]], %[[C1]] -// CHECK: %[[CMP_LOOP2:.*]] = lsir.cmpi i32 slt %[[LOOP_ADDI2]], %[[LOAD_RESULT]] : !amdgcn.sgpr, !amdgcn.sgpr -// CHECK: cf.cond_br %[[CMP_LOOP2]], ^bb1(%[[LOOP_ADDI2]] : !amdgcn.sgpr), ^bb2 +// CHECK: %[[CMP_DST_LOOP2:.*]] = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP_LOOP2:.*]] = lsir.cmpi i32 slt %[[CMP_DST_LOOP2]], %[[LOOP_ADDI2]], %[[LOAD_RESULT]] : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr +// CHECK: lsir.cond_br %[[CMP_LOOP2]] : !amdgcn.scc, ^bb1(%[[LOOP_ADDI2]] : !amdgcn.sgpr), ^bb2 // CHECK: ^bb2: // CHECK: end_kernel @@ -111,18 +115,19 @@ amdgcn.module @test_uniform_loop_with_load target = isa = { // ----- //===----------------------------------------------------------------------===// -// Test arith.cmpi + arith.select -> lsir.cmpi + lsir.select(i1) +// Test arith.cmpi + arith.select -> lsir.cmpi + lsir.select // Verifies that: -// 1. arith.cmpi is converted to lsir.cmpi returning i1 -// 2. arith.select with i1 condition is converted to lsir.select with i1 -// 3. No unrealized_conversion_cast is inserted for the i1 condition +// 1. arith.cmpi is converted to lsir.cmpi (DPS, returns SCC/VCC register) +// 2. arith.select with cmpi condition is converted to lsir.select with SCC +// 3. No unrealized_conversion_cast is inserted for the condition //===----------------------------------------------------------------------===// // CHECK-LABEL: amdgcn.module @test_select_i1 // CHECK: kernel @test_select_i1 -// CHECK: %[[CMP:.*]] = lsir.cmpi i32 eq %{{.*}}, %{{.*}} : !amdgcn.sgpr, i32 +// CHECK: %{{.*}} = lsir.alloca : !amdgcn.scc +// CHECK: %[[CMP:.*]] = lsir.cmpi i32 eq %{{.*}}, %{{.*}}, %{{.*}} : !amdgcn.scc, !amdgcn.sgpr, i32 // CHECK: %[[ALLOCA:.*]] = lsir.alloca : !amdgcn.sgpr -// CHECK: lsir.select %[[ALLOCA]], %[[CMP]], %{{.*}}, %{{.*}} : !amdgcn.sgpr, i1, i32, i32 +// CHECK: lsir.select %[[ALLOCA]], %[[CMP]], %{{.*}}, %{{.*}} : !amdgcn.sgpr, !amdgcn.scc, i32, i32 // CHECK-NOT: unrealized_conversion_cast amdgcn.module @test_select_i1 target = isa = { @@ -148,10 +153,11 @@ amdgcn.module @test_select_i1 target = isa = { // CHECK-LABEL: func.func @test_token_in_args( // CHECK-SAME: %[[ARG0:.*]]: !amdgcn.vgpr, %[[ARG1:.*]]: !amdgcn.read_token) { // CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 -// CHECK: %[[CMPI_0:.*]] = lsir.cmpi i32 sgt %[[ARG0]], %[[CONSTANT_0]] : !amdgcn.vgpr, i32 -// CHECK: cf.cond_br %[[CMPI_0]], ^bb1(%[[ARG1]] : !amdgcn.read_token), ^bb2 +// CHECK: %{{.*}} = lsir.alloca : !amdgcn.vcc +// CHECK: %[[CMPI_0:.*]] = lsir.cmpi i32 sgt %{{.*}}, %[[ARG0]], %[[CONSTANT_0]] : !amdgcn.vcc, !amdgcn.vgpr, i32 +// CHECK: lsir.cond_br %[[CMPI_0]] : !amdgcn.vcc, ^bb1(%[[ARG1]] : !amdgcn.read_token), ^bb2 // CHECK: ^bb1(%[[VAL_0:.*]]: !amdgcn.read_token): -// CHECK: cf.cond_br %[[CMPI_0]], ^bb1(%[[VAL_0]] : !amdgcn.read_token), ^bb2 +// CHECK: lsir.cond_br %[[CMPI_0]] : !amdgcn.vcc, ^bb1(%[[VAL_0]] : !amdgcn.read_token), ^bb2 // CHECK: ^bb2: // CHECK: return // CHECK: } diff --git a/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir b/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir index 
aafc0a76f..ce937351c 100644 --- a/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir +++ b/test/Dialect/LSIR/Transforms/codegen-func-cf.mlir @@ -2,13 +2,13 @@ // CHECK-LABEL: amdgcn.module @test // CHECK: func.func @loop_func -// CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 // CHECK: ^bb1(%{{.*}}: !amdgcn.sgpr): -// CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 +// CHECK: lsir.cond_br %{{.*}}, ^bb1(%{{.*}} : !amdgcn.sgpr), ^bb2 // CHECK: ^bb2: amdgcn.module @test target = isa = { - func.func @loop_func(%arg0: i32, %n: i32) { + func.func @loop_func(%arg0: i32, %n: i32) attributes {gpu.kernel} { %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 %cmp_init = arith.cmpi slt, %c0, %n : i32 diff --git a/test/Dialect/LSIR/ops.mlir b/test/Dialect/LSIR/ops.mlir index ac93b8234..4be0fc583 100644 --- a/test/Dialect/LSIR/ops.mlir +++ b/test/Dialect/LSIR/ops.mlir @@ -65,54 +65,54 @@ func.func @test_shrui(%dst: !amdgcn.vgpr, %value: !amdgcn.vgpr, %amount: !amdgcn return %0 : !amdgcn.vgpr } -func.func @test_cmpi_eq(%lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> i1 { - %0 = lsir.cmpi i32 eq %lhs, %rhs : !amdgcn.sgpr, !amdgcn.sgpr - return %0 : i1 +func.func @test_cmpi_eq(%dst: !amdgcn.scc, %lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> !amdgcn.scc { + %0 = lsir.cmpi i32 eq %dst, %lhs, %rhs : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %0 : !amdgcn.scc } -func.func @test_cmpi_ne(%lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> i1 { - %0 = lsir.cmpi i32 ne %lhs, %rhs : !amdgcn.sgpr, !amdgcn.sgpr - return %0 : i1 +func.func @test_cmpi_ne(%dst: !amdgcn.scc, %lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) -> !amdgcn.scc { + %0 = lsir.cmpi i32 ne %dst, %lhs, %rhs : !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr + return %0 : !amdgcn.scc } -func.func @test_cmpi_slt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 slt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_slt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 slt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_sle(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 sle %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_sle(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 sle %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_sgt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 sgt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_sgt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 sgt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_sge(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 sge %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_sge(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 sge %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_ult(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 ult %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_ult(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> 
!amdgcn.vcc { + %0 = lsir.cmpi i32 ult %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_ule(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 ule %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_ule(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 ule %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_ugt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 ugt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_ugt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 ugt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpi_uge(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpi i32 uge %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpi_uge(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpi i32 uge %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } func.func @test_extsi(%dst: !amdgcn.sgpr, %value: !amdgcn.sgpr) -> !amdgcn.sgpr { @@ -211,14 +211,14 @@ func.func @test_xori(%dst: !amdgcn.sgpr, %lhs: !amdgcn.sgpr, %rhs: !amdgcn.sgpr) return %0 : !amdgcn.sgpr } -func.func @test_cmpf_olt(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpf f32 olt %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpf_olt(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpf f32 olt %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } -func.func @test_cmpf_oeq(%lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> i1 { - %0 = lsir.cmpf f32 oeq %lhs, %rhs : !amdgcn.vgpr, !amdgcn.vgpr - return %0 : i1 +func.func @test_cmpf_oeq(%dst: !amdgcn.vcc, %lhs: !amdgcn.vgpr, %rhs: !amdgcn.vgpr) -> !amdgcn.vcc { + %0 = lsir.cmpf f32 oeq %dst, %lhs, %rhs : !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr + return %0 : !amdgcn.vcc } func.func @test_extf(%dst: !amdgcn.vgpr, %value: !amdgcn.vgpr) -> !amdgcn.vgpr { @@ -360,14 +360,14 @@ func.func @test_select_reg_condition(%dst: !amdgcn.vgpr, %cond: !amdgcn.vgpr, %t return %0 : !amdgcn.vgpr } -func.func @test_select_i1_condition(%dst: !amdgcn.sgpr, %cond: i1, %tv: !amdgcn.sgpr, %fv: !amdgcn.sgpr) -> !amdgcn.sgpr { - %0 = lsir.select %dst, %cond, %tv, %fv : !amdgcn.sgpr, i1, !amdgcn.sgpr, !amdgcn.sgpr +func.func @test_select_scc_condition(%dst: !amdgcn.sgpr, %cond: !amdgcn.scc, %tv: !amdgcn.sgpr, %fv: !amdgcn.sgpr) -> !amdgcn.sgpr { + %0 = lsir.select %dst, %cond, %tv, %fv : !amdgcn.sgpr, !amdgcn.scc, !amdgcn.sgpr, !amdgcn.sgpr return %0 : !amdgcn.sgpr } -func.func @test_select_i1_imm_operands(%dst: !amdgcn.sgpr, %cond: i1) -> !amdgcn.sgpr { +func.func @test_select_scc_imm_operands(%dst: !amdgcn.sgpr, %cond: !amdgcn.scc) -> !amdgcn.sgpr { %c42 = arith.constant 42 : i32 %c99 = arith.constant 99 : i32 - %0 = lsir.select %dst, %cond, %c42, %c99 : !amdgcn.sgpr, i1, i32, i32 + %0 = lsir.select %dst, %cond, %c42, %c99 : !amdgcn.sgpr, !amdgcn.scc, i32, i32 return %0 : !amdgcn.sgpr } diff --git a/test/Target/ASM/cbranch.mlir b/test/Target/ASM/cbranch.mlir index 9ec8d81e5..af7bfa659 100644 --- a/test/Target/ASM/cbranch.mlir +++ b/test/Target/ASM/cbranch.mlir @@ -31,11 +31,11 @@ amdgcn.module @mod target = #amdgcn.target isa = 
#amdgcn.isa { ^entry: %s2 = amdgcn.alloca : !amdgcn.sgpr<2> %s3 = amdgcn.alloca : !amdgcn.sgpr<3> - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> amdgcn.cmpi s_cmp_gt_u32 outs %scc ins %s2, %s3 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<2>, !amdgcn.sgpr<3>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<2>, !amdgcn.sgpr<3>) amdgcn.cbranch s_cbranch_scc1 %scc ^loop fallthrough (^exit) - : !amdgcn.scc + : !amdgcn.scc<0> ^exit: amdgcn.end_kernel ^loop: @@ -46,11 +46,11 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^entry: %s0 = amdgcn.alloca : !amdgcn.sgpr<0> %s1 = amdgcn.alloca : !amdgcn.sgpr<1> - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> amdgcn.cmpi s_cmp_eq_i32 outs %scc ins %s0, %s1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) amdgcn.cbranch s_cbranch_scc0 %scc ^true_path fallthrough (^false_path) - : !amdgcn.scc + : !amdgcn.scc<0> ^false_path: amdgcn.end_kernel ^true_path: diff --git a/test/Target/ASM/g2s-load-lds.mlir b/test/Target/ASM/g2s-load-lds.mlir index 01a0586af..992adee29 100644 --- a/test/Target/ASM/g2s-load-lds.mlir +++ b/test/Target/ASM/g2s-load-lds.mlir @@ -19,9 +19,9 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_g2s_dword { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c0 = arith.constant 0 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0<0>, i32 // Buffer descriptor (s[0:3]) and scalar offset (s4) %s0 = amdgcn.alloca : !amdgcn.sgpr<0> @@ -35,7 +35,7 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa %tok = amdgcn.load_lds buffer_load_dword_lds m0 %m0 addr %rsrc offset u(%soff) + d(%voff) + c(%c0) - : ins(!amdgcn.m0, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) + : ins(!amdgcn.m0<0>, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) -> !amdgcn.write_token amdgcn.sopp.s_waitcnt #amdgcn.inst vmcnt = 0 amdgcn.end_kernel @@ -43,10 +43,10 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_g2s_dwordx4 { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c0 = arith.constant 0 : i32 %c64 = arith.constant 64 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c0 : !amdgcn.m0<0>, i32 %s0 = amdgcn.alloca : !amdgcn.sgpr<0> %s1 = amdgcn.alloca : !amdgcn.sgpr<1> @@ -59,7 +59,7 @@ amdgcn.module @g2s_mod target = #amdgcn.target isa = #amdgcn.isa %tok = amdgcn.load_lds buffer_load_dwordx4_lds m0 %m0 addr %rsrc offset u(%soff) + d(%voff) + c(%c64) - : ins(!amdgcn.m0, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) + : ins(!amdgcn.m0<0>, !amdgcn.sgpr<[0 : 4]>, !amdgcn.sgpr<4>, !amdgcn.vgpr<0>, i32) -> !amdgcn.write_token amdgcn.sopp.s_waitcnt #amdgcn.inst vmcnt = 0 amdgcn.end_kernel diff --git a/test/Target/ASM/loops.mlir b/test/Target/ASM/loops.mlir index e1747f8d1..8f5af5bea 100644 --- a/test/Target/ASM/loops.mlir +++ b/test/Target/ASM/loops.mlir @@ -44,17 +44,17 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^entry: %c5 = arith.constant 5 : i32 %c4 = arith.constant 4 : i32 - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> %s0 = amdgcn.alloca : !amdgcn.sgpr<0> %s1 = amdgcn.alloca : !amdgcn.sgpr<1> amdgcn.sop1 s_mov_b32 outs %s0 ins %c5 : !amdgcn.sgpr<0>, i32 amdgcn.sop1 
s_mov_b32 outs %s1 ins %c4 : !amdgcn.sgpr<1>, i32 amdgcn.cmpi s_cmp_le_i32 outs %scc ins %s0, %s1 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<0>, !amdgcn.sgpr<1>) amdgcn.cbranch s_cbranch_scc1 %scc ^then fallthrough (^else) - : !amdgcn.scc + : !amdgcn.scc<0> ^else: amdgcn.end_kernel ^then: @@ -68,7 +68,7 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^entry: %c10 = arith.constant 10 : i32 %c9 = arith.constant 9 : i32 - %scc = amdgcn.alloca : !amdgcn.scc + %scc = amdgcn.alloca : !amdgcn.scc<0> %s2 = amdgcn.alloca : !amdgcn.sgpr<2> %s3 = amdgcn.alloca : !amdgcn.sgpr<3> @@ -78,9 +78,9 @@ amdgcn.module @mod target = #amdgcn.target isa = #amdgcn.isa { ^loop_header: amdgcn.cmpi s_cmp_lt_i32 outs %scc ins %s3, %s2 - : outs(!amdgcn.scc) ins(!amdgcn.sgpr<3>, !amdgcn.sgpr<2>) + : outs(!amdgcn.scc<0>) ins(!amdgcn.sgpr<3>, !amdgcn.sgpr<2>) amdgcn.cbranch s_cbranch_scc0 %scc ^exit fallthrough (^loop_body) - : !amdgcn.scc + : !amdgcn.scc<0> ^loop_body: amdgcn.branch s_branch ^loop_header ^exit: diff --git a/test/Target/ASM/s-mov-m0.mlir b/test/Target/ASM/s-mov-m0.mlir index 0fbc86ef5..b6a2b045c 100644 --- a/test/Target/ASM/s-mov-m0.mlir +++ b/test/Target/ASM/s-mov-m0.mlir @@ -16,17 +16,17 @@ amdgcn.module @m0_mod target = #amdgcn.target isa = #amdgcn.isa { amdgcn.kernel @test_s_mov_m0_imm { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c1024 = arith.constant 1024 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c1024 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c1024 : !amdgcn.m0<0>, i32 amdgcn.end_kernel } amdgcn.kernel @test_s_mov_m0_sgpr { ^entry: - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %s0 = amdgcn.alloca : !amdgcn.sgpr<0> - amdgcn.sop1 s_mov_b32 outs %m0 ins %s0 : !amdgcn.m0, !amdgcn.sgpr<0> + amdgcn.sop1 s_mov_b32 outs %m0 ins %s0 : !amdgcn.m0<0>, !amdgcn.sgpr<0> amdgcn.end_kernel } } diff --git a/test/Target/ASM/vopc-branch.mlir b/test/Target/ASM/vopc-branch.mlir index 233e53aa2..70b9b56e9 100644 --- a/test/Target/ASM/vopc-branch.mlir +++ b/test/Target/ASM/vopc-branch.mlir @@ -33,12 +33,12 @@ amdgcn.module @vopc_branch_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_vcmp_lt_i32_vccnz { ^entry: %v0 = amdgcn.alloca : !amdgcn.vgpr<0> - %vcc = amdgcn.alloca : !amdgcn.vcc + %vcc = amdgcn.alloca : !amdgcn.vcc<0> %c0 = arith.constant 0 : i32 amdgcn.cmpi v_cmp_lt_i32 outs %vcc ins %c0, %v0 - : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) + : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr<0>) amdgcn.cbranch s_cbranch_vccz %vcc ^taken fallthrough(^fallthru) - : !amdgcn.vcc + : !amdgcn.vcc<0> ^fallthru: amdgcn.end_kernel ^taken: @@ -50,11 +50,11 @@ amdgcn.module @vopc_branch_mod target = #amdgcn.target isa = #amdgcn.isa ^entry: %v0 = amdgcn.alloca : !amdgcn.vgpr<0> %v1 = amdgcn.alloca : !amdgcn.vgpr<1> - %vcc = amdgcn.alloca : !amdgcn.vcc + %vcc = amdgcn.alloca : !amdgcn.vcc<0> amdgcn.cmpi v_cmp_eq_i32 outs %vcc ins %v0, %v1 - : outs(!amdgcn.vcc) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) + : outs(!amdgcn.vcc<0>) ins(!amdgcn.vgpr<0>, !amdgcn.vgpr<1>) amdgcn.cbranch s_cbranch_vccz %vcc ^taken fallthrough(^fallthru) - : !amdgcn.vcc + : !amdgcn.vcc<0> ^fallthru: amdgcn.end_kernel ^taken: @@ -65,12 +65,12 @@ amdgcn.module @vopc_branch_mod target = #amdgcn.target isa = #amdgcn.isa amdgcn.kernel @test_vcmp_gt_swap { ^entry: %v0 = amdgcn.alloca : !amdgcn.vgpr<0> - %vcc = amdgcn.alloca : !amdgcn.vcc + %vcc = amdgcn.alloca : !amdgcn.vcc<0> %c32 = 
arith.constant 32 : i32 amdgcn.cmpi v_cmp_gt_i32 outs %vcc ins %c32, %v0 - : outs(!amdgcn.vcc) ins(i32, !amdgcn.vgpr<0>) + : outs(!amdgcn.vcc<0>) ins(i32, !amdgcn.vgpr<0>) amdgcn.cbranch s_cbranch_vccz %vcc ^taken fallthrough(^fallthru) - : !amdgcn.vcc + : !amdgcn.vcc<0> ^fallthru: amdgcn.end_kernel ^taken: diff --git a/test/integration/g2s-load-lds-e2e.mlir b/test/integration/g2s-load-lds-e2e.mlir index 843665cbe..3728e9124 100644 --- a/test/integration/g2s-load-lds-e2e.mlir +++ b/test/integration/g2s-load-lds-e2e.mlir @@ -102,9 +102,9 @@ amdgcn.module @g2s_e2e_mod target = #amdgcn.target isa = #amdgcn.isa %c44 = arith.constant 44 : i32 - amdgcn.sop1 s_mov_b32 outs %m0 ins %c44 : !amdgcn.m0, i32 + amdgcn.sop1 s_mov_b32 outs %m0 ins %c44 : !amdgcn.m0<0>, i32 // 1 NOP required after SALU writes M0 before G2S (CDNA4 hazard) amdgcn.sopp.sopp #amdgcn.inst , imm = 10 @@ -115,7 +115,7 @@ amdgcn.module @g2s_e2e_mod target = #amdgcn.target isa = #amdgcn.isa LDS[44 + tid*4] %tok_g2s = amdgcn.load_lds buffer_load_dword_lds m0 %m0 addr %src_rsrc offset u(%soffset) + d(%voffset) + c(%c0) - : ins(!amdgcn.m0, !amdgcn.sgpr<[? + 4]>, !amdgcn.sgpr, !amdgcn.vgpr, i32) + : ins(!amdgcn.m0<0>, !amdgcn.sgpr<[? + 4]>, !amdgcn.sgpr, !amdgcn.vgpr, i32) -> !amdgcn.write_token // Wait for G2S to complete (vmcnt tracks buffer loads) diff --git a/test/integration/sreg-roundtrip-e2e.mlir b/test/integration/sreg-roundtrip-e2e.mlir index 25e509d29..e18ac562b 100644 --- a/test/integration/sreg-roundtrip-e2e.mlir +++ b/test/integration/sreg-roundtrip-e2e.mlir @@ -31,15 +31,15 @@ amdgcn.module @m0_roundtrip_mod target = #amdgcn.target isa = #amdgcn.is // Write constant 42 to M0 via s_mov_b32 // M0 is pre-allocated (fixed physical register), so write has no SSA result. - %m0 = amdgcn.alloca : !amdgcn.m0 + %m0 = amdgcn.alloca : !amdgcn.m0<0> %c42 = arith.constant 42 : i32 amdgcn.sop1 s_mov_b32 outs %m0 ins %c42 - : !amdgcn.m0, i32 + : !amdgcn.m0<0>, i32 // Read M0 back into an SGPR via s_mov_b32 %s_dest = amdgcn.alloca : !amdgcn.sgpr %s_val = amdgcn.sop1 s_mov_b32 outs %s_dest ins %m0 - : !amdgcn.sgpr, !amdgcn.m0 + : !amdgcn.sgpr, !amdgcn.m0<0> // Broadcast scalar to all VGPR lanes via v_mov_b32_e32 %v_dest = amdgcn.alloca : !amdgcn.vgpr diff --git a/test/integration/vopc-branch-e2e.mlir b/test/integration/vopc-branch-e2e.mlir index 50afb2395..a44baa9f9 100644 --- a/test/integration/vopc-branch-e2e.mlir +++ b/test/integration/vopc-branch-e2e.mlir @@ -43,9 +43,10 @@ amdgcn.module @vopc_select_mod target = #amdgcn.target isa = #amdgcn.isa %v_false = amdgcn.vop1.vop1 #amdgcn.inst %v_false_alloc, %c99 : (!amdgcn.vgpr, i32) -> !amdgcn.vgpr %v_out_alloc = amdgcn.alloca : !amdgcn.vgpr - %cmp = lsir.cmpi i32 slt %tid, %c100 : !amdgcn.vgpr, i32 + %vcc0 = lsir.alloca : !amdgcn.vcc + %cmp = lsir.cmpi i32 slt %vcc0, %tid, %c100 : !amdgcn.vcc, !amdgcn.vgpr, i32 %selected = lsir.select %v_out_alloc, %cmp, %v_true, %v_false - : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + : !amdgcn.vgpr, !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr %tok = amdgcn.store global_store_dword data %selected addr %out_ptr offset d(%voffset) + c(%c0) : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? 
+ 2]>, !amdgcn.vgpr, i32) @@ -74,9 +75,10 @@ amdgcn.module @vopc_select_mod target = #amdgcn.target isa = #amdgcn.isa %v_false = amdgcn.vop1.vop1 #amdgcn.inst %v_false_alloc, %c99 : (!amdgcn.vgpr, i32) -> !amdgcn.vgpr %v_out_alloc = amdgcn.alloca : !amdgcn.vgpr - %cmp = lsir.cmpi i32 slt %tid, %c0 : !amdgcn.vgpr, i32 + %vcc1 = lsir.alloca : !amdgcn.vcc + %cmp = lsir.cmpi i32 slt %vcc1, %tid, %c0 : !amdgcn.vcc, !amdgcn.vgpr, i32 %selected = lsir.select %v_out_alloc, %cmp, %v_true, %v_false - : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + : !amdgcn.vgpr, !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr %tok = amdgcn.store global_store_dword data %selected addr %out_ptr offset d(%voffset) + c(%c0) : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? + 2]>, !amdgcn.vgpr, i32) @@ -107,9 +109,10 @@ amdgcn.module @vopc_select_mod target = #amdgcn.target isa = #amdgcn.isa %v_false = amdgcn.vop1.vop1 #amdgcn.inst %v_false_alloc, %c99 : (!amdgcn.vgpr, i32) -> !amdgcn.vgpr %v_out_alloc = amdgcn.alloca : !amdgcn.vgpr - %cmp = lsir.cmpi i32 slt %tid, %c32 : !amdgcn.vgpr, i32 + %vcc2 = lsir.alloca : !amdgcn.vcc + %cmp = lsir.cmpi i32 slt %vcc2, %tid, %c32 : !amdgcn.vcc, !amdgcn.vgpr, i32 %selected = lsir.select %v_out_alloc, %cmp, %v_true, %v_false - : !amdgcn.vgpr, i1, !amdgcn.vgpr, !amdgcn.vgpr + : !amdgcn.vgpr, !amdgcn.vcc, !amdgcn.vgpr, !amdgcn.vgpr %tok = amdgcn.store global_store_dword data %selected addr %out_ptr offset d(%voffset) + c(%c0) : ins(!amdgcn.vgpr, !amdgcn.sgpr<[? + 2]>, !amdgcn.vgpr, i32)