diff --git a/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp b/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp
index eb230983c8a7a..3d1fc88e7233a 100644
--- a/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/StaticInitializationCycleCheck.cpp
@@ -193,7 +193,7 @@ class VarUseCollector : public DynamicRecursiveASTVisitor {
   }
   bool TraverseAttr(Attr *At) override { return true; }
   bool TraverseDecl(Decl *D) override {
-    if (DC && DC->containsDecl(D))
+    if (D && DC && DC->containsDecl(D))
       return DynamicRecursiveASTVisitor::TraverseDecl(D);
     return true;
   }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp
index 2e5af81b6af8c..c17a87758e243 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/static-initialization-cycle.cpp
@@ -120,6 +120,14 @@ int f1() {
 int S::A = f1();
 }
 
+namespace catch_all_handler {
+void f() {
+  try {
+  } catch (...) {
+  }
+}
+} // catch_all_handler
+
 namespace recursive_calls {
 int f2();
 int f1() {
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f5660f9670eae..0feecef6bbd4f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -506,6 +506,9 @@ Improvements to Clang's diagnostics
 
 - Added ``-Wattribute-alias`` to diagnose type mismatches between an alias and its aliased function. (#GH195550)
 
+- Added warnings for floating-point exception function calls (fenv.h) without enabling
+  floating-point exception behavior via the appropriate flags or pragmas. (#GH128239)
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index b2fd522e6865c..d0a402e59ff60 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -498,6 +498,12 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// The type for the C ucontext_t type.
   TypeDecl *ucontext_tDecl = nullptr;
 
+  /// The type for the C fexcept_t type.
+  TypeDecl *fexcept_tDecl = nullptr;
+
+  /// The type for the C fenv_t type.
+  TypeDecl *fenv_tDecl = nullptr;
+
   /// Type for the Block descriptor for Blocks CodeGen.
   ///
   /// Since this is only used for generation of debug info, it is not
@@ -2350,6 +2356,30 @@ class ASTContext : public RefCountedBase<ASTContext> {
     return QualType();
   }
 
+  /// Set the type for the C fexcept_t type.
+  void setfexcept_tDecl(TypeDecl *fexcept_tDecl) {
+    this->fexcept_tDecl = fexcept_tDecl;
+  }
+
+  /// Retrieve the C fexcept_t type.
+  QualType getfexcept_tType() const {
+    if (fexcept_tDecl)
+      return getTypeDeclType(ElaboratedTypeKeyword::None,
+                             /*Qualifier=*/std::nullopt, fexcept_tDecl);
+    return QualType();
+  }
+
+  /// Set the type for the C fenv_t type.
+  void setfenv_tDecl(TypeDecl *fenv_tDecl) { this->fenv_tDecl = fenv_tDecl; }
+
+  /// Retrieve the C fenv_t type.
+  QualType getfenv_tType() const {
+    if (fenv_tDecl)
+      return getTypeDeclType(ElaboratedTypeKeyword::None,
+                             /*Qualifier=*/std::nullopt, fenv_tDecl);
+    return QualType();
+  }
+
   /// The result type of logical operations, '<', '>', '!=', etc.
   CanQualType getLogicalOperationType() const {
     return getLangOpts().CPlusPlus ? BoolTy : IntTy;
@@ -2630,7 +2660,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
     GE_Missing_setjmp,
 
     /// Missing a type from <ucontext.h>
-    GE_Missing_ucontext
+    GE_Missing_ucontext,
+
+    /// Missing a type from <fenv.h>
+    GE_Missing_fenv
   };
 
   QualType DecodeTypeStr(const char *&Str, const ASTContext &Context,
diff --git a/clang/include/clang/Basic/BuiltinHeaders.def b/clang/include/clang/Basic/BuiltinHeaders.def
index 23889a22769ed..f77665a2e8975 100644
--- a/clang/include/clang/Basic/BuiltinHeaders.def
+++ b/clang/include/clang/Basic/BuiltinHeaders.def
@@ -17,6 +17,7 @@ HEADER(BLOCKS_H, "Blocks.h")
 HEADER(COMPLEX_H, "complex.h")
 HEADER(CTYPE_H, "ctype.h")
 HEADER(EMMINTRIN_H, "emmintrin.h")
+HEADER(FENV_H, "fenv.h")
 HEADER(FOUNDATION_NSOBJCRUNTIME_H, "Foundation/NSObjCRuntime.h")
 HEADER(IMMINTRIN_H, "immintrin.h")
 HEADER(INTRIN_H, "intrin.h")
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 40ec94ab75046..7b833487e23a2 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4599,6 +4599,61 @@ def BlockObjectDispose : LibBuiltin<"blocks.h"> {
 }
 // FIXME: Also declare NSConcreteGlobalBlock and NSConcreteStackBlock.
 
+def FeClearExcept : LibBuiltin<"fenv.h"> {
+  let Spellings = ["feclearexcept"];
+  let Prototype = "int(int)";
+}
+
+def FeGetExceptFlag : LibBuiltin<"fenv.h"> {
+  let Spellings = ["fegetexceptflag"];
+  let Prototype = "int(fexcept_t*, int)";
+}
+
+def FeRaiseExcept : LibBuiltin<"fenv.h"> {
+  let Spellings = ["feraiseexcept"];
+  let Prototype = "int(int)";
+}
+
+def FeSetExceptFlag : LibBuiltin<"fenv.h"> {
+  let Spellings = ["fesetexceptflag"];
+  let Prototype = "int(fexcept_t const*, int)";
+}
+
+def FeTestExcept : LibBuiltin<"fenv.h"> {
+  let Spellings = ["fetestexcept"];
+  let Prototype = "int(int)";
+}
+
+def FeGetRound : LibBuiltin<"fenv.h"> {
+  let Spellings = ["fegetround"];
+  let Prototype = "int()";
+}
+
+def FeSetRound : LibBuiltin<"fenv.h"> {
+  let Spellings = ["fesetround"];
+  let Prototype = "int(int)";
+}
+
+def FeGetEnv : LibBuiltin<"fenv.h"> {
+  let Spellings = ["fegetenv"];
+  let Prototype = "int(fenv_t*)";
+}
+
+def FeHoldExcept : LibBuiltin<"fenv.h"> {
+  let Spellings = ["feholdexcept"];
+  let Prototype = "int(fenv_t*)";
+}
+
+def FeSetEnv : LibBuiltin<"fenv.h"> {
+  let Spellings = ["fesetenv"];
+  let Prototype = "int(fenv_t const*)";
+}
+
+def FeUpdateEnv : LibBuiltin<"fenv.h"> {
+  let Spellings = ["feupdateenv"];
+  let Prototype = "int(fenv_t const*)";
+}
+
 def __Addressof : LangBuiltin<"CXX_LANG"> {
   let Spellings = ["__addressof"];
   let Attributes = [FunctionWithoutBuiltinPrefix, NoThrow, Const,
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index d7dd20d6a45e4..c73c116cdc451 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1041,6 +1041,11 @@ def err_ptrauth_indirect_goto_addrlabel_arithmetic : Error<
   "%select{subtraction|addition}0 of address-of-label expressions is not "
   "supported with ptrauth indirect gotos">;
 
+def warn_fe_access_without_fenv_access : Warning<
+  "'%0' used without enabling floating-point exception behavior; use 'pragma STDC "
+  "FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'">,
+  InGroup<DiagGroup<"fenv-access">>;
+
 // __ptrauth qualifier
 def err_ptrauth_qualifier_invalid : Error<
   "%select{return type|parameter type|property}1 may not be qualified with "
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index f07d8ebb75035..4bd21b4112580 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -877,6 +877,8 @@ NOTABLE_IDENTIFIER(FILE)
 NOTABLE_IDENTIFIER(jmp_buf)
 NOTABLE_IDENTIFIER(sigjmp_buf)
 NOTABLE_IDENTIFIER(ucontext_t)
+NOTABLE_IDENTIFIER(fexcept_t)
+NOTABLE_IDENTIFIER(fenv_t)
 NOTABLE_IDENTIFIER(float_t)
 NOTABLE_IDENTIFIER(double_t)
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 5202244cee2a7..1c8a47169bbd3 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -8264,6 +8264,10 @@ class Sema final : public SemaBase {
     return currentEvaluationContext().isUnevaluated();
   }
 
+  bool isPotentiallyEvaluatedContext() const {
+    return currentEvaluationContext().isPotentiallyEvaluated();
+  }
+
   bool isImmediateFunctionContext() const {
     return currentEvaluationContext().isImmediateFunctionContext();
   }
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 3c8f3ba59a07e..4bb287af687b5 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1225,7 +1225,13 @@ enum SpecialTypeIDs {
   SPECIAL_TYPE_OBJC_SEL_REDEFINITION = 6,
 
   /// C ucontext_t typedef type
-  SPECIAL_TYPE_UCONTEXT_T = 7
+  SPECIAL_TYPE_UCONTEXT_T = 7,
+
+  /// C fexcept_t typedef type
+  SPECIAL_TYPE_FEXCEPT_T = 8,
+
+  /// C fenv_t typedef type
+  SPECIAL_TYPE_FENV_T = 9
 };
 
 /// The number of special type IDs.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index bc4771aec77d1..412ba583353d2 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -12849,6 +12849,25 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
   case 'm':
     Type = Context.MFloat8Ty;
     break;
+  case 'T':
+    switch (*Str++) {
+    case 'x': {
+      Type = Context.getfexcept_tType();
+      break;
+    }
+    case 'e': {
+      Type = Context.getfenv_tType();
+      break;
+    }
+    default: {
+      llvm_unreachable("Unexpected target builtin type");
+    }
+    }
+    if (Type.isNull()) {
+      Error = ASTContext::GE_Missing_fenv;
+      return {};
+    }
+    break;
   }
 
   // If there are modifiers and if we're allowed to parse them, go for it.
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index cc834bbee23c4..03091f2ba0cfe 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3928,6 +3928,24 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     if (BuiltinCountedByRef(TheCall))
       return ExprError();
     break;
+
+  case Builtin::BIfeclearexcept:
+  case Builtin::BIfegetexceptflag:
+  case Builtin::BIferaiseexcept:
+  case Builtin::BIfesetexceptflag:
+  case Builtin::BIfetestexcept:
+  case Builtin::BIfegetround:
+  case Builtin::BIfesetround:
+  case Builtin::BIfegetenv:
+  case Builtin::BIfeholdexcept:
+  case Builtin::BIfesetenv:
+  case Builtin::BIfeupdateenv:
+    if (TheCall->getFPFeaturesInEffect(getLangOpts()).getExceptionMode() ==
+            LangOptions::FPE_Ignore &&
+        isPotentiallyEvaluatedContext()) {
+      Diag(TheCall->getBeginLoc(), diag::warn_fe_access_without_fenv_access)
+          << FDecl->getName() << TheCall->getSourceRange();
+    }
   }
 
   if (getLangOpts().HLSL && HLSL().CheckBuiltinFunctionCall(BuiltinID, TheCall))
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 4b9576479e29e..51ad5ac5f9e62 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2398,6 +2398,8 @@ static StringRef getHeaderName(Builtin::Context &BuiltinInfo, unsigned ID,
     return "setjmp.h";
   case ASTContext::GE_Missing_ucontext:
     return "ucontext.h";
+  case ASTContext::GE_Missing_fenv:
+    return "fenv.h";
   }
   llvm_unreachable("unhandled error kind");
 }
@@ -7018,6 +7020,12 @@ Sema::ActOnTypedefNameDecl(Scope *S, DeclContext *DC, TypedefNameDecl *NewTD,
       case tok::NotableIdentifierKind::ucontext_t:
         Context.setucontext_tDecl(NewTD);
         break;
+      case tok::NotableIdentifierKind::fexcept_t:
+        Context.setfexcept_tDecl(NewTD);
+        break;
+      case tok::NotableIdentifierKind::fenv_t:
+        Context.setfenv_tDecl(NewTD);
+        break;
       case tok::NotableIdentifierKind::float_t:
       case tok::NotableIdentifierKind::double_t:
         NewTD->addAttr(AvailableOnlyInDefaultEvalMethodAttr::Create(Context));
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 0f834859e982a..5425c1da7419b 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5758,6 +5758,42 @@ void ASTReader::InitializeContext() {
         }
       }
     }
+
+    if (TypeID Fexcept_t = SpecialTypes[SPECIAL_TYPE_FEXCEPT_T]) {
+      QualType Fexcept_tType = GetType(Fexcept_t);
+      if (Fexcept_tType.isNull()) {
+        Error("fexcept_t type is NULL");
+        return;
+      }
+
+      if (!Context.fexcept_tDecl) {
+        if (const TypedefType *Typedef = Fexcept_tType->getAs<TypedefType>())
+          Context.setfexcept_tDecl(Typedef->getDecl());
+        else {
+          const TagType *Tag = Fexcept_tType->getAs<TagType>();
+          assert(Tag && "Invalid fexcept_t type in AST file");
+          Context.setfexcept_tDecl(Tag->getDecl());
+        }
+      }
+    }
+
+    if (TypeID Fenv_t = SpecialTypes[SPECIAL_TYPE_FENV_T]) {
+      QualType Fenv_tType = GetType(Fenv_t);
+      if (Fenv_tType.isNull()) {
+        Error("fenv_t type is NULL");
+        return;
+      }
+
+      if (!Context.fenv_tDecl) {
+        if (const TypedefType *Typedef = Fenv_tType->getAs<TypedefType>())
+          Context.setfenv_tDecl(Typedef->getDecl());
+        else {
+          const TagType *Tag = Fenv_tType->getAs<TagType>();
+          assert(Tag && "Invalid fenv_t type in AST file");
+          Context.setfenv_tDecl(Tag->getDecl());
+        }
+      }
+    }
   }
 
   ReadPragmaDiagnosticMappings(Context.getDiagnostics());
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 1970ed86589b5..c0c4aa107e200 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6158,6 +6158,8 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot,
     AddTypeRef(Context, Context.ObjCClassRedefinitionType, SpecialTypes);
     AddTypeRef(Context, Context.ObjCSelRedefinitionType, SpecialTypes);
     AddTypeRef(Context, Context.getucontext_tType(), SpecialTypes);
+    AddTypeRef(Context, Context.getfexcept_tType(), SpecialTypes);
+    AddTypeRef(Context, Context.getfenv_tType(), SpecialTypes);
   }
 
   if (SemaPtr)
diff --git a/clang/test/PCH/builtins-fenv.c b/clang/test/PCH/builtins-fenv.c
new file mode 100644
index 0000000000000..72bcf4a134730
--- /dev/null
+++ b/clang/test/PCH/builtins-fenv.c
@@ -0,0 +1,25 @@
+// Test this without pch.
+// RUN: %clang_cc1 -include %S/builtins-fenv.h -fsyntax-only -verify %s
+
+// Test with pch.
+// RUN: %clang_cc1 -emit-pch -o %t %S/builtins-fenv.h
+// RUN: %clang_cc1 -include-pch %t -fsyntax-only -verify %s 
+
+// expected-no-diagnostics
+fexcept_t *flagp = 0;
+fenv_t *envp = 0;
+
+void f(void) {
+  #pragma STDC FENV_ACCESS ON
+  feclearexcept(FE_INVALID);
+  fegetexceptflag(flagp, FE_INVALID);
+  feraiseexcept(FE_INVALID);
+  fesetexceptflag(flagp, FE_INVALID);
+  fetestexcept(FE_INVALID);
+  fegetround();
+  fesetround(0);
+  fegetenv(envp);
+  feholdexcept(envp);
+  fesetenv(envp);
+  feupdateenv(envp);
+}
diff --git a/clang/test/PCH/builtins-fenv.h b/clang/test/PCH/builtins-fenv.h
new file mode 100644
index 0000000000000..8397c270df58e
--- /dev/null
+++ b/clang/test/PCH/builtins-fenv.h
@@ -0,0 +1,18 @@
+// Header for PCH test builtins-fenv.c
+
+#define FE_INVALID 1
+
+typedef struct {} fenv_t;
+typedef unsigned short int fexcept_t;
+
+int feclearexcept(int excepts);
+int fegetexceptflag(fexcept_t *flagp, int excepts);
+int feraiseexcept(int excepts);
+int fesetexceptflag(const fexcept_t *flagp, int excepts);
+int fetestexcept(int excepts);
+int fegetround(void);
+int fesetround(int rounding_mode);
+int fegetenv(fenv_t *envp);
+int feholdexcept(fenv_t *envp);
+int fesetenv(const fenv_t *envp);
+int feupdateenv(const fenv_t *envp);
diff --git a/clang/test/Sema/builtin-fenv.c b/clang/test/Sema/builtin-fenv.c
new file mode 100644
index 0000000000000..db8a5334d1073
--- /dev/null
+++ b/clang/test/Sema/builtin-fenv.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DWRONG_FEXCEPT_T %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DRIGHT_FEXCEPT_T %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DONLY_FEXCEPT_T %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DNO_FEGETEXCEPTFLAG %s -ast-dump 2>&1 | FileCheck %s --check-prefixes=CHECK1
+
+// tests inspired by clang/test/Sema/builtin-setjmp.c
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if WRONG_FEXCEPT_T
+typedef unsigned short int fexcept_t;
+extern int fegetexceptflag(int, int); // c-warning {{incompatible redeclaration of library function 'fegetexceptflag'}}
+                                      // c-note@-1 {{'fegetexceptflag' is a builtin with type 'int (fexcept_t *, int)' (aka 'int (unsigned short *, int)')}}
+#elif RIGHT_FEXCEPT_T
+// c-no-diagnostics
+typedef unsigned short int fexcept_t;
+extern int fegetexceptflag(unsigned short int *, int); // OK, right type.
+#elif ONLY_FEXCEPT_T
+typedef long *fexcept_t;
+#endif
+
+void use(void) {
+  #pragma STDC FENV_ACCESS ON
+  fegetexceptflag(0, 0);
+  #if NO_FEGETEXCEPTFLAG
+  // cxx-error@-2 {{undeclared identifier 'fegetexceptflag'}}
+  // c-error@-3 {{call to undeclared function 'fegetexceptflag'; ISO C99 and later do not support implicit function declarations}}
+  // c-warning@-4 {{declaration of built-in function 'fegetexceptflag' requires inclusion of the header <fenv.h>}}
+  #elif ONLY_FEXCEPT_T
+  // cxx-error@-6 {{undeclared identifier 'fegetexceptflag'}}
+  // c-error@-7 {{call to undeclared library function 'fegetexceptflag' with type 'int (fexcept_t *, int)' (aka 'int (long **, int)'); ISO C99 and later do not support implicit function declarations}}
+  // c-note@-8 {{include the header <fenv.h> or explicitly provide a declaration for 'fegetexceptflag'}}
+  #else
+  // cxx-no-diagnostics
+  #endif
+
+  #ifdef NO_FEGETEXCEPTFLAG
+  // In this case, the regular AST dump doesn't dump the implicit declaration of 'fegetexceptflag'.
+  #pragma clang __debug dump fegetexceptflag 
+  #endif
+}
+
+// CHECK1: FunctionDecl {{.*}} used fegetexceptflag
+// CHECK2: BuiltinAttr {{.*}} Implicit
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/clang/test/Sema/fenv-access-implicit.c b/clang/test/Sema/fenv-access-implicit.c
new file mode 100644
index 0000000000000..0c4bd6b0eb855
--- /dev/null
+++ b/clang/test/Sema/fenv-access-implicit.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -verify -Wfenv-access %s
+
+typedef struct {} fenv_t;
+typedef unsigned short int fexcept_t;
+
+fexcept_t *flagp = 0;
+fenv_t *envp = 0;
+
+#define FE_INVALID 1
+
+void test_fenv_access_undeclared(void) {
+  #pragma STDC FENV_ACCESS ON
+  feclearexcept(FE_INVALID); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'feclearexcept'}} \
+   expected-error {{call to undeclared library function 'feclearexcept' with type 'int (int)'; ISO C99 and later do not support implicit function declarations}}
+  fegetexceptflag(flagp, FE_INVALID); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'fegetexceptflag'}} \
+   expected-error {{call to undeclared library function 'fegetexceptflag' with type 'int (fexcept_t *, int)' (aka 'int (unsigned short *, int)'); ISO C99 and later do not support implicit function declarations}}
+  feraiseexcept(FE_INVALID); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'feraiseexcept'}} \
+   expected-error {{call to undeclared library function 'feraiseexcept' with type 'int (int)'; ISO C99 and later do not support implicit function declarations}}
+  fesetexceptflag(flagp, FE_INVALID); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'fesetexceptflag'}} \
+   expected-error {{call to undeclared library function 'fesetexceptflag' with type 'int (const fexcept_t *, int)' (aka 'int (const unsigned short *, int)'); ISO C99 and later do not support implicit function declarations}}
+  fetestexcept(FE_INVALID); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'fetestexcept'}} \
+   expected-error {{call to undeclared library function 'fetestexcept' with type 'int (int)'; ISO C99 and later do not support implicit function declarations}}
+  fegetround(); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'fegetround'}} \
+   expected-error {{call to undeclared library function 'fegetround' with type 'int (void)'; ISO C99 and later do not support implicit function declarations}}
+  fesetround(0); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'fesetround'}} \
+   expected-error {{call to undeclared library function 'fesetround' with type 'int (int)'; ISO C99 and later do not support implicit function declarations}}
+  fegetenv(envp); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'fegetenv'}} \
+   expected-error {{call to undeclared library function 'fegetenv' with type 'int (fenv_t *)'; ISO C99 and later do not support implicit function declarations}}
+  feholdexcept(envp); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'feholdexcept'}} \
+   expected-error {{call to undeclared library function 'feholdexcept' with type 'int (fenv_t *)'; ISO C99 and later do not support implicit function declarations}}
+  fesetenv(envp); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'fesetenv'}} \
+   expected-error {{call to undeclared library function 'fesetenv' with type 'int (const fenv_t *)'; ISO C99 and later do not support implicit function declarations}}
+  feupdateenv(envp); // expected-note {{include the header <fenv.h> or explicitly provide a declaration for 'feupdateenv'}} \
+   expected-error {{call to undeclared library function 'feupdateenv' with type 'int (const fenv_t *)'; ISO C99 and later do not support implicit function declarations}}
+}
diff --git a/clang/test/Sema/fenv-access-unevaluated.cpp b/clang/test/Sema/fenv-access-unevaluated.cpp
new file mode 100644
index 0000000000000..14752ba3c377e
--- /dev/null
+++ b/clang/test/Sema/fenv-access-unevaluated.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -verify -Wfenv-access %s
+
+typedef struct {} fenv_t;
+typedef unsigned short int fexcept_t;
+
+int feclearexcept(int excepts);
+int fegetexceptflag(fexcept_t *flagp, int excepts);
+int feraiseexcept(int excepts);
+int fesetexceptflag(const fexcept_t *flagp, int excepts);
+int fetestexcept(int excepts);
+int fegetround(void);
+int fesetround(int rounding_mode);
+int fegetenv(fenv_t *envp);
+int feholdexcept(fenv_t *envp);
+int fesetenv(const fenv_t *envp);
+int feupdateenv(const fenv_t *envp);
+
+// expected-no-diagnostics
+void test_fenv_access_unevaluated() {
+  decltype(::feclearexcept) a;
+  decltype(::fegetexceptflag) b;
+  decltype(::feraiseexcept) c;
+  decltype(::fesetexceptflag) d;
+  decltype(::fetestexcept) e;
+  decltype(::fegetround) f;
+  decltype(::fesetround) g;
+  decltype(::fegetenv) h;
+  decltype(::feholdexcept) i;
+  decltype(::fesetenv) j;
+  decltype(::feupdateenv) k;
+}
diff --git a/clang/test/Sema/fenv-access.c b/clang/test/Sema/fenv-access.c
new file mode 100644
index 0000000000000..3a7b95af7ab4c
--- /dev/null
+++ b/clang/test/Sema/fenv-access.c
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 -verify -Wfenv-access %s
+// RUN: %clang_cc1 -verify -Wfenv-access -ffp-exception-behavior=maytrap -DNO_WARN %s
+// RUN: %clang_cc1 -verify -Wfenv-access -ffp-exception-behavior=strict -DNO_WARN %s
+
+typedef struct {} fenv_t;
+typedef unsigned short int fexcept_t;
+
+int feclearexcept(int excepts);
+int fegetexceptflag(fexcept_t *flagp, int excepts);
+int feraiseexcept(int excepts);
+int fesetexceptflag(const fexcept_t *flagp, int excepts);
+int fetestexcept(int excepts);
+int fegetround(void);
+int fesetround(int rounding_mode);
+int fegetenv(fenv_t *envp);
+int feholdexcept(fenv_t *envp);
+int fesetenv(const fenv_t *envp);
+int feupdateenv(const fenv_t *envp);
+
+#define FE_INVALID 1
+
+fexcept_t *flagp = 0;
+fenv_t *envp = 0;
+
+void test_fenv_access_off(void) {
+#ifdef NO_WARN
+  // expected-no-diagnostics
+  feclearexcept(FE_INVALID);
+  fegetexceptflag(flagp, FE_INVALID);
+  feraiseexcept(FE_INVALID);
+  fesetexceptflag(flagp, FE_INVALID);
+  fetestexcept(FE_INVALID);
+  fegetround();
+  fesetround(0);
+  fegetenv(envp);
+  feholdexcept(envp);
+  fesetenv(envp);
+  feupdateenv(envp);
+#else 
+  feclearexcept(FE_INVALID); // expected-warning {{'feclearexcept' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  fegetexceptflag(flagp, FE_INVALID); // expected-warning {{'fegetexceptflag' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  feraiseexcept(FE_INVALID); // expected-warning {{'feraiseexcept' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  fesetexceptflag(flagp, FE_INVALID); // expected-warning {{'fesetexceptflag' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  fetestexcept(FE_INVALID); // expected-warning {{'fetestexcept' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  fegetround(); // expected-warning {{'fegetround' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  fesetround(0); // expected-warning {{'fesetround' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  fegetenv(envp); // expected-warning {{'fegetenv' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  feholdexcept(envp); // expected-warning {{'feholdexcept' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  fesetenv(envp); // expected-warning {{'fesetenv' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+  feupdateenv(envp); // expected-warning {{'feupdateenv' used without enabling floating-point exception behavior; use 'pragma STDC FENV_ACCESS ON' or compile with '-ffp-exception-behavior=maytrap'}}
+#endif
+}
+
+void test_fenv_access_on(void) {
+  #pragma STDC FENV_ACCESS ON
+  fesetround(0);
+  feclearexcept(FE_INVALID);
+  fegetexceptflag(flagp, FE_INVALID);
+  feraiseexcept(FE_INVALID);
+  fesetexceptflag(flagp, FE_INVALID);
+  fetestexcept(FE_INVALID);
+  fegetround();
+  fesetround(0);
+  fegetenv(envp);
+  feholdexcept(envp);
+  fesetenv(envp);
+  feupdateenv(envp);
+}
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index c2e38c0d6aeb8..2394c3b299c81 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -376,6 +376,8 @@ class PrototypeParser {
                                .Case("uint64_t", "UWi")
                                .Case("void", "v")
                                .Case("wchar_t", "w")
+                               .Case("fexcept_t", "Tx")
+                               .Case("fenv_t", "Te")
                                .Case("...", ".")
                                .Default("error");
       if (ReturnTypeVal == "error")
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp
index 7ea6134b702bf..1cc64a38aaed8 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp
@@ -51,6 +51,7 @@
 #include <sys/timeb.h>
 #include <sys/times.h>
 #include <sys/types.h>
+#include <sys/ucontext.h>
 #include <sys/utsname.h>
 #include <termios.h>
 #include <time.h>
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
index 02833888747ac..e6e400bb1fef7 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
@@ -174,11 +174,11 @@ class StackDepotBenchmark
 //   '--gtest_filter=*Benchmark*'
 TEST_P(StackDepotBenchmark, DISABLED_Benchmark) {
   auto Param = GetParam();
-  std::atomic<unsigned int> here = {};
+  std::atomic<int> here = {};
 
   auto thread = [&](int idx) {
     here++;
-    while (here < Param.UniqueThreads) std::this_thread::yield();
+    while (here < Param.Threads) std::this_thread::yield();
 
     std::vector<uptr> frames(64);
     for (int r = 0; r < Param.RepeatPerThread; ++r) {
diff --git a/compiler-rt/test/asan/TestCases/Posix/coverage-module-unloaded.cpp b/compiler-rt/test/asan/TestCases/Posix/coverage-module-unloaded.cpp
index feef8f81d70da..7c143701af6a3 100644
--- a/compiler-rt/test/asan/TestCases/Posix/coverage-module-unloaded.cpp
+++ b/compiler-rt/test/asan/TestCases/Posix/coverage-module-unloaded.cpp
@@ -1,8 +1,8 @@
 // Check that unloading a module doesn't break coverage dumping for remaining
 // modules.
 // RUN: mkdir -p %t.dir && cd %t.dir
-// RUN: %clangxx_asan -fsanitize-coverage=func,trace-pc-guard -DSHARED %s -shared -o %dynamiclib1 -fPIC
-// RUN: %clangxx_asan -fsanitize-coverage=func,trace-pc-guard -DSHARED %s -shared -o %dynamiclib2 -fPIC
+// RUN: %clangxx_asan -fsanitize-coverage=func,trace-pc-guard -DSHARED_LIB %s -shared -o %dynamiclib1 -fPIC
+// RUN: %clangxx_asan -fsanitize-coverage=func,trace-pc-guard -DSHARED_LIB %s -shared -o %dynamiclib2 -fPIC
 // RUN: %clangxx_asan -fsanitize-coverage=func,trace-pc-guard %s %libdl -o %t.dir/exe
 // RUN: mkdir -p %t.tmp/coverage-module-unloaded && cd %t.tmp/coverage-module-unloaded
 // RUN: %env_asan_opts=coverage=1:verbosity=1 %run %t.dir/exe %dynamiclib1 %dynamiclib2 2>&1        | FileCheck %s
@@ -18,7 +18,7 @@
 #include <stdio.h>
 #include <unistd.h>
 
-#ifdef SHARED
+#ifdef SHARED_LIB
 extern "C" {
 void bar() { printf("bar\n"); }
 }
diff --git a/compiler-rt/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cpp b/compiler-rt/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cpp
index 662625e16f3e1..bfabf40fdaa86 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cpp
@@ -1,5 +1,5 @@
 // RUN: mkdir -p %t-dir
-// RUN: %clangxx -DSHARED %s -shared -o %t-dir/get_module_and_offset_for_pc.so -fPIC
+// RUN: %clangxx -DSHARED_LIB %s -shared -o %t-dir/get_module_and_offset_for_pc.so -fPIC
 // RUN: %clangxx -DSO_DIR=\"%t-dir\" -O0 %s -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
 
@@ -13,7 +13,7 @@
 #include <sanitizer/common_interface_defs.h>
 #include <stdio.h>
 
-#ifdef SHARED
+#ifdef SHARED_LIB
 extern "C" {
 int foo() { return 1; }
 }
diff --git a/llvm/docs/AliasAnalysis.rst b/llvm/docs/AliasAnalysis.rst
index af6da8cf64ea3..9b05c9d7be138 100644
--- a/llvm/docs/AliasAnalysis.rst
+++ b/llvm/docs/AliasAnalysis.rst
@@ -638,16 +638,16 @@ implementations.  You can use them with commands like:
 
   % opt -ds-aa -aa-eval foo.bc -disable-output -stats
 
-The ``-print-alias-sets`` pass
+The ``print<alias-sets>`` pass
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The ``-print-alias-sets`` pass is exposed as part of the ``opt`` tool to print
+The ``print<alias-sets>`` pass is exposed as part of the ``opt`` tool to print
 out the Alias Sets formed by the `AliasSetTracker`_ class.  This is useful if
 you're using the ``AliasSetTracker`` class.  To use it, use something like:
 
 .. code-block:: bash
 
-  % opt -ds-aa -print-alias-sets -disable-output
+  % opt -passes='print<alias-sets>' -disable-output
 
 The ``-aa-eval`` pass
 ^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst
index 882ec2a71577e..66a682983d9d5 100644
--- a/llvm/docs/Passes.rst
+++ b/llvm/docs/Passes.rst
@@ -220,8 +220,8 @@ This pass decodes the debug info metadata in a module and prints it to standard
 This pass is a simple post-dominator construction algorithm for finding
 post-dominators.
 
-``print-alias-sets``: Alias Set Printer
----------------------------------------
+``print<alias-sets>``: Alias Set Printer
+----------------------------------------
 
 Yet to be written.
 
@@ -237,8 +237,8 @@ in a human-readable form.
 This pass, only available in ``opt``, prints the SCCs of the call graph to
 standard error in a human-readable form.
 
-``print-cfg-sccs``: Print SCCs of each function CFG
----------------------------------------------------
+``print<cfg-sccs>``: Print SCCs of each function CFG
+----------------------------------------------------
 
 This pass, only available in ``opt``, prints the SCCs of each function CFG to
 standard error in a human-readable form.
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index fffd696e59baf..12b7ad458e93e 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -148,6 +148,11 @@ Makes programs 10x faster by doing Special New Thing.
   disabled by default to maintain compatibility with Binutils and LLVM older
   toolchains that do not define the `R_AARCH64_TLS_DTPREL64` static relocation
   type for TLS offsets.
+* A bug was fixed that caused LLVM IR inline assembly clobbers of the x29 and
+  x30 registers to be ignored when they were written using their xN names
+  instead of the ABI names FP and LR. Note that LLVM IR produced by Clang
+  always uses the ABI names, but other frontends may not.
+  ([#167783](https://github.com/llvm/llvm-project/pull/167783))
 
 ### Changes to the AMDGPU Backend
 
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index a01a67f136710..5182341fa7a89 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -155,7 +155,6 @@ struct ForcePassLinking {
     llvm::AliasAnalysis AA(TLI);
     llvm::BatchAAResults BAA(AA);
     llvm::AliasSetTracker X(BAA);
-    X.add(llvm::MemoryLocation()); // for -print-alias-sets
     (void)llvm::AreStatisticsEnabled();
     (void)llvm::sys::RunningOnValgrind();
   }
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 3328bc0fe836f..0b5a6d8223102 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -507,12 +507,11 @@ FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
 FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
 FUNCTION_PASS("place-safepoints", PlaceSafepointsPass())
 FUNCTION_PASS("print", PrintFunctionPass(errs()))
-// TODO: rename to print<foo> after NPM switch
-FUNCTION_PASS("print-alias-sets", AliasSetsPrinterPass(errs()))
-FUNCTION_PASS("print-cfg-sccs", CFGSCCPrinterPass(errs()))
-FUNCTION_PASS("print-memderefs", MemDerefPrinterPass(errs()))
-FUNCTION_PASS("print-mustexecute", MustExecutePrinterPass(errs()))
-FUNCTION_PASS("print-predicateinfo", PredicateInfoPrinterPass(errs()))
+FUNCTION_PASS("print<alias-sets>", AliasSetsPrinterPass(errs()))
+FUNCTION_PASS("print<cfg-sccs>", CFGSCCPrinterPass(errs()))
+FUNCTION_PASS("print<mem-derefs>", MemDerefPrinterPass(errs()))
+FUNCTION_PASS("print<must-execute>", MustExecutePrinterPass(errs()))
+FUNCTION_PASS("print<predicate-info>", PredicateInfoPrinterPass(errs()))
 FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(errs()))
 FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(errs()))
 FUNCTION_PASS("print<branch-prob>", BranchProbabilityPrinterPass(errs()))
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index a8fc28944af34..3ef6e693a4076 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -155,7 +155,7 @@ void initializeAArch64DAGToDAGISelLegacyPass(PassRegistry &);
 void initializeAArch64DeadRegisterDefinitionsLegacyPass(PassRegistry &);
 void initializeAArch64ExpandPseudoLegacyPass(PassRegistry &);
 void initializeAArch64LoadStoreOptLegacyPass(PassRegistry &);
-void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
+void initializeAArch64LowerHomogeneousPrologEpilogLegacyPass(PassRegistry &);
 void initializeAArch64CodeLayoutOptPass(PassRegistry &);
 void initializeAArch64MIPeepholeOptLegacyPass(PassRegistry &);
 void initializeAArch64O0PreLegalizerCombinerLegacyPass(PassRegistry &);
@@ -317,6 +317,12 @@ class AArch64ConditionalComparesPass
                         MachineFunctionAnalysisManager &MFAM);
 };
 
+class AArch64LowerHomogeneousPrologEpilogPass
+    : public PassInfoMixin<AArch64LowerHomogeneousPrologEpilogPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0eac7bc11a777..3016e1b94c22d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30,6 +30,7 @@
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -13698,6 +13699,20 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
     return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
   }
 
+  // Clang will correctly decode the usage of register name aliases into their
+  // official names. However, other frontends like `rustc` do not. The
+  // conversion below allows users of these frontends to use the ABI names for
+  // registers in LLVM-style register constraints.
+  //
+  // x31->sp is not included here because it's not a general register and
+  // needs different handling
+  unsigned XRegFromAlias = StringSwitch<unsigned>(Constraint.lower())
+                               .Cases({"{x29}", "{fp}"}, AArch64::FP)
+                               .Cases({"{x30}", "{lr}"}, AArch64::LR)
+                               .Default(AArch64::NoRegister);
+  if (XRegFromAlias != AArch64::NoRegister)
+    return std::make_pair(XRegFromAlias, &AArch64::GPR64RegClass);
+
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
   std::pair<unsigned, const TargetRegisterClass *> Res;
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
index d69f12e7c0a7c..23fef847a75f1 100644
--- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -10,11 +10,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AArch64.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64InstPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -23,6 +25,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include <optional>
 #include <sstream>
@@ -39,11 +42,11 @@ static cl::opt<int> FrameHelperSizeThreshold(
 
 namespace {
 
-class AArch64LowerHomogeneousPE {
+class AArch64LowerHomogeneousPrologEpilogImpl {
 public:
   const AArch64InstrInfo *TII;
 
-  AArch64LowerHomogeneousPE(Module *M, MachineModuleInfo *MMI)
+  AArch64LowerHomogeneousPrologEpilogImpl(Module *M, MachineModuleInfo *MMI)
       : M(M), MMI(MMI) {}
 
   bool run();
@@ -69,11 +72,11 @@ class AArch64LowerHomogeneousPE {
                    MachineBasicBlock::iterator &NextMBBI);
 };
 
-class AArch64LowerHomogeneousPrologEpilog : public ModulePass {
+class AArch64LowerHomogeneousPrologEpilogLegacy : public ModulePass {
 public:
   static char ID;
 
-  AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) {}
+  AArch64LowerHomogeneousPrologEpilogLegacy() : ModulePass(ID) {}
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineModuleInfoWrapperPass>();
     AU.addPreserved<MachineModuleInfoWrapperPass>();
@@ -89,22 +92,34 @@ class AArch64LowerHomogeneousPrologEpilog : public ModulePass {
 
 } // end anonymous namespace
 
-char AArch64LowerHomogeneousPrologEpilog::ID = 0;
+char AArch64LowerHomogeneousPrologEpilogLegacy::ID = 0;
 
-INITIALIZE_PASS(AArch64LowerHomogeneousPrologEpilog,
+INITIALIZE_PASS(AArch64LowerHomogeneousPrologEpilogLegacy,
                 "aarch64-lower-homogeneous-prolog-epilog",
                 AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME, false, false)
 
-bool AArch64LowerHomogeneousPrologEpilog::runOnModule(Module &M) {
+bool AArch64LowerHomogeneousPrologEpilogLegacy::runOnModule(Module &M) {
   if (skipModule(M))
     return false;
 
   MachineModuleInfo *MMI =
       &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
-  return AArch64LowerHomogeneousPE(&M, MMI).run();
+  return AArch64LowerHomogeneousPrologEpilogImpl(&M, MMI).run();
 }
 
-bool AArch64LowerHomogeneousPE::run() {
+PreservedAnalyses
+AArch64LowerHomogeneousPrologEpilogPass::run(Module &M,
+                                             ModuleAnalysisManager &MAM) {
+  MachineModuleInfo *MMI = &MAM.getResult<MachineModuleAnalysis>(M).getMMI();
+  bool Changed = AArch64LowerHomogeneousPrologEpilogImpl(&M, MMI).run();
+  if (!Changed)
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<MachineModuleAnalysis>();
+  return PA;
+}
+
+bool AArch64LowerHomogeneousPrologEpilogImpl::run() {
   bool Changed = false;
   for (auto &F : *M) {
     if (F.empty())
@@ -455,7 +470,7 @@ static bool shouldUseFrameHelper(MachineBasicBlock &MBB,
 ///    ldp x29, x30, [sp, #32]
 ///    ldp x20, x19, [sp, #16]
 ///    ldp x22, x21, [sp], #48
-bool AArch64LowerHomogeneousPE::lowerEpilog(
+bool AArch64LowerHomogeneousPrologEpilogImpl::lowerEpilog(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MachineBasicBlock::iterator &NextMBBI) {
   auto &MF = *MBB.getParent();
@@ -545,7 +560,7 @@ bool AArch64LowerHomogeneousPE::lowerEpilog(
 ///    stp	x22, x21, [sp, #-48]!
 ///    stp	x20, x19, [sp, #16]
 ///    stp	x29, x30, [sp, #32]
-bool AArch64LowerHomogeneousPE::lowerProlog(
+bool AArch64LowerHomogeneousPrologEpilogImpl::lowerProlog(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MachineBasicBlock::iterator &NextMBBI) {
   auto &MF = *MBB.getParent();
@@ -626,9 +641,9 @@ bool AArch64LowerHomogeneousPE::lowerProlog(
 /// @param MBBI current instruction iterator
 /// @param NextMBBI next instruction iterator which can be updated
 /// @return True when IR is changed.
-bool AArch64LowerHomogeneousPE::runOnMI(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MBBI,
-                                        MachineBasicBlock::iterator &NextMBBI) {
+bool AArch64LowerHomogeneousPrologEpilogImpl::runOnMI(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
   MachineInstr &MI = *MBBI;
   unsigned Opcode = MI.getOpcode();
   switch (Opcode) {
@@ -642,7 +657,7 @@ bool AArch64LowerHomogeneousPE::runOnMI(MachineBasicBlock &MBB,
   return false;
 }
 
-bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
+bool AArch64LowerHomogeneousPrologEpilogImpl::runOnMBB(MachineBasicBlock &MBB) {
   bool Modified = false;
 
   MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
@@ -655,7 +670,8 @@ bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
   return Modified;
 }
 
-bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
+bool AArch64LowerHomogeneousPrologEpilogImpl::runOnMachineFunction(
+    MachineFunction &MF) {
   TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
 
   bool Modified = false;
@@ -665,5 +681,5 @@ bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
 }
 
 ModulePass *llvm::createAArch64LowerHomogeneousPrologEpilogPass() {
-  return new AArch64LowerHomogeneousPrologEpilog();
+  return new AArch64LowerHomogeneousPrologEpilogLegacy();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64PassRegistry.def b/llvm/lib/Target/AArch64/AArch64PassRegistry.def
index de1b5834112e7..e3e47b12a9ec3 100644
--- a/llvm/lib/Target/AArch64/AArch64PassRegistry.def
+++ b/llvm/lib/Target/AArch64/AArch64PassRegistry.def
@@ -16,6 +16,7 @@
 #ifndef MODULE_PASS
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
+MODULE_PASS("aarch64-lower-homogeneous-prolog-epilog", AArch64LowerHomogeneousPrologEpilogPass())
 #undef MODULE_PASS
 
 #ifndef FUNCTION_PASS
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 57929316954d6..b8cc69f7569d6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -277,7 +277,7 @@ LLVMInitializeAArch64Target() {
   initializeAArch64SLSHardeningLegacyPass(PR);
   initializeAArch64StackTaggingPass(PR);
   initializeAArch64StackTaggingPreRALegacyPass(PR);
-  initializeAArch64LowerHomogeneousPrologEpilogPass(PR);
+  initializeAArch64LowerHomogeneousPrologEpilogLegacyPass(PR);
   initializeAArch64DAGToDAGISelLegacyPass(PR);
   initializeAArch64CondBrTuningPass(PR);
   initializeAArch64Arm64ECCallLoweringPass(PR);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3dc9f503088e6..c91143fefb6af 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20961,7 +20961,7 @@ std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
 /// Horizontal vector math instructions may be slower than normal math with
 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
 /// implementation, and likely shuffle complexity of the alternate sequence.
-static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+static bool shouldUseHorizontalOp(bool IsSingleSource, const SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
   bool IsOptimizingSize = DAG.shouldOptForSize();
   bool HasFastHOps = Subtarget.hasFastHorizontalOps();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index a064aa1016399..15e4668671807 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3653,7 +3653,7 @@ static Instruction *foldNestedSelects(SelectInst &OuterSelVal,
 /// already poison. For example, if ValAssumedPoison is `icmp samesign X, 10`
 /// and V is `icmp ne X, 5`, impliesPoisonOrCond returns true.
 static bool impliesPoisonOrCond(const Value *ValAssumedPoison, const Value *V,
-                                bool Expected) {
+                                bool Expected, const SimplifyQuery &SQ) {
   if (impliesPoison(ValAssumedPoison, V))
     return true;
 
@@ -3678,6 +3678,14 @@ static bool impliesPoisonOrCond(const Value *ValAssumedPoison, const Value *V,
                       *RHSC2);
     }
   }
+  Value *A;
+  if (match(ValAssumedPoison, m_NUWTrunc(m_Value(A))) &&
+      isGuaranteedNotToBePoison(A)) {
+    assert(ValAssumedPoison->getType()->isIntOrIntVectorTy(1));
+    return computeKnownBits(
+               A, SQ.getWithInstruction(cast<Instruction>(ValAssumedPoison)))
+               .getMaxValue() == 1;
+  }
 
   return false;
 }
@@ -3703,13 +3711,13 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
   // checks whether folding it does not convert a well-defined value into
   // poison.
   if (match(TrueVal, m_One())) {
-    if (impliesPoisonOrCond(FalseVal, CondVal, /*Expected=*/false)) {
+    if (impliesPoisonOrCond(FalseVal, CondVal, /*Expected=*/false, SQ)) {
       // Change: A = select B, true, C --> A = or B, C
       return BinaryOperator::CreateOr(CondVal, FalseVal);
     }
 
     if (match(CondVal, m_OneUse(m_Select(m_Value(A), m_One(), m_Value(B)))) &&
-        impliesPoisonOrCond(FalseVal, B, /*Expected=*/false)) {
+        impliesPoisonOrCond(FalseVal, B, /*Expected=*/false, SQ)) {
       // (A || B) || C --> A || (B | C)
       Value *LOr = Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal));
       if (auto *I = dyn_cast<Instruction>(LOr)) {
@@ -3749,13 +3757,13 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
   }
 
   if (match(FalseVal, m_Zero())) {
-    if (impliesPoisonOrCond(TrueVal, CondVal, /*Expected=*/true)) {
+    if (impliesPoisonOrCond(TrueVal, CondVal, /*Expected=*/true, SQ)) {
       // Change: A = select B, C, false --> A = and B, C
       return BinaryOperator::CreateAnd(CondVal, TrueVal);
     }
 
     if (match(CondVal, m_OneUse(m_Select(m_Value(A), m_Value(B), m_Zero()))) &&
-        impliesPoisonOrCond(TrueVal, B, /*Expected=*/true)) {
+        impliesPoisonOrCond(TrueVal, B, /*Expected=*/true, SQ)) {
       // (A && B) && C --> A && (B & C)
       Value *LAnd = Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal));
       if (auto *I = dyn_cast<Instruction>(LAnd)) {
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index f3cdb3518ddcb..9fe751a56df64 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -641,9 +641,9 @@ Instruction *IndirectCallPromoter::computeVTableInfos(
       continue;
 
     auto &Candidate = Candidates[CalleeIndexIter->second];
-    // There shouldn't be duplicate GUIDs in one !prof metadata (except
-    // duplicated zeros), so assign counters directly won't cause overwrite or
-    // counter loss.
+    // There should never be duplicate GUIDs in one !prof metdata, as this is
+    // an IR invariant enforced by the verifier. Assigning counters directly
+    // won't cause overwrite or counter loss.
     Candidate.VTableGUIDAndCounts[VTableVal] = V.Count;
     Candidate.AddressPoints.push_back(
         getOrCreateVTableAddressPointVar(VTableVar, AddressPointOffset));
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index da581e41dc9a8..d5696bc9ea956 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -158,10 +158,9 @@ expandToSwitch(CallBase *CB, const JumpTableTy &JT, DomTreeUpdater &DTU,
 
     for (const auto &[G, C] : Targets) {
       [[maybe_unused]] auto It = GuidToCounter.insert({G, C});
-      // TODO(boomanaiden154): Currently we do not assert on inserting
-      // duplicate GUIDs because we might have multiple zeros when the profile
-      // loader fails to map addresses to functions. Readd the assertion that
-      // we did insert once this has been fixed.
+      // We should always be inserting as it is verifier-enforced IR invariant
+      // that VP metadata does not have duplicate values.
+      assert(It.second);
     }
   }
   for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
index c6997a6c8e717..93d06172ae75f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
@@ -623,6 +623,96 @@ void VFSelectionContext::collectInLoopReductions() {
   }
 }
 
+bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
+                                                const VectorizationFactor &B,
+                                                const unsigned MaxTripCount,
+                                                bool HasTail,
+                                                bool IsEpilogue) const {
+  InstructionCost CostA = A.Cost;
+  InstructionCost CostB = B.Cost;
+
+  // When there is a hint to always prefer scalable vectors, honour that hint.
+  if (Hints.isScalableVectorizationAlwaysPreferred())
+    if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() &&
+        !B.Width.isScalar())
+      return true;
+
+  // Improve estimate for the vector width if it is scalable.
+  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
+  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
+  if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) {
+    if (A.Width.isScalable())
+      EstimatedWidthA *= *VScale;
+    if (B.Width.isScalable())
+      EstimatedWidthB *= *VScale;
+  }
+
+  // When optimizing for size choose whichever is smallest, which will be the
+  // one with the smallest cost for the whole loop. On a tie pick the larger
+  // vector width, on the assumption that throughput will be greater.
+  if (Config.CostKind == TTI::TCK_CodeSize)
+    return CostA < CostB ||
+           (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
+
+  // Assume vscale may be larger than 1 (or the value being tuned for),
+  // so that scalable vectorization is slightly favorable over fixed-width
+  // vectorization.
+  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
+                        A.Width.isScalable() && !B.Width.isScalable();
+
+  auto CmpFn = [PreferScalable](const InstructionCost &LHS,
+                                const InstructionCost &RHS) {
+    return PreferScalable ? LHS <= RHS : LHS < RHS;
+  };
+
+  // To avoid the need for FP division:
+  //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
+  // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
+  bool LowerCostWithoutTC =
+      CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
+  if (!MaxTripCount)
+    return LowerCostWithoutTC;
+
+  auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
+                                              InstructionCost VectorCost,
+                                              InstructionCost ScalarCost) {
+    // If the trip count is a known (possibly small) constant, the trip count
+    // will be rounded up to an integer number of iterations under
+    // FoldTailByMasking. The total cost in that case will be
+    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+    // some extra overheads, but for the purpose of comparing the costs of
+    // different VFs we can use this to compare the total loop-body cost
+    // expected after vectorization.
+    if (HasTail)
+      return VectorCost * (MaxTripCount / VF) +
+             ScalarCost * (MaxTripCount % VF);
+    return VectorCost * divideCeil(MaxTripCount, VF);
+  };
+
+  auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
+  auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
+  bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
+  LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
+    dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
+           << " has lower cost than VF "
+           << (LowerCostWithTC ? B.Width : A.Width)
+           << " when taking the cost of the remaining scalar loop iterations "
+              "into consideration for a maximum trip count of "
+           << MaxTripCount << ".\n";
+  });
+  return LowerCostWithTC;
+}
+
+bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
+                                                const VectorizationFactor &B,
+                                                bool HasTail,
+                                                bool IsEpilogue) const {
+  const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
+  return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
+                                                    IsEpilogue);
+}
+
 // TODO: we could return a pair of values that specify the max VF and
 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1de70dfd09a83..4f9ab60f1c526 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3091,96 +3091,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   return FixedScalableVFPair::getNone();
 }
 
-bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
-                                                const VectorizationFactor &B,
-                                                const unsigned MaxTripCount,
-                                                bool HasTail,
-                                                bool IsEpilogue) const {
-  InstructionCost CostA = A.Cost;
-  InstructionCost CostB = B.Cost;
-
-  // When there is a hint to always prefer scalable vectors, honour that hint.
-  if (Hints.isScalableVectorizationAlwaysPreferred())
-    if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() &&
-        !B.Width.isScalar())
-      return true;
-
-  // Improve estimate for the vector width if it is scalable.
-  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
-  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
-  if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) {
-    if (A.Width.isScalable())
-      EstimatedWidthA *= *VScale;
-    if (B.Width.isScalable())
-      EstimatedWidthB *= *VScale;
-  }
-
-  // When optimizing for size choose whichever is smallest, which will be the
-  // one with the smallest cost for the whole loop. On a tie pick the larger
-  // vector width, on the assumption that throughput will be greater.
-  if (Config.CostKind == TTI::TCK_CodeSize)
-    return CostA < CostB ||
-           (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
-
-  // Assume vscale may be larger than 1 (or the value being tuned for),
-  // so that scalable vectorization is slightly favorable over fixed-width
-  // vectorization.
-  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
-                        A.Width.isScalable() && !B.Width.isScalable();
-
-  auto CmpFn = [PreferScalable](const InstructionCost &LHS,
-                                const InstructionCost &RHS) {
-    return PreferScalable ? LHS <= RHS : LHS < RHS;
-  };
-
-  // To avoid the need for FP division:
-  //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
-  // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
-  bool LowerCostWithoutTC =
-      CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
-  if (!MaxTripCount)
-    return LowerCostWithoutTC;
-
-  auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
-                                              InstructionCost VectorCost,
-                                              InstructionCost ScalarCost) {
-    // If the trip count is a known (possibly small) constant, the trip count
-    // will be rounded up to an integer number of iterations under
-    // FoldTailByMasking. The total cost in that case will be
-    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
-    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
-    // some extra overheads, but for the purpose of comparing the costs of
-    // different VFs we can use this to compare the total loop-body cost
-    // expected after vectorization.
-    if (HasTail)
-      return VectorCost * (MaxTripCount / VF) +
-             ScalarCost * (MaxTripCount % VF);
-    return VectorCost * divideCeil(MaxTripCount, VF);
-  };
-
-  auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
-  auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
-  bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
-  LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
-    dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
-           << " has lower cost than VF "
-           << (LowerCostWithTC ? B.Width : A.Width)
-           << " when taking the cost of the remaining scalar loop iterations "
-              "into consideration for a maximum trip count of "
-           << MaxTripCount << ".\n";
-  });
-  return LowerCostWithTC;
-}
-
-bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
-                                                const VectorizationFactor &B,
-                                                bool HasTail,
-                                                bool IsEpilogue) const {
-  const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
-  return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
-                                                    IsEpilogue);
-}
-
 void LoopVectorizationPlanner::emitInvalidCostRemarks(
     OptimizationRemarkEmitter *ORE) {
   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
@@ -7604,7 +7514,7 @@ preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
 /// preheader of the vector epilogue loop, after created by the execution of \p
 /// Plan.
 static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
-    VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
+    VPlan &MainPlan, VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
     EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
     VFSelectionContext &Config, ScalarEvolution &SE) {
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
@@ -7614,28 +7524,30 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
   VPValue *IV = VectorLoop->getCanonicalIV();
   // When vectorizing the epilogue loop, the canonical induction needs to start
   // at the resume value from the main vector loop. Find the resume value
-  // created during execution of the main VPlan. It must be the first phi in the
-  // loop preheader. Add this resume value as an offset to the canonical IV of
-  // the epilogue loop.
+  // created during execution of the main VPlan. Add this resume value as an
+  // offset to the canonical IV of the epilogue loop.
   using namespace llvm::PatternMatch;
-  PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
-  for (Value *Inc : EPResumeVal->incoming_values()) {
-    if (match(Inc, m_SpecificInt(0)))
-      continue;
-    assert(!EPI.VectorTripCount &&
-           "Must only have a single non-zero incoming value");
-    EPI.VectorTripCount = Inc;
-  }
-  // If we didn't find a non-zero vector trip count, all incoming values
-  // must be zero, which also means the vector trip count is zero. Pick the
-  // first zero as vector trip count.
-  // TODO: We should not choose VF * UF so the main vector loop is known to
-  // be dead.
-  if (!EPI.VectorTripCount) {
-    assert(EPResumeVal->getNumIncomingValues() > 0 &&
-           all_of(EPResumeVal->incoming_values(), match_fn(m_SpecificInt(0))) &&
-           "all incoming values must be 0");
-    EPI.VectorTripCount = EPResumeVal->getOperand(0);
+  VPInstruction *ResumeForEpilogue =
+      cast<VPInstruction>(&*MainPlan.getScalarPreheader()->getFirstNonPhi());
+  Value *EPResumeVal = ResumeForEpilogue->getUnderlyingValue();
+  if (auto *ResumePhi = dyn_cast<PHINode>(EPResumeVal)) {
+    for (Value *Inc : ResumePhi->incoming_values()) {
+      if (match(Inc, m_SpecificInt(0)))
+        continue;
+      assert(!EPI.VectorTripCount &&
+             "Must only have a single non-zero incoming value");
+      EPI.VectorTripCount = Inc;
+    }
+    // If we didn't find a non-zero vector trip count, all incoming values
+    // must be zero, which also means the vector trip count is zero.
+    if (!EPI.VectorTripCount) {
+      assert(ResumePhi->getNumIncomingValues() > 0 &&
+             all_of(ResumePhi->incoming_values(), match_fn(m_SpecificInt(0))) &&
+             "all incoming values must be 0");
+      EPI.VectorTripCount = ResumePhi->getIncomingValue(0);
+    }
+  } else {
+    EPI.VectorTripCount = EPResumeVal;
   }
   VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
   assert(all_of(IV->users(),
@@ -7869,40 +7781,30 @@ static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
          "expected this to be saved from the previous pass.");
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
-      VecEpilogueIterationCountCheck, VecEpiloguePreHeader);
 
-  DTU.applyUpdates({{DominatorTree::Delete, EPI.MainLoopIterationCountCheck,
-                     VecEpilogueIterationCountCheck},
-                    {DominatorTree::Insert, EPI.MainLoopIterationCountCheck,
-                     VecEpiloguePreHeader}});
+  // Helper to redirect an edge from \p BB to \p VecEpilogueIterationCountCheck
+  // to \p NewSucc instead, updating the DomTree.
+  auto RedirectEdge = [&](BasicBlock *BB, BasicBlock *NewSucc) {
+    BB->getTerminator()->replaceUsesOfWith(VecEpilogueIterationCountCheck,
+                                           NewSucc);
+    DTU.applyUpdates(
+        {{DominatorTree::Delete, BB, VecEpilogueIterationCountCheck},
+         {DominatorTree::Insert, BB, NewSucc}});
+  };
+
+  RedirectEdge(EPI.MainLoopIterationCountCheck, VecEpiloguePreHeader);
 
   BasicBlock *ScalarPH =
       cast<VPIRBasicBlock>(EpiPlan.getScalarPreheader())->getIRBasicBlock();
-  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
-      VecEpilogueIterationCountCheck, ScalarPH);
-  DTU.applyUpdates(
-      {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck,
-        VecEpilogueIterationCountCheck},
-       {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}});
+  RedirectEdge(EPI.EpilogueIterationCountCheck, ScalarPH);
 
   // Adjust the terminators of runtime check blocks and phis using them.
   BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
   BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
-  if (SCEVCheckBlock) {
-    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
-        VecEpilogueIterationCountCheck, ScalarPH);
-    DTU.applyUpdates({{DominatorTree::Delete, SCEVCheckBlock,
-                       VecEpilogueIterationCountCheck},
-                      {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
-  }
-  if (MemCheckBlock) {
-    MemCheckBlock->getTerminator()->replaceUsesOfWith(
-        VecEpilogueIterationCountCheck, ScalarPH);
-    DTU.applyUpdates(
-        {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
-         {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
-  }
+  if (SCEVCheckBlock)
+    RedirectEdge(SCEVCheckBlock, ScalarPH);
+  if (MemCheckBlock)
+    RedirectEdge(MemCheckBlock, ScalarPH);
 
   // The vec.epilog.iter.check block may contain Phi nodes from inductions
   // or reductions which merge control-flow from the latch block and the
@@ -7925,11 +7827,11 @@ static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
           return EPI.EpilogueIterationCountCheck == IncB;
         }))
       continue;
-    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
-    if (SCEVCheckBlock)
-      Phi->removeIncomingValue(SCEVCheckBlock);
-    if (MemCheckBlock)
-      Phi->removeIncomingValue(MemCheckBlock);
+    for (BasicBlock *BB :
+         {EPI.EpilogueIterationCountCheck, SCEVCheckBlock, MemCheckBlock}) {
+      if (BB)
+        Phi->removeIncomingValue(BB);
+    }
   }
 
   auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
@@ -8381,7 +8283,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
                                              Checks, BestEpiPlan);
     SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
-        BestEpiPlan, L, ExpandedSCEVs, EPI, CM, Config, *PSE.getSE());
+        BestMainPlan, BestEpiPlan, L, ExpandedSCEVs, EPI, CM, Config,
+        *PSE.getSE());
     LVP.attachRuntimeChecks(BestEpiPlan, Checks, HasBranchWeights);
     LVP.executePlan(
         EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
diff --git a/llvm/test/Analysis/AliasSet/argmemonly.ll b/llvm/test/Analysis/AliasSet/argmemonly.ll
index 995fb26ca436a..6912447491149 100644
--- a/llvm/test/Analysis/AliasSet/argmemonly.ll
+++ b/llvm/test/Analysis/AliasSet/argmemonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<alias-sets>' -S -o - < %s 2>&1 | FileCheck %s
 
 @s = global i8 1, align 1
 @d = global i8 2, align 1
diff --git a/llvm/test/Analysis/AliasSet/guards.ll b/llvm/test/Analysis/AliasSet/guards.ll
index 9ca70f6244800..32f74a41773d6 100644
--- a/llvm/test/Analysis/AliasSet/guards.ll
+++ b/llvm/test/Analysis/AliasSet/guards.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='print<alias-sets>' -S -o - < %s 2>&1 | FileCheck %s
 declare void @llvm.experimental.guard(i1, ...)
 
 ; CHECK: Alias sets for function 'test0':
diff --git a/llvm/test/Analysis/AliasSet/intrinsics.ll b/llvm/test/Analysis/AliasSet/intrinsics.ll
index 0dc802ca7e0aa..a0a319ae3e683 100644
--- a/llvm/test/Analysis/AliasSet/intrinsics.ll
+++ b/llvm/test/Analysis/AliasSet/intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s --implicit-check-not="Unknown instructions"
+; RUN: opt -passes='print<alias-sets>' -S -o - < %s 2>&1 | FileCheck %s --implicit-check-not="Unknown instructions"
 
 ; CHECK: Alias sets for function 'test1':
 ; CHECK: Alias Set Tracker: 2 alias sets for 2 pointer values.
diff --git a/llvm/test/Analysis/AliasSet/memloc-vscale.ll b/llvm/test/Analysis/AliasSet/memloc-vscale.ll
index 6b41604637405..acd183818903b 100644
--- a/llvm/test/Analysis/AliasSet/memloc-vscale.ll
+++ b/llvm/test/Analysis/AliasSet/memloc-vscale.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S < %s -passes=print-alias-sets 2>&1 | FileCheck %s
+; RUN: opt -S < %s -passes='print<alias-sets>' 2>&1 | FileCheck %s
 
 ; CHECK-LABEL: Alias sets for function 'sn'
 ; CHECK: AliasSet[{{.*}}, 1] must alias, Mod       Memory locations: (ptr %p, LocationSize::precise(vscale x 16)), (ptr %p, LocationSize::precise(8))
diff --git a/llvm/test/Analysis/AliasSet/memset.ll b/llvm/test/Analysis/AliasSet/memset.ll
index 17f1e53f77e32..01242fa0171b3 100644
--- a/llvm/test/Analysis/AliasSet/memset.ll
+++ b/llvm/test/Analysis/AliasSet/memset.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<alias-sets>' -S -o - < %s 2>&1 | FileCheck %s
 
 @s = global i8 1, align 1
 @d = global i8 2, align 1
diff --git a/llvm/test/Analysis/AliasSet/memtransfer.ll b/llvm/test/Analysis/AliasSet/memtransfer.ll
index 93290c39620eb..b91a14099f68c 100644
--- a/llvm/test/Analysis/AliasSet/memtransfer.ll
+++ b/llvm/test/Analysis/AliasSet/memtransfer.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<alias-sets>' -S -o - < %s 2>&1 | FileCheck %s
 
 @s = global i8 1, align 1
 @d = global i8 2, align 1
diff --git a/llvm/test/Analysis/AliasSet/saturation.ll b/llvm/test/Analysis/AliasSet/saturation.ll
index 27f5bbee2f55c..1d547dcd8f26a 100644
--- a/llvm/test/Analysis/AliasSet/saturation.ll
+++ b/llvm/test/Analysis/AliasSet/saturation.ll
@@ -1,5 +1,5 @@
-; RUN: opt -passes=print-alias-sets -alias-set-saturation-threshold=4 -S -o - < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=NOSAT
-; RUN: opt -passes=print-alias-sets -alias-set-saturation-threshold=3 -S -o - < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=SAT
+; RUN: opt -passes='print<alias-sets>' -alias-set-saturation-threshold=4 -S -o - < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=NOSAT
+; RUN: opt -passes='print<alias-sets>' -alias-set-saturation-threshold=3 -S -o - < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=SAT
 
 ; CHECK-LABEL: 'nomerge'
 ; CHECK: AliasSet[{{.*}}, 1] must alias, Mod Memory locations: (ptr %a, LocationSize::precise(4))
diff --git a/llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll b/llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll
index 37d8e55cb2ff3..ad213c79697ad 100644
--- a/llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll
+++ b/llvm/test/Analysis/BasicAA/separate_storage-alias-sets.ll
@@ -2,7 +2,7 @@
 ; separate storage hints. This lets alias analysis users (such as the alias set
 ; tracker) who can't be context-sensitive still get the benefits of hints.
 
-; RUN: opt < %s -basic-aa-separate-storage -S -passes=print-alias-sets 2>&1 | FileCheck %s
+; RUN: opt < %s -basic-aa-separate-storage -S -passes='print<alias-sets>' 2>&1 | FileCheck %s
 
 declare void @llvm.assume(i1)
 
diff --git a/llvm/test/Analysis/MustExecute/const-cond.ll b/llvm/test/Analysis/MustExecute/const-cond.ll
index 97358f670b88c..01553b2ca857d 100644
--- a/llvm/test/Analysis/MustExecute/const-cond.ll
+++ b/llvm/test/Analysis/MustExecute/const-cond.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes='print<must-execute>' %s 2>&1 | FileCheck %s
 
 ; In general the CFG below is easily simplified but this is useful for
 ; pass ordering issue elimination.
diff --git a/llvm/test/Analysis/MustExecute/infinite_loops.ll b/llvm/test/Analysis/MustExecute/infinite_loops.ll
index 3fac1293df5d4..844dd944b5d38 100644
--- a/llvm/test/Analysis/MustExecute/infinite_loops.ll
+++ b/llvm/test/Analysis/MustExecute/infinite_loops.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes='print<must-execute>' %s 2>&1 | FileCheck %s
 
 ; Infinite loop.
 ; Make sure that the backedge is mustexec.
diff --git a/llvm/test/Analysis/MustExecute/irreducible-cfg.ll b/llvm/test/Analysis/MustExecute/irreducible-cfg.ll
index a452761ab3356..5db862995cecd 100644
--- a/llvm/test/Analysis/MustExecute/irreducible-cfg.ll
+++ b/llvm/test/Analysis/MustExecute/irreducible-cfg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes='print<must-execute>' %s 2>&1 | FileCheck %s
 
 ; The loop body has two predecessors, %header and %side-entry. This leads to irreducible-cfg
 define i64 @baz() {
diff --git a/llvm/test/Analysis/MustExecute/loop-header.ll b/llvm/test/Analysis/MustExecute/loop-header.ll
index 50efd74b61b11..d632323f94978 100644
--- a/llvm/test/Analysis/MustExecute/loop-header.ll
+++ b/llvm/test/Analysis/MustExecute/loop-header.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes='print<must-execute>' %s 2>&1 | FileCheck %s
 
 define i1 @header_with_icf(ptr noalias %p, i32 %high) {
 ; CHECK-LABEL: @header_with_icf(
diff --git a/llvm/test/Analysis/MustExecute/must_be_executed_context.ll b/llvm/test/Analysis/MustExecute/must_be_executed_context.ll
index f3360f7cd0753..d57a54a84eb6e 100644
--- a/llvm/test/Analysis/MustExecute/must_be_executed_context.ll
+++ b/llvm/test/Analysis/MustExecute/must_be_executed_context.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=print-mustexecute               -disable-output 2>&1 | FileCheck %s --check-prefix=ME
+; RUN: opt < %s -passes='print<must-execute>'               -disable-output 2>&1 | FileCheck %s --check-prefix=ME
 ; RUN: opt < %s -passes=print-must-be-executed-contexts -disable-output 2>&1 | FileCheck %s --check-prefix=MBEC
 ;
 ;    void simple_conditional(int c) {
diff --git a/llvm/test/Analysis/MustExecute/pr57780.ll b/llvm/test/Analysis/MustExecute/pr57780.ll
index 4b26cadbb42c5..2f6a7a2831f8e 100644
--- a/llvm/test/Analysis/MustExecute/pr57780.ll
+++ b/llvm/test/Analysis/MustExecute/pr57780.ll
@@ -1,4 +1,4 @@
-; RUN: opt -disable-output -passes=print-mustexecute < %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes='print<must-execute>' < %s 2>&1 | FileCheck %s
 
 @c = global i16 0, align 2
 
diff --git a/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll b/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll
index 4b63c036f5491..47b8cc95388b4 100644
--- a/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll
+++ b/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes=print-memderefs -S < %s -disable-output  -use-dereferenceable-at-point-semantics 2>&1 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -passes='print<mem-derefs>' -S < %s -disable-output  -use-dereferenceable-at-point-semantics 2>&1 | FileCheck %s --check-prefixes=CHECK
 
 target datalayout = "e-i32:32:64"
 
diff --git a/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll b/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll
index 8c5216e0c45d9..fc40c68624f2d 100644
--- a/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll
+++ b/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll
@@ -1,5 +1,5 @@
-; RUN: opt -passes=print-memderefs -S < %s -disable-output  -use-dereferenceable-at-point-semantics=false 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL
-; RUN: opt -passes=print-memderefs -S < %s -disable-output  -use-dereferenceable-at-point-semantics 2>&1 | FileCheck %s --check-prefixes=CHECK,POINT
+; RUN: opt -passes='print<mem-derefs>' -S < %s -disable-output  -use-dereferenceable-at-point-semantics=false 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL
+; RUN: opt -passes='print<mem-derefs>' -S < %s -disable-output  -use-dereferenceable-at-point-semantics 2>&1 | FileCheck %s --check-prefixes=CHECK,POINT
 
 
 ; Uses the print-deref (+ analyze to print) pass to run
diff --git a/llvm/test/CodeGen/AArch64/inline-asm-clobber-x29-x30.ll b/llvm/test/CodeGen/AArch64/inline-asm-clobber-x29-x30.ll
new file mode 100644
index 0000000000000..cde55522fb19b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/inline-asm-clobber-x29-x30.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs < %s | FileCheck %s
+
+; Test that both numeric register names (x29, x30) and their architectural
+; aliases (fp, lr) work correctly as clobbers in inline assembly.
+
+define void @clobber_x29() nounwind {
+; CHECK-LABEL: clobber_x29:
+; CHECK:       str x29, [sp
+; CHECK-NEXT:  //APP
+; CHECK-NEXT:  //NO_APP
+; CHECK-NEXT:  ldr x29, [sp
+  tail call void asm sideeffect "", "~{x29}"()
+  ret void
+}
+
+define void @clobber_fp() nounwind {
+; CHECK-LABEL: clobber_fp:
+; CHECK:       str x29, [sp
+; CHECK-NEXT:  //APP
+; CHECK-NEXT:  //NO_APP
+; CHECK-NEXT:  ldr x29, [sp
+  tail call void asm sideeffect "", "~{fp}"()
+  ret void
+}
+
+define void @clobber_x30() nounwind {
+; CHECK-LABEL: clobber_x30:
+; CHECK:       str x30, [sp
+; CHECK-NEXT:  //APP
+; CHECK-NEXT:  //NO_APP
+; CHECK-NEXT:  ldr x30, [sp
+  tail call void asm sideeffect "", "~{x30}"()
+  ret void
+}
+
+define void @clobber_lr() nounwind {
+; CHECK-LABEL: clobber_lr:
+; CHECK:       str x30, [sp
+; CHECK-NEXT:  //APP
+; CHECK-NEXT:  //NO_APP
+; CHECK-NEXT:  ldr x30, [sp
+  tail call void asm sideeffect "", "~{lr}"()
+  ret void
+}
diff --git a/llvm/test/Other/debugcounter-predicateinfo.ll b/llvm/test/Other/debugcounter-predicateinfo.ll
index d91b0f8904cb0..57929220c6a28 100644
--- a/llvm/test/Other/debugcounter-predicateinfo.ll
+++ b/llvm/test/Other/debugcounter-predicateinfo.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -debug-counter=predicateinfo-rename=1 -passes=print-predicateinfo < %s 2>&1 | FileCheck %s
+; RUN: opt -debug-counter=predicateinfo-rename=1 -passes='print<predicate-info>' < %s 2>&1 | FileCheck %s
 ;; Test that, with debug counters on, we don't rename the first info, only the second
 define fastcc void @barney() {
 ; CHECK-LABEL: @barney(
diff --git a/llvm/test/Other/print-cfg-scc.ll b/llvm/test/Other/print-cfg-scc.ll
index 19ac00e3d7641..31ccd614a1918 100644
--- a/llvm/test/Other/print-cfg-scc.ll
+++ b/llvm/test/Other/print-cfg-scc.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=print-cfg-sccs -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='print<cfg-sccs>' -disable-output 2>&1 | FileCheck %s
 
 ; CHECK: SCC #1: %UnifiedExitNode
 ; CHECK: SCC #2: %loopexit.5, %loopexit.6, %loopentry.7, %loopentry.6, %loopentry.5, %endif.2
diff --git a/llvm/test/Transforms/FunctionSpecialization/ssa-copy.ll b/llvm/test/Transforms/FunctionSpecialization/ssa-copy.ll
index 11696cb76c8bb..b7cee46aa6f32 100644
--- a/llvm/test/Transforms/FunctionSpecialization/ssa-copy.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/ssa-copy.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
-; RUN: opt -passes=print-predicateinfo -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=PREDINF
+; RUN: opt -passes='print<predicate-info>' -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=PREDINF
 ; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=1       \
 ; RUN:                                 -funcspec-for-literal-constant=true \
 ; RUN:                                 -funcspec-min-codesize-savings=50   \
diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll
index e6de063969a6a..85e8c98455c91 100644
--- a/llvm/test/Transforms/InstCombine/logical-select.ll
+++ b/llvm/test/Transforms/InstCombine/logical-select.ll
@@ -1637,3 +1637,58 @@ define <2 x i1> @test_logical_and_icmp_samesign_vec_with_poison_tv(<2 x i8> %x)
   %and = select <2 x i1> %cmp1, <2 x i1> %cmp2, <2 x i1> zeroinitializer
   ret <2 x i1> %and
 }
+
+define i1 @test_logical_and_trunc_nuw(i1 %c, i8 noundef range(i8 0,2) %x) {
+; CHECK-LABEL: @test_logical_and_trunc_nuw(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[C:%.*]], [[TRUNC]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %trunc = trunc nuw i8 %x to i1
+  %and = select i1 %c, i1 %trunc, i1 false
+  ret i1 %and
+}
+
+define i1 @test_logical_or_trunc_nuw(i1 %c, i8 noundef range(i8 0,2) %x) {
+; CHECK-LABEL: @test_logical_or_trunc_nuw(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[C:%.*]], [[TRUNC]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %trunc = trunc nuw i8 %x to i1
+  %or = select i1 %c, i1 true, i1 %trunc
+  ret i1 %or
+}
+
+define <2 x i1> @test_logical_and_trunc_nuw_vec(<2 x i1> %c, <2 x i8> noundef range(i8 0,2) %x) {
+; CHECK-LABEL: @test_logical_and_trunc_nuw_vec(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nuw <2 x i8> [[X:%.*]] to <2 x i1>
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i1> [[C:%.*]], [[TRUNC]]
+; CHECK-NEXT:    ret <2 x i1> [[AND]]
+;
+  %trunc = trunc nuw <2 x i8> %x to <2 x i1>
+  %and = select <2 x i1> %c, <2 x i1> %trunc, <2 x i1> zeroinitializer
+  ret <2 x i1> %and
+}
+
+define i1 @neg_test_logical_and_trunc_nuw_no_range(i1 %c, i8 noundef %x) {
+; CHECK-LABEL: @neg_test_logical_and_trunc_nuw_no_range(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[AND:%.*]] = select i1 [[C:%.*]], i1 [[TRUNC]], i1 false
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %trunc = trunc nuw i8 %x to i1
+  %and = select i1 %c, i1 %trunc, i1 false
+  ret i1 %and
+}
+
+define i1 @neg_test_logical_and_trunc_nuw_no_noundef(i1 %c, i8 range(i8 0,2) %x) {
+; CHECK-LABEL: @neg_test_logical_and_trunc_nuw_no_noundef(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[AND:%.*]] = select i1 [[C:%.*]], i1 [[TRUNC]], i1 false
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %trunc = trunc nuw i8 %x to i1
+  %and = select i1 %c, i1 %trunc, i1 false
+  ret i1 %and
+}
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
index 96e74212dce4e..0a98a895f3223 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -epilogue-vectorization-force-VF=4 -S < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -epilogue-vectorization-force-VF=4 -force-target-supports-masked-memory-ops -S < %s | FileCheck %s
 
 define i64 @select_icmp_const(ptr %a, i64 %n) {
 ; CHECK-LABEL: define i64 @select_icmp_const(
@@ -908,3 +908,79 @@ loop:
 exit:
   ret i64 %sel
 }
+
+
+define i32 @predicated_iv_select(ptr %A) {
+; CHECK-LABEL: define i32 @predicated_iv_select(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[VECTOR_MAIN_LOOP_ITER_CHECK:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <8 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <8 x i32> [[WIDE_LOAD]], splat (i32 -1)
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[VEC_IND]], ptr align 4 [[TMP0]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <8 x i1> [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5]] = select i1 [[TMP4]], <8 x i1> [[TMP1]], <8 x i1> [[TMP7]]
+; CHECK-NEXT:    [[TMP6]] = select i1 [[TMP4]], <8 x i32> [[VEC_IND]], <8 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1104
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> [[TMP6]], <8 x i1> [[TMP5]], i32 -1)
+; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1104, %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[TMP8]], %[[VEC_EPILOG_VECTOR_BODY]] ], [ [[LAST_1:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[GEP_A]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], -1
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[LOOP_THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_THEN]]:
+; CHECK-NEXT:    [[T:%.*]] = trunc nuw nsw i64 [[IV]] to i32
+; CHECK-NEXT:    store i32 [[T]], ptr [[GEP_A]], align 4
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[LAST_1]] = phi i32 [ [[T]], %[[LOOP_THEN]] ], [ [[RED]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1111
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[LAST_1_LCSSA:%.*]] = phi i32 [ [[LAST_1]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[LAST_1_LCSSA]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %red = phi i32 [ -1, %entry ], [ %last.1, %loop.latch ]
+  %gep.A = getelementptr inbounds nuw i32, ptr %A, i64 %iv
+  %0 = load i32, ptr %gep.A, align 4
+  %cmp1 = icmp sgt i32 %0, -1
+  br i1 %cmp1, label %loop.then, label %loop.latch
+
+loop.then:
+  %t = trunc nuw nsw i64 %iv to i32
+  store i32 %t, ptr %gep.A, align 4
+  br label %loop.latch
+
+loop.latch:
+  %last.1 = phi i32 [ %t, %loop.then ], [ %red, %loop.header ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1111
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret i32 %last.1
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr25281.ll b/llvm/test/Transforms/LoopVectorize/pr25281.ll
index 06031d7e925af..4b8369936123c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr25281.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr25281.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -aa-pipeline=scev-aa -passes=loop-vectorize,print-alias-sets -S  -o - 2>&1 | FileCheck %s
+; RUN: opt < %s  -aa-pipeline=scev-aa -passes='loop-vectorize,print<alias-sets>' -S  -o - 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; PR25281
diff --git a/llvm/test/Transforms/Util/PredicateInfo/assume-operand-bundles.ll b/llvm/test/Transforms/Util/PredicateInfo/assume-operand-bundles.ll
index ab23d3354da66..646e5383fade4 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/assume-operand-bundles.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/assume-operand-bundles.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -S -passes=print-predicateinfo %s 2>&1 >/dev/null | FileCheck %s
+; RUN: opt -S -passes='print<predicate-info>' %s 2>&1 >/dev/null | FileCheck %s
 
 declare void @use(i1)
 
diff --git a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll
index f024106b7299a..8feec397ac545 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -S -passes=print-predicateinfo < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: opt -S -passes='print<predicate-info>' < %s 2>&1 >/dev/null | FileCheck %s
 
 ; FIXME:  RenamedOp should be %cmp or %x in all cases here,
 ; which is the value used in the condition.
diff --git a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
index 42e8ccb760b3f..e9e36c7a2fdc6 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' -disable-output < %s 2>&1 | FileCheck %s
 
 @a = external global i32		; <ptr> [#uses=7]
 
diff --git a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
index 06c02d699c511..4e4ee0f2b09bd 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' -disable-output < %s 2>&1 | FileCheck %s
 define i1 @f(i32 %x, i1 %y) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:    br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]]
diff --git a/llvm/test/Transforms/Util/PredicateInfo/edge.ll b/llvm/test/Transforms/Util/PredicateInfo/edge.ll
index 913832696215e..de18a7ad4474f 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/edge.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/edge.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' -disable-output < %s 2>&1 | FileCheck %s
 
 define i32 @f1(i32 %x) {
 ; CHECK-LABEL: @f1(
diff --git a/llvm/test/Transforms/Util/PredicateInfo/ordering.ll b/llvm/test/Transforms/Util/PredicateInfo/ordering.ll
index d7ce9dc652c8b..3a1cdf1ee955d 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/ordering.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/ordering.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=print-predicateinfo -debug < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' -debug < %s 2>&1 | FileCheck %s
 
 declare void @use(i32)
 declare void @use.i1(i1)
diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll
index 4762d376ef5aa..3f26f437c860a 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' -disable-output < %s 2>&1 | FileCheck %s
 ; Don't insert predicate info for conditions with a single target.
 @a = global i32 1, align 4
 @d = common global i32 0, align 4
diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll
index e4fd4cc6dd8a2..138297ccdb4b5 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' -disable-output < %s 2>&1 | FileCheck %s
 ; Don't insert predicate info for conditions with a single target.
 @a = global i32 6, align 4
 @c = global i32 -1, align 4
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index d29aadd54128c..bee4fcd292427 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments
-; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' -disable-output < %s 2>&1 | FileCheck %s
 
 declare void @foo(i1)
 declare void @bar(i32)
diff --git a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll
index faf4bec61c935..0f1685f942fa1 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll
@@ -1,4 +1,4 @@
-; RUN: opt -disable-output -passes=print-predicateinfo < %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes='print<predicate-info>' < %s 2>&1 | FileCheck %s
 
 %1 = type opaque
 %0 = type opaque
diff --git a/llvm/test/Transforms/Util/PredicateInfo/unreachable.ll b/llvm/test/Transforms/Util/PredicateInfo/unreachable.ll
index 99314d45913e0..05536b0962906 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/unreachable.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s
+; RUN: opt -passes='print<predicate-info>' < %s 2>&1 | FileCheck %s
 
 declare void @foo()
 declare void @llvm.assume(i1)
diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
index 01285c6c0ec09..265aca75fae36 100644
--- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
+++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
@@ -15,9 +15,11 @@
 #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
 #include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/FormatVariadic.h"
 
 #define DEBUG_TYPE "math-to-spirv-pattern"
@@ -360,75 +362,81 @@ struct PowFOpPattern final : public OpConversionPattern<math::PowFOp> {
     if (!dstType)
       return failure();
 
-    // Get the scalar float type.
-    FloatType scalarFloatType;
-    if (auto scalarType = dyn_cast<FloatType>(powfOp.getType())) {
-      scalarFloatType = scalarType;
-    } else if (auto vectorType = dyn_cast<VectorType>(powfOp.getType())) {
-      scalarFloatType = cast<FloatType>(vectorType.getElementType());
-    } else {
-      return failure();
+    Location loc = powfOp.getLoc();
+    Type operandType = adaptor.getRhs().getType();
+
+    // Parity-based lowering requires an integer-valued constant exponent.
+    // Otherwise fall back to exp(y*log(x)), which yields NaN for x<0 (matches
+    // C).
+    auto isOdd = [](const APFloat &v) {
+      APSInt i(/*BitWidth=*/64, /*isUnsigned=*/false);
+      bool ignored;
+      v.convertToInteger(i, APFloat::rmTowardZero, &ignored);
+      return i[0];
+    };
+
+    SmallVector<bool> oddMask;
+    Attribute rhsAttr;
+    if (matchPattern(adaptor.getRhs(), m_Constant(&rhsAttr))) {
+      TypeSwitch<Attribute>(rhsAttr)
+          .Case([&](FloatAttr a) {
+            if (a.getValue().isInteger())
+              oddMask.push_back(isOdd(a.getValue()));
+          })
+          .Case([&](SplatElementsAttr a) {
+            APFloat splat = a.getSplatValue<APFloat>();
+            if (splat.isInteger())
+              oddMask.push_back(isOdd(splat));
+          })
+          .Case([&](DenseElementsAttr a) {
+            SmallVector<bool> mask;
+            for (const APFloat &elt : a.getValues<APFloat>()) {
+              if (!elt.isInteger())
+                return;
+              mask.push_back(isOdd(elt));
+            }
+            oddMask = std::move(mask);
+          });
     }
 
-    // Get int type of the same shape as the float type.
-    Type scalarIntType = rewriter.getIntegerType(32);
-    Type intType = scalarIntType;
-    auto operandType = adaptor.getRhs().getType();
-    if (auto vectorType = dyn_cast<VectorType>(operandType)) {
-      auto shape = vectorType.getShape();
-      intType = VectorType::get(shape, scalarIntType);
+    if (oddMask.empty()) {
+      Value log = spirv::GLLogOp::create(rewriter, loc, adaptor.getLhs());
+      Value mul = spirv::FMulOp::create(rewriter, loc, adaptor.getRhs(), log);
+      rewriter.replaceOpWithNewOp<spirv::GLExpOp>(powfOp, mul);
+      return success();
+    }
+
+    // GL.Pow is undefined for x < 0; take abs and conditionally negate the
+    // result for lanes whose exponent is odd.
+    Value abs = spirv::GLFAbsOp::create(rewriter, loc, adaptor.getLhs());
+    Value pow = spirv::GLPowOp::create(rewriter, loc, abs, adaptor.getRhs());
+
+    // No odd-parity element: result has the same sign as |lhs|^rhs >= 0.
+    if (llvm::none_of(oddMask, [](bool b) { return b; })) {
+      rewriter.replaceOp(powfOp, pow);
+      return success();
     }
 
-    // Per GL Pow extended instruction spec:
-    // "Result is undefined if x < 0. Result is undefined if x = 0 and y <= 0."
-    Location loc = powfOp.getLoc();
     Value zero = spirv::ConstantOp::getZero(operandType, loc, rewriter);
     Value lessThan =
         spirv::FOrdLessThanOp::create(rewriter, loc, adaptor.getLhs(), zero);
-
-    // Per C/C++ spec:
-    // > pow(base, exponent) returns NaN (and raises FE_INVALID) if base is
-    // > finite and negative and exponent is finite and non-integer.
-    // Calculate the reminder from the exponent and check whether it is zero.
-    Value floatOne = spirv::ConstantOp::getOne(operandType, loc, rewriter);
-    Value expRem =
-        spirv::FRemOp::create(rewriter, loc, adaptor.getRhs(), floatOne);
-    Value expRemNonZero =
-        spirv::FOrdNotEqualOp::create(rewriter, loc, expRem, zero);
-    Value cmpNegativeWithFractionalExp =
-        spirv::LogicalAndOp::create(rewriter, loc, expRemNonZero, lessThan);
-    // Create NaN result and replace base value if conditions are met.
-    const auto &floatSemantics = scalarFloatType.getFloatSemantics();
-    const auto nan = APFloat::getNaN(floatSemantics);
-    Attribute nanAttr = rewriter.getFloatAttr(scalarFloatType, nan);
-    if (auto vectorType = dyn_cast<VectorType>(operandType))
-      nanAttr = DenseElementsAttr::get(vectorType, nan);
-
-    Value nanValue =
-        spirv::ConstantOp::create(rewriter, loc, operandType, nanAttr);
-    Value lhs =
-        spirv::SelectOp::create(rewriter, loc, cmpNegativeWithFractionalExp,
-                                nanValue, adaptor.getLhs());
-    Value abs = spirv::GLFAbsOp::create(rewriter, loc, lhs);
-
-    // TODO: The following just forcefully casts y into an integer value in
-    // order to properly propagate the sign, assuming integer y cases. It
-    // doesn't cover other cases and should be fixed.
-
-    // Cast exponent to integer and calculate exponent % 2 != 0.
-    Value intRhs =
-        spirv::ConvertFToSOp::create(rewriter, loc, intType, adaptor.getRhs());
-    Value intOne = spirv::ConstantOp::getOne(intType, loc, rewriter);
-    Value bitwiseAndOne =
-        spirv::BitwiseAndOp::create(rewriter, loc, intRhs, intOne);
-    Value isOdd = spirv::IEqualOp::create(rewriter, loc, bitwiseAndOne, intOne);
-
-    // calculate pow based on abs(lhs)^rhs.
-    Value pow = spirv::GLPowOp::create(rewriter, loc, abs, adaptor.getRhs());
     Value negate = spirv::FNegateOp::create(rewriter, loc, pow);
-    // if the exponent is odd and lhs < 0, negate the result.
-    Value shouldNegate =
-        spirv::LogicalAndOp::create(rewriter, loc, lessThan, isOdd);
+
+    Value shouldNegate;
+    if (llvm::all_equal(oddMask)) {
+      // Every lane has odd exponent: negate iff lhs < 0.
+      shouldNegate = lessThan;
+    } else {
+      // Mixed parity (non-splat dense vector): AND lhs<0 with a per-element
+      // constant odd-mask.
+      auto vecType = cast<VectorType>(operandType);
+      auto maskType = VectorType::get(vecType.getShape(), rewriter.getI1Type());
+      Value oddConst = spirv::ConstantOp::create(
+          rewriter, loc, maskType, DenseElementsAttr::get(maskType, oddMask));
+      shouldNegate =
+          spirv::LogicalAndOp::create(rewriter, loc, lessThan, oddConst);
+    }
+
     rewriter.replaceOpWithNewOp<spirv::SelectOp>(powfOp, shouldNegate, negate,
                                                  pow);
     return success();
diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
index 8eb533eeff2a9..08d7822d04cc1 100644
--- a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
+++ b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
@@ -183,45 +183,98 @@ func.func @ctlz_vector2(%val: vector<2xi32>) -> vector<2xi32> {
   return %0 : vector<2xi32>
 }
 
+// Dynamic exponent: exp(y * log(x)); yields NaN for x<0.
 // CHECK-LABEL: @powf_scalar
 //  CHECK-SAME: (%[[LHS:.+]]: f32, %[[RHS:.+]]: f32)
 func.func @powf_scalar(%lhs: f32, %rhs: f32) -> f32 {
+  // CHECK: %[[LOG:.+]] = spirv.GL.Log %[[LHS]] : f32
+  // CHECK: %[[MUL:.+]] = spirv.FMul %[[RHS]], %[[LOG]] : f32
+  // CHECK: %[[EXP:.+]] = spirv.GL.Exp %[[MUL]] : f32
+  %0 = math.powf %lhs, %rhs : f32
+  // CHECK: return %[[EXP]]
+  return %0: f32
+}
+
+// CHECK-LABEL: @powf_vector
+func.func @powf_vector(%lhs: vector<4xf32>, %rhs: vector<4xf32>) -> vector<4xf32> {
+  // CHECK: spirv.GL.Log %{{.*}} : vector<4xf32>
+  // CHECK: spirv.FMul %{{.*}} : vector<4xf32>
+  // CHECK: spirv.GL.Exp %{{.*}} : vector<4xf32>
+  %0 = math.powf %lhs, %rhs : vector<4xf32>
+  return %0: vector<4xf32>
+}
+
+// Constant odd integer exponent: parity is known statically, so the lowering
+// drops the runtime FToS/BitwiseAnd/IEqual/LogicalAnd parity computation.
+// CHECK-LABEL: @powf_const_odd_int_exp
+//  CHECK-SAME: (%[[LHS:.+]]: f32)
+func.func @powf_const_odd_int_exp(%lhs: f32) -> f32 {
+  // CHECK: %[[RHS:.+]] = arith.constant 3.000000e+00 : f32
+  // CHECK: %[[ABS:.+]] = spirv.GL.FAbs %[[LHS]] : f32
+  // CHECK: %[[POW:.+]] = spirv.GL.Pow %[[ABS]], %[[RHS]] : f32
   // CHECK: %[[F0:.+]] = spirv.Constant 0.000000e+00 : f32
   // CHECK: %[[LT:.+]] = spirv.FOrdLessThan %[[LHS]], %[[F0]] : f32
-  // CHECK: %[[F1:.+]] = spirv.Constant 1.000000e+00 : f32
-  // CHECK: %[[REM:.+]] = spirv.FRem %[[RHS]], %[[F1]] : f32
-  // CHECK: %[[IS_FRACTION:.+]] = spirv.FOrdNotEqual %[[REM]], %[[F0]] : f32
-  // CHECK: %[[AND:.+]] = spirv.LogicalAnd %[[IS_FRACTION]], %[[LT]] : i1
-  // CHECK: %[[NAN:.+]] = spirv.Constant 0x7FC00000 : f32
-  // CHECK: %[[NEW_LHS:.+]] = spirv.Select %[[AND]], %[[NAN]], %[[LHS]] : i1, f32
-  // CHECK: %[[ABS:.+]] = spirv.GL.FAbs %[[NEW_LHS]] : f32
-  // CHECK: %[[IRHS:.+]] = spirv.ConvertFToS
-  // CHECK: %[[CST1:.+]] = spirv.Constant 1 : i32
-  // CHECK: %[[REM:.+]] = spirv.BitwiseAnd %[[IRHS]]
-  // CHECK: %[[ODD:.+]] = spirv.IEqual %[[REM]], %[[CST1]] : i32
-  // CHECK: %[[POW:.+]] = spirv.GL.Pow %[[ABS]], %[[RHS]] : f32
   // CHECK: %[[NEG:.+]] = spirv.FNegate %[[POW]] : f32
-  // CHECK: %[[SNEG:.+]] = spirv.LogicalAnd %[[LT]], %[[ODD]] : i1
-  // CHECK: %[[SEL:.+]] = spirv.Select %[[SNEG]], %[[NEG]], %[[POW]] : i1, f32
-  %0 = math.powf %lhs, %rhs : f32
+  // CHECK: %[[SEL:.+]] = spirv.Select %[[LT]], %[[NEG]], %[[POW]] : i1, f32
+  %c = arith.constant 3.0 : f32
+  %0 = math.powf %lhs, %c : f32
   // CHECK: return %[[SEL]]
   return %0: f32
 }
 
-// CHECK-LABEL: @powf_vector
-func.func @powf_vector(%lhs: vector<4xf32>, %rhs: vector<4xf32>) -> vector<4xf32> {
+// Constant even integer exponent: result is non-negative, no select needed.
+// CHECK-LABEL: @powf_const_even_int_exp
+//  CHECK-SAME: (%[[LHS:.+]]: f32)
+func.func @powf_const_even_int_exp(%lhs: f32) -> f32 {
+  // CHECK: %[[RHS:.+]] = arith.constant 4.000000e+00 : f32
+  // CHECK: %[[ABS:.+]] = spirv.GL.FAbs %[[LHS]] : f32
+  // CHECK: %[[POW:.+]] = spirv.GL.Pow %[[ABS]], %[[RHS]] : f32
+  %c = arith.constant 4.0 : f32
+  %0 = math.powf %lhs, %c : f32
+  // CHECK: return %[[POW]]
+  return %0: f32
+}
+
+// Constant non-integer exponent: falls into the dynamic exp(y*log(x)) path.
+// CHECK-LABEL: @powf_const_frac_exp
+//  CHECK-SAME: (%[[LHS:.+]]: f32)
+func.func @powf_const_frac_exp(%lhs: f32) -> f32 {
+  // CHECK: %[[RHS:.+]] = arith.constant 2.500000e+00 : f32
+  // CHECK: %[[LOG:.+]] = spirv.GL.Log %[[LHS]] : f32
+  // CHECK: %[[MUL:.+]] = spirv.FMul %[[RHS]], %[[LOG]] : f32
+  // CHECK: %[[EXP:.+]] = spirv.GL.Exp %[[MUL]] : f32
+  %c = arith.constant 2.5 : f32
+  %0 = math.powf %lhs, %c : f32
+  // CHECK: return %[[EXP]]
+  return %0: f32
+}
+
+// Splat constant odd integer-valued vector exponent: uniform odd parity.
+// CHECK-LABEL: @powf_const_odd_int_exp_vector
+func.func @powf_const_odd_int_exp_vector(%lhs: vector<4xf32>) -> vector<4xf32> {
+  // CHECK: spirv.GL.FAbs
+  // CHECK: spirv.GL.Pow %{{.*}}: vector<4xf32>
   // CHECK: spirv.FOrdLessThan
-  // CHECK: spirv.FRem
-  // CHECK: spirv.FOrdNotEqual
-  // CHECK: spirv.LogicalAnd
+  // CHECK: spirv.FNegate
   // CHECK: spirv.Select
+  %c = arith.constant dense<3.0> : vector<4xf32>
+  %0 = math.powf %lhs, %c : vector<4xf32>
+  return %0: vector<4xf32>
+}
+
+// Mixed-parity constant integer-valued vector exponent: per-element odd-mask
+// constant is materialized and AND-ed with lhs<0.
+// CHECK-LABEL: @powf_const_mixed_int_exp_vector
+func.func @powf_const_mixed_int_exp_vector(%lhs: vector<4xf32>) -> vector<4xf32> {
   // CHECK: spirv.GL.FAbs
-  // CHECK: spirv.BitwiseAnd %{{.*}} : vector<4xi32>
-  // CHECK: spirv.IEqual %{{.*}} : vector<4xi32>
   // CHECK: spirv.GL.Pow %{{.*}}: vector<4xf32>
+  // CHECK: spirv.FOrdLessThan
   // CHECK: spirv.FNegate
+  // CHECK: %[[ODD:.+]] = spirv.Constant dense<[true, false, true, false]> : vector<4xi1>
+  // CHECK: spirv.LogicalAnd %{{.*}}, %[[ODD]] : vector<4xi1>
   // CHECK: spirv.Select
-  %0 = math.powf %lhs, %rhs : vector<4xf32>
+  %c = arith.constant dense<[3.0, 2.0, 5.0, 4.0]> : vector<4xf32>
+  %0 = math.powf %lhs, %c : vector<4xf32>
   return %0: vector<4xf32>
 }