2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -19,6 +19,7 @@ Full documentation for MIGraphX is available at
* Changed parsing of ONNX ops like ConstantOfShape to insert undefined if expected shape has 0 elements (#4567).
* Updated the ONNX clip operator to support opset 13 (#4518).
* Updated `argmin` and `argmax` ops to be implemented as reduction ops, so they now have JIT support and can fuse (#4620).
* Updated GPU stream-count, NHWC layout, and MLIR attention defaults to adapt to the detected architecture (#4668).

### Resolved issues

@@ -34,6 +35,7 @@ Full documentation for MIGraphX is available at
* Added a new pass that replaces a convolution whose input is a constant broadcast with a reduced GEMM, improving model compilation time (#4621).
* Implemented JIT compilation for `logsoftmax` by decomposing it into fusible operations (`log`, `exp`, `reduce_max`, `reduce_sum`), enabling kernel fusion (#4630).
* Added an early return to the `find_conv_dot_horiz_fusion` matcher when the operator's output size is less than two (#4662).
* Cached repeated HIP compilation and MIOpen solution lookups, and tuned GPU stream partitioning and pointwise launch bounds (#4668).

### Removed
* Removed legacy device implementations for `argmin` and `argmax` in favor of the JIT implementations recently added (#4658).
36 changes: 24 additions & 12 deletions docs/reference/MIGraphX-dev-env-vars.rst
@@ -21,13 +21,16 @@ Model performance tunable variables change the compilation behavior of a model.
* - Environment variable
- Values

* - | ``MIGRAPHX_ENABLE_NHWC``
| Forces the model to use the NHWC layout.

- | ``1``: Forces the use of the NHWC layout.
| ``0``: Returns to default behavior.

| Default: The use of the NHWC layout isn't forced.
* - | ``MIGRAPHX_ENABLE_NHWC``
| Forces the model to use the NHWC layout.

- | ``1``: Forces the use of the NHWC layout.
| ``0``: Disables the automatic NHWC policy and keeps convolution layout transforms off.

| Default: NHWC is enabled automatically on ``gfx942+``, ``gfx95x``, ``gfx11x``, and
| ``gfx12x`` targets, while older targets keep the legacy NCHW behavior.
|
| Note: Group convolutions still remain in NCHW for performance.
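
The architecture gate described above can be sketched as a standalone check. The prefix matching and the lexicographic comparison below are illustrative assumptions, not MIGraphX's actual implementation:

```cpp
#include <cassert>
#include <string>

// Illustrative check for whether a device name falls in the NHWC-by-default
// set (gfx942+, gfx95x, gfx11x, gfx12x); the real MIGraphX gate may differ.
bool nhwc_default_enabled(const std::string& device_name)
{
    for(const char* prefix : {"gfx95", "gfx11", "gfx12"})
        if(device_name.rfind(prefix, 0) == 0)
            return true;
    // Within the gfx9 family, names compare lexicographically, so "gfx942"
    // and later qualify while older names such as "gfx90a" do not.
    if(device_name.rfind("gfx9", 0) == 0)
        return device_name >= "gfx942";
    return false;
}
```

Older targets such as ``gfx90a`` or ``gfx1030`` would fall through to the legacy NCHW behavior under this sketch.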

* - | ``MIGRAPHX_DISABLE_MLIR``
| When set, the rocMLIR library won't be used.
@@ -704,11 +707,20 @@ Advanced settings

| Default: A null stream can't be used for stream handling.

* - | ``MIGRAPHX_NSTREAMS``
| Sets the number of HIP streams to use in the GPU.

- | Takes a positive integer.
| Default: one stream will be used.
* - | ``MIGRAPHX_NSTREAMS``
| Sets the number of HIP streams to use in the GPU.

- | Takes a positive integer.
| Set to ``0`` to use the adaptive default.
| Default: adaptive stream count based on GPU size.
|
| Current defaults:
| ``1`` stream for GPUs with fewer than 32 compute units
| ``2`` streams for GPUs with 32 to 63 compute units
| ``3`` streams for GPUs with 64 to 95 compute units
| ``4`` streams for GPUs with 96 or more compute units
|
| Note: When ``MIGRAPHX_ENABLE_NULL_STREAM=1`` is set, the adaptive default falls back to ``1`` stream.
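
The table above amounts to a small step function over the compute-unit count. A minimal sketch, assuming the compute-unit count is already known (the function name and signature are hypothetical, not MIGraphX's):

```cpp
#include <cassert>
#include <cstddef>

// Map a GPU's compute-unit count to the default HIP stream count from the
// table above; MIGRAPHX_ENABLE_NULL_STREAM=1 collapses it back to one stream.
std::size_t adaptive_nstreams(std::size_t compute_units, bool null_stream = false)
{
    if(null_stream)
        return 1;
    if(compute_units < 32)
        return 1;
    if(compute_units < 64)
        return 2;
    if(compute_units < 96)
        return 3;
    return 4;
}
```

In the real library the compute-unit count would come from a device query such as ``hipDeviceGetAttribute`` with ``hipDeviceAttributeMultiprocessorCount``.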

* - | ``MIGRAPHX_TRACE_BENCHMARKING``
| Sets the verbosity of benchmarking traces.
1 change: 1 addition & 0 deletions src/include/migraphx/compile_options.hpp
@@ -40,6 +40,7 @@ struct compile_options

bool fast_math = true;
bool exhaustive_tune = false;
bool disable_nhwc = false;

tracer trace{};
};
17 changes: 17 additions & 0 deletions src/include/migraphx/schedule_model.hpp
@@ -48,6 +48,8 @@ struct schedule_model
{
/// Get the number of concurrent instructions allowed
std::size_t concurrency() const;
/// Get the minimum accumulated weight required to split a partition
std::size_t split_threshold() const;
/// Schedule a concurrent instruction
void sched(module& m, instruction_ref ins, std::size_t n) const;
// Insert necessary waits before an instruction
@@ -68,6 +70,8 @@ struct MIGRAPHX_EXPORT schedule_model
//
std::size_t concurrency() const;
//
std::size_t split_threshold() const;
//
void sched(module& m, instruction_ref ins, std::size_t n) const;
//
void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
@@ -99,6 +103,7 @@ struct schedule_model
template <class PrivateDetailTypeErasedT>
using private_te_constraints_impl =
decltype(std::declval<PrivateDetailTypeErasedT>().concurrency(),
std::declval<PrivateDetailTypeErasedT>().split_threshold(),
std::declval<PrivateDetailTypeErasedT>().sched(std::declval<module&>(),
std::declval<instruction_ref>(),
std::declval<std::size_t>()),
@@ -191,6 +196,12 @@ struct schedule_model
return (*this).private_detail_te_get_handle().concurrency();
}

std::size_t split_threshold() const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().split_threshold();
}

void sched(module& m, instruction_ref ins, std::size_t n) const
{
assert((*this).private_detail_te_handle_mem_var);
@@ -230,6 +241,7 @@ struct schedule_model
virtual const std::type_info& type() const = 0;

virtual std::size_t concurrency() const = 0;
virtual std::size_t split_threshold() const = 0;
virtual void sched(module& m, instruction_ref ins, std::size_t n) const = 0;
virtual void wait(module& m, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void record(module& m, instruction_ref ins, std::size_t wait_id) const = 0;
@@ -266,6 +278,11 @@ struct schedule_model

std::size_t concurrency() const override { return private_detail_te_value.concurrency(); }

std::size_t split_threshold() const override
{
return private_detail_te_value.split_threshold();
}

void sched(module& m, instruction_ref ins, std::size_t n) const override
{

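The additions in this header all follow MIGraphX's type-erasure idiom: the new `split_threshold()` accessor has to appear in the concept check, the public wrapper, the virtual interface, and the templated implementation. Stripped down to the two accessors touched here, the pattern looks roughly like this (a simplified sketch, not the generated code):

```cpp
#include <cassert>
#include <cstddef>
#include <memory>
#include <utility>

// Simplified type-erased model: any type providing concurrency() and
// split_threshold() can be stored behind one uniform interface.
class any_schedule_model
{
    struct interface
    {
        virtual ~interface()                        = default;
        virtual std::size_t concurrency() const     = 0;
        virtual std::size_t split_threshold() const = 0;
    };

    template <class T>
    struct impl final : interface
    {
        explicit impl(T x) : value(std::move(x)) {}
        std::size_t concurrency() const override { return value.concurrency(); }
        std::size_t split_threshold() const override { return value.split_threshold(); }
        T value;
    };

    std::shared_ptr<const interface> handle;

    public:
    template <class T>
    any_schedule_model(T x) : handle(std::make_shared<impl<T>>(std::move(x))) {}

    std::size_t concurrency() const { return handle->concurrency(); }
    std::size_t split_threshold() const { return handle->split_threshold(); }
};
```

Forgetting any one of the four touch points in the real header would produce either a compile error in the concept check or a pure-virtual instantiation failure, which is why the diff repeats the accessor in several places.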
1 change: 1 addition & 0 deletions src/msgpack.cpp
@@ -23,6 +23,7 @@
*/
#include <migraphx/msgpack.hpp>
#include <migraphx/serialize.hpp>
#include <limits>
#include <msgpack.hpp>

namespace migraphx {
5 changes: 4 additions & 1 deletion src/quantization.cpp
@@ -30,6 +30,7 @@
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/optimize_module.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/program.hpp>
@@ -134,7 +135,9 @@ static void quantize_8bits(program& prog,

// use the calibration data to compute the quantization scale
auto capture_prog = prog;
capture_prog.compile(t);
compile_options capture_compile_options;
capture_compile_options.disable_nhwc = true;
capture_prog.compile(t, capture_compile_options);

// use all calibration data to run the program to calculate the
// quantization scale and shift
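The calibration run above collects value ranges that are then turned into quantization scales. A hypothetical sketch of a symmetric int8 scale computation of the kind such a calibration pass feeds (the function and formula are illustrative assumptions; MIGraphX's actual scale/shift computation may differ):

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// Map the absolute maximum observed during calibration onto the int8 range
// [-127, 127]; an all-zero tensor falls back to a scale of 1.
float int8_scale(const std::vector<float>& calibration_values)
{
    float absmax = 0.0f;
    for(float v : calibration_values)
        absmax = std::max(absmax, std::fabs(v));
    return absmax == 0.0f ? 1.0f : absmax / 127.0f;
}
```

Compiling the calibration copy with ``disable_nhwc = true`` keeps the captured values in the original layout, so the scales match the tensors the quantized program will actually see.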
10 changes: 5 additions & 5 deletions src/schedule.cpp
@@ -107,14 +107,14 @@ struct stream_info
}));
}

std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args)
std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args,
std::size_t min_partition_threshold)
{
if(args.size() < 2)
{
return args.end();
}

const std::size_t min_partition_threshold = 2;
sort_args_by_weight(args, std::greater<>{});

auto it = std::lower_bound(std::next(args.begin()),
@@ -139,7 +139,7 @@
}
};

std::size_t assign_streams(module& m, std::size_t n)
std::size_t assign_streams(module& m, std::size_t n, std::size_t min_partition_threshold)
{
assert(n > 0);
partition critical;
@@ -157,7 +157,7 @@
part.add(ins, this->iweights[ins]);

auto args = ins->inputs();
auto threshold_it = this->sort_args(args);
auto threshold_it = this->sort_args(args, min_partition_threshold);

if(not args.empty())
{
@@ -539,7 +539,7 @@ void schedule::apply(module& m) const
si.calc_implicit_deps(m);
auto last = std::prev(m.end());
si.accumulate_weights(last, model);
auto nstreams = si.assign_streams(m, model.concurrency());
auto nstreams = si.assign_streams(m, model.concurrency(), model.split_threshold());
si.sort(m, model.concurrency());

if(enabled(MIGRAPHX_TRACE_COMPILE{}) or enabled(MIGRAPHX_TRACE_SCHEDULE{}))
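
The threshold logic that `sort_args` now takes as a parameter (instead of the hard-coded `2`) can be sketched on plain weights. `split_point` below is a standalone analogue of that logic, not MIGraphX code:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <iterator>
#include <vector>

// Sort weights heaviest-first, then find the first argument too light to
// justify its own stream partition: lower_bound with greater<> on a
// descending range returns the first element <= min_partition_threshold.
std::vector<std::size_t>::iterator
split_point(std::vector<std::size_t>& weights, std::size_t min_partition_threshold)
{
    if(weights.size() < 2)
        return weights.end();
    std::sort(weights.begin(), weights.end(), std::greater<>{});
    return std::lower_bound(std::next(weights.begin()),
                            weights.end(),
                            min_partition_threshold,
                            std::greater<>{});
}
```

With weights `{1, 9, 5, 2}` and a threshold of `2`, the vector sorts to `{9, 5, 2, 1}` and the split lands before the `2`: only the heavy arguments ahead of the split are candidates for their own stream partitions. Making the threshold a `schedule_model` parameter lets each target tune how aggressively the scheduler splits work across the adaptive stream count.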