Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 0 additions & 152 deletions .github/workflows/compile_on_aws.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/static_checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
run: |
git config --global --add safe.directory /__w/OGL/OGL
# Create list of all source files belonging to this repository
git ls-files | grep -E "\.(C)" > pattern
git ls-files | grep -E "\.(cpp)" > pattern
# Create list of .cpp files that are in this repository and part of the
# compilation database
# also filters out " at the begin and end of the filename
Expand Down
12 changes: 12 additions & 0 deletions include/OGL/CommunicationPattern.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,18 @@ struct AllToAllPattern {
std::vector<int> recv_offsets;
};

/* @brief Computes the AllToAllPattern for a repartitioned communicator from
 * the global allToAll pattern by discarding all zero-size communication
 * before and after the repartitioner scope.
 *
 * @param exec_handler The executor handler
 * @param allToAll The original allToAll pattern on the global communicator
 * @param start_rank The original comm_world rank at which the repartitioned
 *                   scope starts
 * @return The reduced AllToAllPattern valid on the repartitioned communicator
 */
AllToAllPattern compute_repart_allToall(const ExecutorHandler &exec_handler,
                                        const AllToAllPattern allToAll,
                                        label start_rank);

/* @brief This function computes the send and recv counts vectors and the send
* and recv offsets vectors for scattering from an owner to all ranks, including
* owner itself
Expand Down
22 changes: 20 additions & 2 deletions include/OGL/DevicePersistent/ExecutorHandler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ struct DeviceIdHandler {
return device_global_id % num_devices_per_node;
}

/* @brief Returns the owning rank on the global comm world communicator,
 * i.e. this rank's global rank rounded down to the nearest multiple of
 * ranks_per_gpu.
 */
label global_owner() const
{
    const label my_rank = Pstream::myProcNo();
    return (my_rank / ranks_per_gpu) * ranks_per_gpu;
}

/* @brief check if rank is an owning rank
*/
bool is_owner() const
Expand Down Expand Up @@ -270,15 +278,16 @@ class ExecutorHandler
// gko comm
label group = device_id_handler_.compute_group();
MPI_Comm gko_comm;
label host_rank = 0;
label host_rank = Pstream::myProcNo();
MPI_Comm_split(MPI_COMM_WORLD, group, host_rank, &gko_comm);
device_comm_ =
std::make_shared<gko::experimental::mpi::communicator>(
gko_comm, gko_force_host_buffer_);

// repart comm
MPI_Comm repart_comm;
label device_id = device_id_handler_.compute_device_id(4);
label global_rank = Pstream::myProcNo();
label device_id = global_rank / device_id_handler_.ranks_per_gpu;
MPI_Comm_split(MPI_COMM_WORLD, device_id, host_rank, &repart_comm);
repart_comm_ =
std::make_shared<gko::experimental::mpi::communicator>(
Expand All @@ -300,6 +309,15 @@ class ExecutorHandler
* */
bool get_non_orig_device_comm() const { return non_orig_device_comm_; }

label get_ranks_per_gpu() const { return device_id_handler_.ranks_per_gpu; }

/* @brief Sets the number of ranks sharing one GPU.
 *
 * @param ranks_per_gpu The new number of ranks per GPU
 */
void set_ranks_per_gpu(label ranks_per_gpu)
{
    device_id_handler_.ranks_per_gpu = ranks_per_gpu;
}

label get_owner_rank() const { return device_id_handler_.global_owner(); }

const std::shared_ptr<gko::Executor> get_device_exec() const
{
return this->get_persistent_object();
Expand Down
38 changes: 33 additions & 5 deletions include/OGL/DevicePersistent/Vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ struct VectorInitFunctor {
//// TODO store
auto comm_pattern = compute_gather_to_owner_counts(
exec_, repartitioner->get_ranks_per_gpu(), host_size);
bool host_buffer = !exec_.get_non_orig_device_comm();
bool host_buffer = exec_.get_gko_force_host_buffer();

communicate_values(ref_exec, exec, comm, comm_pattern,
host_view.get_const_data(),
Expand Down Expand Up @@ -177,7 +177,8 @@ class PersistentVector
auto rank = exec_.get_host_rank();
auto ref_exec = exec_.get_ref_exec();
auto comm = exec_.get_host_comm();
bool host_buffer = !exec_.get_non_orig_device_comm();
auto repart_comm = exec_.get_repart_comm();
bool host_buffer = exec_.get_gko_force_host_buffer();

auto repartitioner = dist_matrix_->get_repartitioner();
auto host_size = repartitioner->get_orig_size();
Expand All @@ -186,9 +187,36 @@ class PersistentVector
auto comm_pattern = compute_scatter_from_owner_counts(
exec_, repartitioner->get_ranks_per_gpu(), host_size);

communicate_values(exec, ref_exec, comm, comm_pattern,
get_vector()->get_local_values(),
const_cast<T *>(memory_), host_buffer);
label owner_rank = exec_.get_owner_rank();
auto repartAllToAll =
compute_repart_allToall(exec_, comm_pattern, owner_rank);

// if (owner_rank != Pstream::myProcNo()){
// label recv_count = repartAllToAll.recv_counts[0];
// repartAllToAll.recv_counts[Pstream::myProcNo()] = recv_count;
// repartAllToAll.recv_counts[0] = 0;
// }

// NOTE instead of all_to_all_v based communication MPI_Iscatterv
// seems to be preferable
// communicate_values(exec, ref_exec, comm, comm_pattern,
// get_vector()->get_local_values(),
// const_cast<T *>(memory_), host_buffer);

label send_size = comm_pattern.send_offsets.back();
auto send_view = gko::array<scalar>::const_view(
exec, send_size, get_vector()->get_local_values());
auto tmp = gko::array<scalar>(exec, send_size);

tmp = send_view;
tmp.set_executor(ref_exec);

MPI_Request copy_back_req;
MPI_Iscatterv(tmp.get_data(), repartAllToAll.send_counts.data(),
repartAllToAll.send_offsets.data(), MPI_DOUBLE,
const_cast<T *>(memory_), repartAllToAll.recv_counts[0],
MPI_DOUBLE, 0, repart_comm->get(), &copy_back_req);
MPI_Wait(&copy_back_req, MPI_STATUS_IGNORE);
}

/** Writes the content of the distributed vector to disk
Expand Down
23 changes: 18 additions & 5 deletions include/OGL/StoppingCriterion.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ class StoppingCriterion {

const label frequency_;

const word frequencyMode_;

const scalar relaxationFactor_;

const bool adapt_minIter_;
Expand All @@ -172,6 +174,8 @@ class StoppingCriterion {
norm_eval_limit_(
controlDict.lookupOrDefault("normEvalLimit", label(100))),
frequency_(controlDict.lookupOrDefault("evalFrequency", label(1))),
frequencyMode_(controlDict.lookupOrDefault(
"evalFrequencyMode", word("relative"))), // optimizer, fixed
relaxationFactor_(
controlDict.lookupOrDefault("relaxationFactor", scalar(0.6))),
adapt_minIter_(
Expand All @@ -197,21 +201,30 @@ class StoppingCriterion {
bool export_res, label prev_solve_iters,
scalar prev_rel_cost) const
{
word frequencyMode = "optimizer";
Comment thread
greole marked this conversation as resolved.
label minIter = minIter_;
label frequency = frequency_;
// in case of export_res all residuals need to be computed
if (!export_res) {
if (prev_solve_iters > 0 && adapt_minIter_ && prev_rel_cost > 0) {
minIter = prev_solve_iters * relaxationFactor_;
auto alpha =
sqrt(1.0 / (prev_solve_iters * (1.0 - relaxationFactor_)) *
prev_rel_cost);
frequency = min(norm_eval_limit_, max(1, label(1 / alpha)));
if (frequencyMode == "optimizer") {
auto alpha = sqrt(
1.0 / (prev_solve_iters * (1.0 - relaxationFactor_)) *
prev_rel_cost);
frequency = min(norm_eval_limit_, max(1, label(1 / alpha)));
}
if (frequencyMode == "relative") {
frequency = label(prev_solve_iters * 0.075) + 1;
}
}
}

word msg = "Creating stopping criterion with minIter " +
std::to_string(minIter) + " frequency " +
std::to_string(frequency);
std::to_string(frequency) + " prev_solve_iters " +
std::to_string(prev_solve_iters) + " adapt_minIter_ " +
std::to_string(adapt_minIter_) + " prev_rel_cost ";

MLOG_0(verbose, msg)

Expand Down
5 changes: 3 additions & 2 deletions include/OGL/lduLduBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ class lduLduBase : public OGL_Info,
solverPerformance &solverPerf) const
{
bool fused = solver_controls_.lookupOrDefault<Switch>("fuse", true);
exec_handler_.init_device_comm();

auto repartitioner = std::make_shared<Repartitioner>(
host_matrix_wrapper_->get_local_nrows(), ranks_per_gpu_, verbose_,
Expand Down Expand Up @@ -334,8 +335,8 @@ class lduLduBase : public OGL_Info,
std::to_string(time_per_dof) + std::string(" [ns]") +
std::string("\n\tTime per iteration and DOF: ") +
std::to_string(time_per_iter_and_dof) + std::string(" [ns]") +
std::string("\n\tRetrieve results bandwidth ") +
std::to_string(bandwidth_copy_back) + std::string(" [GByte/s]");
std::string("\n\tRetrieve results bandwidth "); // +
std::to_string(bandwidth_copy_back) + std::string(" [GByte/s]");
MLOG_0(verbose_, msg)

return solverPerf;
Expand Down
Loading
Loading