From fdd8523a1f34bad717d77d0d72c81846fc3ddfd6 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Wed, 11 Mar 2026 00:04:48 +0000 Subject: [PATCH 1/3] clean up for sycl --- include/exchcxx/exchcxx_config.hpp.in | 1 + include/exchcxx/util/exchcxx_macros.hpp | 188 +-- src/sycl/builtin_sycl.cxx | 1523 ++++++++++++++--------- src/sycl/libxc_device.cxx | 236 ++-- src/sycl/xc_functional_device.cxx | 658 +++++----- test/xc_kernel_test.cxx | 4 +- 6 files changed, 1424 insertions(+), 1186 deletions(-) diff --git a/include/exchcxx/exchcxx_config.hpp.in b/include/exchcxx/exchcxx_config.hpp.in index 37e4383..0d6d6eb 100644 --- a/include/exchcxx/exchcxx_config.hpp.in +++ b/include/exchcxx/exchcxx_config.hpp.in @@ -62,4 +62,5 @@ #ifdef EXCHCXX_ENABLE_SYCL #include + namespace syclex = sycl::ext::oneapi; #endif diff --git a/include/exchcxx/util/exchcxx_macros.hpp b/include/exchcxx/util/exchcxx_macros.hpp index 7fbf0fa..8b1c60a 100644 --- a/include/exchcxx/util/exchcxx_macros.hpp +++ b/include/exchcxx/util/exchcxx_macros.hpp @@ -76,10 +76,8 @@ #ifdef EXCHCXX_ENABLE_SYCL - #define DEVICE_PARAMS sycl::queue* queue - #define DEVICE_PARAMS_NOTYPE queue - - #define SYCL_KERNEL_PARAMS sycl::id<1> tid + #define DEVICE_PARAMS sycl::queue* stream + #define DEVICE_PARAMS_NOTYPE stream #endif @@ -101,19 +99,6 @@ #endif -#ifdef EXCHCXX_ENABLE_SYCL - - #define RET_GENERATOR_SYCL_KERNEL( APPROX, TYPE, func ) \ - func( DEV_ ## APPROX ## _IPARAMS , DEV_ ## APPROX ## _OPARAMS_ ## TYPE, \ - SYCL_KERNEL_PARAMS ) - - #define RET_INC_GENERATOR_SYCL_KERNEL( APPROX, TYPE, func ) \ - func( double scal_fact, DEV_ ## APPROX ## _IPARAMS , DEV_ ## APPROX ## _OPARAMS_ ## TYPE, \ - SYCL_KERNEL_PARAMS ) - -#endif - - // LDA Generators #define RET_LDA_EXC_GENERATOR(func) RET_GENERATOR( LDA, EXC, func ) @@ -203,65 +188,6 @@ #endif -#ifdef EXCHCXX_ENABLE_SYCL - - #define RET_LDA_EXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( LDA, EXC, func ) - #define RET_LDA_VXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( LDA, VXC, func ) - #define RET_LDA_FXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( LDA, FXC, func ) - #define RET_LDA_KXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( LDA, KXC, func ) - #define RET_LDA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( LDA, EXC_VXC, func ) - #define RET_LDA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( LDA, VXC_FXC, func ) - - - #define LDA_EXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_EXC_GENERATOR_SYCL_KERNEL(func) - #define LDA_VXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_VXC_GENERATOR_SYCL_KERNEL(func) - #define LDA_FXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_FXC_GENERATOR_SYCL_KERNEL(func) - #define LDA_KXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_KXC_GENERATOR_SYCL_KERNEL(func) - #define LDA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) - #define LDA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) - - - #define RET_LDA_EXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( LDA, EXC, func ) - #define RET_LDA_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( LDA, VXC, func ) - #define RET_LDA_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( LDA, FXC, func ) - #define RET_LDA_KXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( LDA, KXC, func ) - #define RET_LDA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( LDA, EXC_VXC, func ) - #define RET_LDA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( LDA, VXC_FXC, func ) - - - #define LDA_EXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_EXC_INC_GENERATOR_SYCL_KERNEL(func) - #define LDA_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_VXC_INC_GENERATOR_SYCL_KERNEL(func) - #define LDA_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_FXC_INC_GENERATOR_SYCL_KERNEL(func) - #define LDA_KXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_KXC_INC_GENERATOR_SYCL_KERNEL(func) - #define LDA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) - #define LDA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_LDA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) - -#endif - // GGA Generators @@ -353,65 +279,6 @@ #endif -#ifdef EXCHCXX_ENABLE_SYCL - - #define RET_GGA_EXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( GGA, EXC, func ) - #define RET_GGA_VXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( GGA, VXC, func ) - #define RET_GGA_FXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( GGA, FXC, func ) - #define RET_GGA_KXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( GGA, KXC, func ) - #define RET_GGA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( GGA, EXC_VXC, func ) - #define RET_GGA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( GGA, VXC_FXC, func ) - - - #define GGA_EXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_EXC_GENERATOR_SYCL_KERNEL(func) - #define GGA_VXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_VXC_GENERATOR_SYCL_KERNEL(func) - #define GGA_FXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_FXC_GENERATOR_SYCL_KERNEL(func) - #define GGA_KXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_KXC_GENERATOR_SYCL_KERNEL(func) - #define GGA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) - #define GGA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) - - #define RET_GGA_EXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( GGA, EXC, func ) - #define RET_GGA_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( GGA, VXC, func ) - #define RET_GGA_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( GGA, FXC, func ) - #define RET_GGA_KXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( GGA, KXC, func ) - #define RET_GGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( GGA, EXC_VXC, func ) - #define RET_GGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( GGA, VXC_FXC, func ) - - - #define GGA_EXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_EXC_INC_GENERATOR_SYCL_KERNEL(func) - #define GGA_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_VXC_INC_GENERATOR_SYCL_KERNEL(func) - #define GGA_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_FXC_INC_GENERATOR_SYCL_KERNEL(func) - #define GGA_KXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_KXC_INC_GENERATOR_SYCL_KERNEL(func) - #define GGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) - #define GGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_GGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) - -#endif - - // MGGA Generators #define RET_MGGA_EXC_GENERATOR(func) RET_GENERATOR( MGGA, EXC, func ) @@ -492,57 +359,6 @@ -#ifdef EXCHCXX_ENABLE_SYCL - - #define RET_MGGA_EXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( MGGA, EXC, func ) - #define RET_MGGA_VXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( MGGA, VXC, func ) - #define RET_MGGA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( MGGA, EXC_VXC, func ) - #define RET_MGGA_FXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( MGGA, FXC, func ) - #define RET_MGGA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) \ - RET_GENERATOR_SYCL_KERNEL( MGGA, VXC_FXC, func ) - - - #define MGGA_EXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_EXC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_VXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_VXC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_EXC_VXC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_FXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_FXC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_VXC_FXC_GENERATOR_SYCL_KERNEL(func) - - - #define RET_MGGA_EXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( MGGA, EXC, func ) - #define RET_MGGA_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( MGGA, VXC, func ) - #define RET_MGGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( MGGA, EXC_VXC, func ) - #define RET_MGGA_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( MGGA, FXC, func ) - #define RET_MGGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - RET_INC_GENERATOR_SYCL_KERNEL( MGGA, VXC_FXC, func ) - - - #define MGGA_EXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_EXC_INC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_VXC_INC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_FXC_INC_GENERATOR_SYCL_KERNEL(func) - #define MGGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) \ - void RET_MGGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(func) - -#endif - diff --git a/src/sycl/builtin_sycl.cxx b/src/sycl/builtin_sycl.cxx index 8631f1b..09f324a 100644 --- a/src/sycl/builtin_sycl.cxx +++ b/src/sycl/builtin_sycl.cxx @@ -1,30 +1,30 @@ /** - * ExchCXX + * ExchCXX * * Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). + * any required approvals from the U.S. Dept. of Energy). * * Portions Copyright (c) Microsoft Corporation. * * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * (1) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. - * + * * (2) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * (3) Neither the name of the University of California, Lawrence Berkeley * National Laboratory, U.S. Dept. of Energy nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. - * - * + * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -36,7 +36,7 @@ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. - * + * * You are under no obligation whatsoever to provide any bug fixes, patches, * or upgrades to the features, functionality or performance of the source * code ("Enhancements") to anyone; however, if you choose to make your @@ -52,6 +52,7 @@ #include #include #include +#include namespace ExchCXX { namespace detail { @@ -75,177 +76,273 @@ template class device_eval_vxc_fxc_inc_helper_polar_kernel template -__attribute__((always_inline)) LDA_EXC_GENERATOR_SYCL_KERNEL( device_eval_exc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_EXC_GENERATOR( device_eval_exc_helper_unpolar_kernel ) { using traits = kernel_traits; - traits::eval_exc_unpolar( rho[tid], eps[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + traits::eval_exc_unpolar( rho[tid], eps[tid] ); + + } } template -__attribute__((always_inline)) LDA_EXC_GENERATOR_SYCL_KERNEL( device_eval_exc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_EXC_GENERATOR( device_eval_exc_helper_polar_kernel ) { using traits = kernel_traits; - auto rho_i = rho + 2*tid; - traits::eval_exc_polar( rho_i[0], rho_i[1], eps[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + auto rho_i = rho + 2*tid; + traits::eval_exc_polar( rho_i[0], rho_i[1], eps[tid] ); + + } } template -__attribute__((always_inline)) LDA_EXC_VXC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_EXC_VXC_GENERATOR( device_eval_exc_vxc_helper_unpolar_kernel ) { using traits = kernel_traits; - traits::eval_exc_vxc_unpolar( rho[tid], eps[tid], vxc[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + traits::eval_exc_vxc_unpolar( rho[tid], eps[tid], vxc[tid] ); + + } } template -__attribute__((always_inline)) LDA_EXC_VXC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_EXC_VXC_GENERATOR( device_eval_exc_vxc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto rho_i = rho + 2*tid; auto vxc_i = vxc + 2*tid; traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], eps[tid], - vxc_i[0], vxc_i[1] ); + vxc_i[0], vxc_i[1] ); + + } } template -__attribute__((always_inline)) LDA_FXC_GENERATOR_SYCL_KERNEL( device_eval_fxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_FXC_GENERATOR( device_eval_fxc_helper_unpolar_kernel ) { using traits = kernel_traits; - traits::eval_fxc_unpolar( rho[tid], fxc[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + traits::eval_fxc_unpolar( rho[tid], fxc[tid] ); + + } } template -__attribute__((always_inline)) LDA_FXC_GENERATOR_SYCL_KERNEL( device_eval_fxc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_FXC_GENERATOR( device_eval_fxc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto rho_i = rho + 2*tid; auto v2rho2_i = fxc + 3*tid; traits::eval_fxc_polar( rho_i[0], rho_i[1], v2rho2_i[0], - v2rho2_i[1], v2rho2_i[2] ); - + v2rho2_i[1], v2rho2_i[2] ); + } } template -__attribute__((always_inline)) LDA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_VXC_FXC_GENERATOR( device_eval_vxc_fxc_helper_unpolar_kernel ) { using traits = kernel_traits; - traits::eval_vxc_fxc_unpolar( rho[tid], vxc[tid], fxc[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + traits::eval_vxc_fxc_unpolar( rho[tid], vxc[tid], fxc[tid] ); + + } } template -__attribute__((always_inline)) LDA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_VXC_FXC_GENERATOR( device_eval_vxc_fxc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto rho_i = rho + 2*tid; auto vxc_i = vxc + 2*tid; auto v2rho2_i = fxc + 3*tid; traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], vxc_i[0], vxc_i[1], - v2rho2_i[0], v2rho2_i[1], v2rho2_i[2] ); + v2rho2_i[0], v2rho2_i[1], v2rho2_i[2] ); + + } } template -__attribute__((always_inline)) LDA_EXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_EXC_INC_GENERATOR( device_eval_exc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double e; - traits::eval_exc_unpolar( rho[tid], e ); - eps[tid] += scal_fact * e; + if( tid < N ) { + + traits::eval_exc_unpolar( rho[tid], e ); + eps[tid] += scal_fact * e; + + } } template -__attribute__((always_inline)) LDA_EXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_EXC_INC_GENERATOR( device_eval_exc_inc_helper_polar_kernel ) { using traits = kernel_traits; - auto rho_i = rho + 2*tid; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); - double e; - traits::eval_exc_polar( rho_i[0], rho_i[1], e ); + if( tid < N ) { + + auto rho_i = rho + 2*tid; + + double e; + traits::eval_exc_polar( rho_i[0], rho_i[1], e ); + + eps[tid] += scal_fact * e; - eps[tid] += scal_fact * e; + } } template -__attribute__((always_inline)) LDA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_EXC_VXC_INC_GENERATOR( device_eval_exc_vxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double e,v; - traits::eval_exc_vxc_unpolar( rho[tid], e, v ); - eps[tid] += scal_fact * e; - vxc[tid] += scal_fact * v; + if( tid < N ) { + + traits::eval_exc_vxc_unpolar( rho[tid], e, v ); + eps[tid] += scal_fact * e; + vxc[tid] += scal_fact * v; + + } } template -__attribute__((always_inline)) LDA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_EXC_VXC_INC_GENERATOR( device_eval_exc_vxc_inc_helper_polar_kernel ) { using traits = kernel_traits; - auto rho_i = rho + 2*tid; - auto vxc_i = vxc + 2*tid; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + auto rho_i = rho + 2*tid; + auto vxc_i = vxc + 2*tid; - double v_a, v_b, e; - traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], e, v_a, v_b); - eps[tid] += scal_fact * e; - vxc_i[0] += scal_fact * v_a; - vxc_i[1] += scal_fact * v_b; + double v_a, v_b, e; + traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], e, v_a, v_b); + eps[tid] += scal_fact * e; + vxc_i[0] += scal_fact * v_a; + vxc_i[1] += scal_fact * v_b; + + } } template -__attribute__((always_inline)) LDA_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_fxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_FXC_INC_GENERATOR( device_eval_fxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; - double f; - traits::eval_fxc_unpolar( rho[tid], f ); - fxc[tid] += scal_fact * f; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + double f; + traits::eval_fxc_unpolar( rho[tid], f ); + fxc[tid] += scal_fact * f; + } } template -__attribute__((always_inline)) LDA_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_fxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_FXC_INC_GENERATOR( device_eval_fxc_inc_helper_polar_kernel ) { using traits = kernel_traits; - auto rho_i = rho + 2*tid; - auto fxc_i = fxc + 3*tid; - double f0, f1, f2; - traits::eval_fxc_polar( rho_i[0], rho_i[1], f0, f1, f2 ); - fxc_i[0] += scal_fact * f0; - fxc_i[1] += scal_fact * f1; - fxc_i[2] += scal_fact * f2; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + auto rho_i = rho + 2*tid; + auto fxc_i = fxc + 3*tid; + double f0, f1, f2; + traits::eval_fxc_polar( rho_i[0], rho_i[1], f0, f1, f2 ); + fxc_i[0] += scal_fact * f0; + fxc_i[1] += scal_fact * f1; + fxc_i[2] += scal_fact * f2; + } } template -__attribute__((always_inline)) LDA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) LDA_VXC_FXC_INC_GENERATOR( device_eval_vxc_fxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; - double v, f; - traits::eval_vxc_fxc_unpolar( rho[tid], v, f ); - vxc[tid] += scal_fact * v; - fxc[tid] += scal_fact * f; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + double v, f; + traits::eval_vxc_fxc_unpolar( rho[tid], v, f ); + vxc[tid] += scal_fact * v; + fxc[tid] += scal_fact * f; + } } template -__attribute__((always_inline)) LDA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) LDA_VXC_FXC_INC_GENERATOR( device_eval_vxc_fxc_inc_helper_polar_kernel ) { using traits = kernel_traits; - auto rho_i = rho + 2*tid; - auto vxc_i = vxc + 2*tid; - auto fxc_i = fxc + 3*tid; - double v0, v1, f0, f1, f2; - traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], v0, v1, f0, f1, f2 ); - vxc_i[0] += scal_fact * v0; - vxc_i[1] += scal_fact * v1; - fxc_i[0] += scal_fact * f0; - fxc_i[1] += scal_fact * f1; - fxc_i[2] += scal_fact * f2; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + auto rho_i = rho + 2*tid; + auto vxc_i = vxc + 2*tid; + auto fxc_i = fxc + 3*tid; + double v0, v1, f0, f1, f2; + traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], v0, v1, f0, f1, f2 ); + vxc_i[0] += scal_fact * v0; + vxc_i[1] += scal_fact * v1; + fxc_i[0] += scal_fact * f0; + fxc_i[1] += scal_fact * f1; + fxc_i[2] += scal_fact * f2; + } } @@ -254,61 +351,101 @@ __attribute__((always_inline)) LDA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL( device_eva template -__attribute__((always_inline)) GGA_EXC_GENERATOR_SYCL_KERNEL( device_eval_exc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_EXC_GENERATOR( device_eval_exc_helper_unpolar_kernel ) { using traits = kernel_traits; - traits::eval_exc_unpolar( rho[tid], sigma[tid], eps[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + traits::eval_exc_unpolar( rho[tid], sigma[tid], eps[tid] ); + + } } template -__attribute__((always_inline)) GGA_EXC_GENERATOR_SYCL_KERNEL( device_eval_exc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_EXC_GENERATOR( device_eval_exc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; traits::eval_exc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], eps[tid] ); + sigma_i[1], sigma_i[2], eps[tid] ); + + } } template -__attribute__((always_inline)) GGA_EXC_VXC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_EXC_VXC_GENERATOR( device_eval_exc_vxc_helper_unpolar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; traits::eval_exc_vxc_unpolar( rho[tid], sigma[tid], eps[tid], - vrho[tid], vsigma[tid] ); + vrho[tid], vsigma[tid] ); + + } } template -__attribute__((always_inline)) GGA_EXC_VXC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_EXC_VXC_GENERATOR( device_eval_exc_vxc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; auto* vrho_i = vrho + 2*tid; auto* vsigma_i = vsigma + 3*tid; traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], eps[tid], vrho_i[0], vrho_i[1], - vsigma_i[0], vsigma_i[1], vsigma_i[2] ); + sigma_i[1], sigma_i[2], eps[tid], vrho_i[0], vrho_i[1], + vsigma_i[0], vsigma_i[1], vsigma_i[2] ); + + } } template -__attribute__((always_inline)) GGA_FXC_GENERATOR_SYCL_KERNEL( device_eval_fxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_FXC_GENERATOR( device_eval_fxc_helper_unpolar_kernel ) { using traits = kernel_traits; - traits::eval_fxc_unpolar( rho[tid], sigma[tid], v2rho2[tid], v2rhosigma[tid], v2sigma2[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + traits::eval_fxc_unpolar( rho[tid], sigma[tid], v2rho2[tid], v2rhosigma[tid], v2sigma2[tid] ); + + } } template -__attribute__((always_inline)) GGA_FXC_GENERATOR_SYCL_KERNEL( device_eval_fxc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_FXC_GENERATOR( device_eval_fxc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; auto* v2rho2_i = v2rho2 + 3*tid; @@ -317,27 +454,40 @@ __attribute__((always_inline)) GGA_FXC_GENERATOR_SYCL_KERNEL( device_eval_fxc_he traits::eval_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], - v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], - v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], - v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], - v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5] ); + v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], + v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], + v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], + v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], + v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5] ); + } } template -__attribute__((always_inline)) GGA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_VXC_FXC_GENERATOR( device_eval_vxc_fxc_helper_unpolar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; traits::eval_vxc_fxc_unpolar( rho[tid], sigma[tid], vrho[tid], vsigma[tid], - v2rho2[tid], v2rhosigma[tid], v2sigma2[tid] ); + v2rho2[tid], v2rhosigma[tid], v2sigma2[tid] ); + + } } template -__attribute__((always_inline)) GGA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_VXC_FXC_GENERATOR( device_eval_vxc_fxc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; auto* vrho_i = vrho + 2*tid; @@ -347,57 +497,84 @@ __attribute__((always_inline)) GGA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_vx auto* v2sigma2_i = v2sigma2 + 6*tid; traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - vrho_i[0], vrho_i[1], vsigma_i[0], vsigma_i[1], vsigma_i[2], - v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], - v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], - v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], - v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], - v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5] ); + vrho_i[0], vrho_i[1], vsigma_i[0], vsigma_i[1], vsigma_i[2], + v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], + v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], + v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], + v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], + v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5] ); + + } } template -__attribute__((always_inline)) GGA_EXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_EXC_INC_GENERATOR( device_eval_exc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double e; - traits::eval_exc_unpolar( rho[tid], sigma[tid], e ); - eps[tid] += scal_fact * e; + if( tid < N ) { + + traits::eval_exc_unpolar( rho[tid], sigma[tid], e ); + eps[tid] += scal_fact * e; + + } } template -__attribute__((always_inline)) GGA_EXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_EXC_INC_GENERATOR( device_eval_exc_inc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; double e; traits::eval_exc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], e ); + sigma_i[1], sigma_i[2], e ); eps[tid] += scal_fact * e; + + } + } template -__attribute__((always_inline)) GGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_EXC_VXC_INC_GENERATOR( device_eval_exc_vxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double e, vr, vs; - traits::eval_exc_vxc_unpolar( rho[tid], sigma[tid], e, vr, vs ); - eps[tid] += scal_fact * e; - vrho[tid] += scal_fact * vr; - vsigma[tid] += scal_fact * vs; + if( tid < N ) { + + traits::eval_exc_vxc_unpolar( rho[tid], sigma[tid], e, vr, vs ); + eps[tid] += scal_fact * e; + vrho[tid] += scal_fact * vr; + vsigma[tid] += scal_fact * vs; + + } } template -__attribute__((always_inline)) GGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_EXC_VXC_INC_GENERATOR( device_eval_exc_vxc_inc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; auto* vrho_i = vrho + 2*tid; @@ -405,7 +582,7 @@ __attribute__((always_inline)) GGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eva double e, vra, vrb, vsaa,vsab,vsbb; traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], e, vra, vrb, vsaa, vsab, vsbb ); + sigma_i[1], sigma_i[2], e, vra, vrb, vsaa, vsab, vsbb ); eps[tid] += scal_fact * e; vrho_i[0] += scal_fact * vra; @@ -414,71 +591,88 @@ __attribute__((always_inline)) GGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eva vsigma_i[1] += scal_fact * vsab; vsigma_i[2] += scal_fact * vsbb; + } + } template -__attribute__((always_inline)) GGA_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_fxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_FXC_INC_GENERATOR( device_eval_fxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; - double f2, f3, f4; - traits::eval_fxc_unpolar( rho[tid], sigma[tid], f2, f3, f4 ); - v2rho2[tid] += scal_fact * f2; - v2rhosigma[tid] += scal_fact * f3; - v2sigma2[tid] += scal_fact * f4; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + double f2, f3, f4; + traits::eval_fxc_unpolar( rho[tid], sigma[tid], f2, f3, f4 ); + v2rho2[tid] += scal_fact * f2; + v2rhosigma[tid] += scal_fact * f3; + v2sigma2[tid] += scal_fact * f4; + } } template -__attribute__((always_inline)) GGA_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_fxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_FXC_INC_GENERATOR( device_eval_fxc_inc_helper_polar_kernel ) { using traits = kernel_traits; - - auto* rho_i = rho + 2*tid; - auto* sigma_i = sigma + 3*tid; - auto* v2rho2_i = v2rho2 + 3*tid; - auto* v2rhosigma_i = v2rhosigma + 6*tid; - auto* v2sigma2_i = v2sigma2 + 6*tid; - double f2[3], f3[6], f4[6]; - traits::eval_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - f2[0], f2[1], f2[2], - f3[0], f3[1], f3[2], f3[3], f3[4], f3[5], - f4[0], f4[1], f4[2], f4[3], f4[4], f4[5] ); - for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f2[i]; - for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f3[i]; - for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f4[i]; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + auto* rho_i = rho + 2*tid; + auto* sigma_i = sigma + 3*tid; + auto* v2rho2_i = v2rho2 + 3*tid; + auto* v2rhosigma_i = v2rhosigma + 6*tid; + auto* v2sigma2_i = v2sigma2 + 6*tid; + double f2[3], f3[6], f4[6]; + traits::eval_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], + f2[0], f2[1], f2[2], + f3[0], f3[1], f3[2], f3[3], f3[4], f3[5], + f4[0], f4[1], f4[2], f4[3], f4[4], f4[5] ); + for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f2[i]; + for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f3[i]; + for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f4[i]; + } } template -__attribute__((always_inline)) GGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) GGA_VXC_FXC_INC_GENERATOR( device_eval_vxc_fxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; - double v, s, f2, f3, f4; - traits::eval_vxc_fxc_unpolar( rho[tid], sigma[tid], v, s, f2, f3, f4 ); - vrho[tid] += scal_fact * v; - vsigma[tid] += scal_fact * s; - v2rho2[tid] += scal_fact * f2; - v2rhosigma[tid] += scal_fact * f3; - v2sigma2[tid] += scal_fact * f4; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + double v, s, f2, f3, f4; + traits::eval_vxc_fxc_unpolar( rho[tid], sigma[tid], v, s, f2, f3, f4 ); + vrho[tid] += scal_fact * v; + vsigma[tid] += scal_fact * s; + v2rho2[tid] += scal_fact * f2; + v2rhosigma[tid] += scal_fact * f3; + v2sigma2[tid] += scal_fact * f4; + } } template -__attribute__((always_inline)) GGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) GGA_VXC_FXC_INC_GENERATOR( device_eval_vxc_fxc_inc_helper_polar_kernel ) { using traits = kernel_traits; - auto* rho_i = rho + 2*tid; - auto* sigma_i = sigma + 3*tid; - auto* vrho_i = vrho + 2*tid; - auto* vsigma_i = vsigma + 3*tid; - auto* v2rho2_i = v2rho2 + 3*tid; - auto* v2rhosigma_i = v2rhosigma + 6*tid; - auto* v2sigma2_i = v2sigma2 + 6*tid; - double v[2], s[3], f2[3], f3[6], f4[6]; - traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - v[0], v[1], s[0], s[1], s[2], - f2[0], f2[1], f2[2], - f3[0], f3[1], f3[2], f3[3], f3[4], f3[5], - f4[0], f4[1], f4[2], f4[3], f4[4], f4[5] ); - for(int i=0;i<2;++i) vrho_i[i] += scal_fact * v[i]; - for(int i=0;i<3;++i) vsigma_i[i] += scal_fact * s[i]; - for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f2[i]; - for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f3[i]; - for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f4[i]; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + auto* rho_i = rho + 2*tid; + auto* sigma_i = sigma + 3*tid; + auto* vrho_i = vrho + 2*tid; + auto* vsigma_i = vsigma + 3*tid; + auto* v2rho2_i = v2rho2 + 3*tid; + auto* v2rhosigma_i = v2rhosigma + 6*tid; + auto* v2sigma2_i = v2sigma2 + 6*tid; + double v[2], s[3], f2[3], f3[6], f4[6]; + traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], + v[0], v[1], s[0], s[1], s[2], + f2[0], f2[1], f2[2], + f3[0], f3[1], f3[2], f3[3], f3[4], f3[5], + f4[0], f4[1], f4[2], f4[3], f4[4], f4[5] ); + for(int i=0;i<2;++i) vrho_i[i] += scal_fact * v[i]; + for(int i=0;i<3;++i) vsigma_i[i] += scal_fact * s[i]; + for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f2[i]; + for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f3[i]; + for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f4[i]; + } } @@ -495,19 +689,31 @@ __attribute__((always_inline)) GGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL( device_eva template -__attribute__((always_inline)) MGGA_EXC_GENERATOR_SYCL_KERNEL( device_eval_exc_helper_unpolar_kernel ) { +__attribute__((always_inline)) MGGA_EXC_GENERATOR( device_eval_exc_helper_unpolar_kernel ) { using traits = kernel_traits; - const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; - traits::eval_exc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], eps[tid] ); + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; + traits::eval_exc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], eps[tid] ); + + } } template -__attribute__((always_inline)) MGGA_EXC_GENERATOR_SYCL_KERNEL( device_eval_exc_helper_polar_kernel ) { +__attribute__((always_inline)) MGGA_EXC_GENERATOR( device_eval_exc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; auto* lapl_i = traits::needs_laplacian ? (lapl + 2*tid) : nullptr; @@ -517,119 +723,152 @@ __attribute__((always_inline)) MGGA_EXC_GENERATOR_SYCL_KERNEL( device_eval_exc_h const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; traits::eval_exc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], - tau_i[1], eps[tid] ); + sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], + tau_i[1], eps[tid] ); + + } } template -__attribute__((always_inline)) MGGA_EXC_VXC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) MGGA_EXC_VXC_GENERATOR( device_eval_exc_vxc_helper_unpolar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; double dummy; auto& vlapl_return = traits::needs_laplacian ? vlapl[tid] : dummy; traits::eval_exc_vxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], - eps[tid], vrho[tid], vsigma[tid], vlapl_return, vtau[tid] ); + eps[tid], vrho[tid], vsigma[tid], vlapl_return, vtau[tid] ); + + } } template -__attribute__((always_inline)) MGGA_EXC_VXC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_helper_polar_kernel ) { +__attribute__((always_inline)) MGGA_EXC_VXC_GENERATOR( device_eval_exc_vxc_helper_polar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double dummy_vlapl[2]; - auto* rho_i = rho + 2*tid; - auto* sigma_i = sigma + 3*tid; - auto* lapl_i = traits::needs_laplacian ? (lapl + 2*tid) : lapl; - auto* tau_i = tau + 2*tid; + if( tid < N ) { + + auto* rho_i = rho + 2*tid; + auto* sigma_i = sigma + 3*tid; + auto* lapl_i = traits::needs_laplacian ? (lapl + 2*tid) : lapl; + auto* tau_i = tau + 2*tid; + + auto* vrho_i = vrho + 2*tid; + auto* vsigma_i = vsigma + 3*tid; + auto* vlapl_i = traits::needs_laplacian ? vlapl + 2*tid : dummy_vlapl; + auto* vtau_i = vtau + 2*tid; + const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; + const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; - auto* vrho_i = vrho + 2*tid; - auto* vsigma_i = vsigma + 3*tid; - auto* vlapl_i = traits::needs_laplacian ? vlapl + 2*tid : dummy_vlapl; - auto* vtau_i = vtau + 2*tid; - const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; - const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; + traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], sigma_i[0], + sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], + tau_i[1], eps[tid], vrho_i[0], vrho_i[1], vsigma_i[0], vsigma_i[1], + vsigma_i[2], vlapl_i[0], vlapl_i[1], vtau_i[0], vtau_i[1] ); - traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], - tau_i[1], eps[tid], vrho_i[0], vrho_i[1], vsigma_i[0], vsigma_i[1], - vsigma_i[2], vlapl_i[0], vlapl_i[1], vtau_i[0], vtau_i[1] ); + } } template -__attribute__((always_inline)) MGGA_FXC_GENERATOR_SYCL_KERNEL( device_eval_fxc_helper_unpolar_kernel ) { +__attribute__((always_inline)) MGGA_FXC_GENERATOR( device_eval_fxc_helper_unpolar_kernel ) { using traits = kernel_traits; - const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; - double local_v2rholapl, local_v2sigmalapl, local_v2lapl2, local_v2lapltau; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + + const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; + double local_v2rholapl, local_v2sigmalapl, local_v2lapl2, local_v2lapltau; - auto& v2rholapl_return = traits::needs_laplacian ? v2rholapl[tid] : local_v2rholapl; - auto& v2sigmalapl_return = traits::needs_laplacian ? v2sigmalapl[tid] : local_v2sigmalapl; - auto& v2lapl2_return = traits::needs_laplacian ? v2lapl2[tid] : local_v2lapl2; - auto& v2lapltau_return = traits::needs_laplacian ? v2lapltau[tid] : local_v2lapltau; + auto& v2rholapl_return = traits::needs_laplacian ? v2rholapl[tid] : local_v2rholapl; + auto& v2sigmalapl_return = traits::needs_laplacian ? v2sigmalapl[tid] : local_v2sigmalapl; + auto& v2lapl2_return = traits::needs_laplacian ? v2lapl2[tid] : local_v2lapl2; + auto& v2lapltau_return = traits::needs_laplacian ? v2lapltau[tid] : local_v2lapltau; - traits::eval_fxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], - v2rho2[tid], v2rhosigma[tid], v2rholapl_return, v2rhotau[tid], - v2sigma2[tid], v2sigmalapl_return, v2sigmatau[tid], - v2lapl2_return, v2lapltau_return, v2tau2[tid] ); + traits::eval_fxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], + v2rho2[tid], v2rhosigma[tid], v2rholapl_return, v2rhotau[tid], + v2sigma2[tid], v2sigmalapl_return, v2sigmatau[tid], + v2lapl2_return, v2lapltau_return, v2tau2[tid] ); + } } template -__attribute__((always_inline)) MGGA_FXC_GENERATOR_SYCL_KERNEL( device_eval_fxc_helper_polar_kernel ) { +__attribute__((always_inline)) MGGA_FXC_GENERATOR( device_eval_fxc_helper_polar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double dummy_v2rholapl[4]; double dummy_v2sigmalapl[6]; double dummy_v2lapl2[3]; double dummy_v2lapltau[4]; - auto* rho_i = rho + 2 * tid; - auto* sigma_i = sigma + 3 * tid; - auto* tau_i = tau + 2 * tid; - auto* v2rho2_i = v2rho2 + 3 * tid; - auto* v2rhosigma_i = v2rhosigma + 6 * tid; - auto* v2rhotau_i = v2rhotau + 4 * tid; - auto* v2sigma2_i = v2sigma2 + 6 * tid; - auto* v2sigmatau_i = v2sigmatau + 6 * tid; - auto* v2tau2_i = v2tau2 + 3 * tid; - - auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; - auto* v2rholapl_i = traits::needs_laplacian ? (v2rholapl + 4 * tid) : dummy_v2rholapl; - auto* v2sigmalapl_i = traits::needs_laplacian ? (v2sigmalapl + 6 * tid) : dummy_v2sigmalapl; - auto* v2lapl2_i = traits::needs_laplacian ? (v2lapl2 + 3 * tid) : dummy_v2lapl2; - auto* v2lapltau_i = traits::needs_laplacian ? (v2lapltau + 4 * tid) : dummy_v2lapltau; - - const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; - const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; - - traits::eval_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], - v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], - v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], - v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], - v2rholapl_i[0], v2rholapl_i[1], v2rholapl_i[2], v2rholapl_i[3], - v2rhotau_i[0], v2rhotau_i[1], v2rhotau_i[2], v2rhotau_i[3], - v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], - v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5], - v2sigmalapl_i[0], v2sigmalapl_i[1], v2sigmalapl_i[2], - v2sigmalapl_i[3], v2sigmalapl_i[4], v2sigmalapl_i[5], - v2sigmatau_i[0], v2sigmatau_i[1], v2sigmatau_i[2], - v2sigmatau_i[3], v2sigmatau_i[4], v2sigmatau_i[5], - v2lapl2_i[0], v2lapl2_i[1], v2lapl2_i[2], - v2lapltau_i[0], v2lapltau_i[1], v2lapltau_i[2], v2lapltau_i[3], - v2tau2_i[0], v2tau2_i[1], v2tau2_i[2] ); -} - -template -__attribute__((always_inline)) MGGA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_helper_unpolar_kernel ) { - - using traits = kernel_traits; + if( tid < N ) { + + auto* rho_i = rho + 2 * tid; + auto* sigma_i = sigma + 3 * tid; + auto* tau_i = tau + 2 * tid; + auto* v2rho2_i = v2rho2 + 3 * tid; + auto* v2rhosigma_i = v2rhosigma + 6 * tid; + auto* v2rhotau_i = v2rhotau + 4 * tid; + auto* v2sigma2_i = v2sigma2 + 6 * tid; + auto* v2sigmatau_i = v2sigmatau + 6 * tid; + auto* v2tau2_i = v2tau2 + 3 * tid; + + auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; + auto* v2rholapl_i = traits::needs_laplacian ? (v2rholapl + 4 * tid) : dummy_v2rholapl; + auto* v2sigmalapl_i = traits::needs_laplacian ? (v2sigmalapl + 6 * tid) : dummy_v2sigmalapl; + auto* v2lapl2_i = traits::needs_laplacian ? (v2lapl2 + 3 * tid) : dummy_v2lapl2; + auto* v2lapltau_i = traits::needs_laplacian ? (v2lapltau + 4 * tid) : dummy_v2lapltau; + + const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; + const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; + + traits::eval_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], + lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], + v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], + v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], + v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], + v2rholapl_i[0], v2rholapl_i[1], v2rholapl_i[2], v2rholapl_i[3], + v2rhotau_i[0], v2rhotau_i[1], v2rhotau_i[2], v2rhotau_i[3], + v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], + v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5], + v2sigmalapl_i[0], v2sigmalapl_i[1], v2sigmalapl_i[2], + v2sigmalapl_i[3], v2sigmalapl_i[4], v2sigmalapl_i[5], + v2sigmatau_i[0], v2sigmatau_i[1], v2sigmatau_i[2], + v2sigmatau_i[3], v2sigmatau_i[4], v2sigmatau_i[5], + v2lapl2_i[0], v2lapl2_i[1], v2lapl2_i[2], + v2lapltau_i[0], v2lapltau_i[1], v2lapltau_i[2], v2lapltau_i[3], + v2tau2_i[0], v2tau2_i[1], v2tau2_i[2] ); + + + } +} + +template +__attribute__((always_inline)) MGGA_VXC_FXC_GENERATOR( device_eval_vxc_fxc_helper_unpolar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; double dummy_v2rholapl, dummy_v2sigmalapl, dummy_v2lapl2, dummy_v2lapltau, dummy_vlapl; auto& vlapl_return = traits::needs_laplacian ? vlapl[tid] : dummy_vlapl; @@ -639,84 +878,103 @@ __attribute__((always_inline)) MGGA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_v auto& v2lapltau_return = traits::needs_laplacian ? v2lapltau[tid] : dummy_v2lapltau; traits::eval_vxc_fxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], - vrho[tid], vsigma[tid], vlapl_return, vtau[tid], - v2rho2[tid], v2rhosigma[tid], v2rholapl_return, - v2rhotau[tid], v2sigma2[tid], v2sigmalapl_return, - v2sigmatau[tid], v2lapl2_return, v2lapltau_return, - v2tau2[tid] ); + vrho[tid], vsigma[tid], vlapl_return, vtau[tid], + v2rho2[tid], v2rhosigma[tid], v2rholapl_return, + v2rhotau[tid], v2sigma2[tid], v2sigmalapl_return, + v2sigmatau[tid], v2lapl2_return, v2lapltau_return, + v2tau2[tid] ); + + } } template -__attribute__((always_inline)) MGGA_VXC_FXC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_helper_polar_kernel ) { +__attribute__((always_inline)) MGGA_VXC_FXC_GENERATOR( device_eval_vxc_fxc_helper_polar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double dummy_vlapl[2]; double dummy_v2rholapl[4]; double dummy_v2sigmalapl[6]; double dummy_v2lapl2[3]; double dummy_v2lapltau[4]; - auto* rho_i = rho + 2 * tid; - auto* sigma_i = sigma + 3 * tid; - auto* tau_i = tau + 2 * tid; - auto* vrho_i = vrho + 2 * tid; - auto* vsigma_i = vsigma + 3 * tid; - auto* vtau_i = vtau + 2 * tid; - - auto* v2rho2_i = v2rho2 + 3 * tid; - auto* v2rhosigma_i = v2rhosigma + 6 * tid; - auto* v2rhotau_i = v2rhotau + 4 * tid; - auto* v2sigma2_i = v2sigma2 + 6 * tid; - auto* v2sigmatau_i = v2sigmatau + 6 * tid; - auto* v2tau2_i = v2tau2 + 3 * tid; - - auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; - auto* vlapl_i = traits::needs_laplacian ? (vlapl + 2 * tid) : dummy_vlapl; - auto* v2rholapl_i = traits::needs_laplacian ? (v2rholapl + 4 * tid) : dummy_v2rholapl; - auto* v2sigmalapl_i = traits::needs_laplacian ? (v2sigmalapl + 6 * tid) : dummy_v2sigmalapl; - auto* v2lapl2_i = traits::needs_laplacian ? (v2lapl2 + 3 * tid) : dummy_v2lapl2; - auto* v2lapltau_i = traits::needs_laplacian ? (v2lapltau + 4 * tid) : dummy_v2lapltau; - const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; - const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; - - traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], - vrho_i[0], vrho_i[1], vsigma_i[0], vsigma_i[1], vsigma_i[2], - vlapl_i[0], vlapl_i[1], vtau_i[0], vtau_i[1], - v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], - v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], - v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], - v2rholapl_i[0], v2rholapl_i[1], v2rholapl_i[2], v2rholapl_i[3], - v2rhotau_i[0], v2rhotau_i[1], v2rhotau_i[2], v2rhotau_i[3], - v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], - v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5], - v2sigmalapl_i[0], v2sigmalapl_i[1], v2sigmalapl_i[2], - v2sigmalapl_i[3], v2sigmalapl_i[4], v2sigmalapl_i[5], - v2sigmatau_i[0], v2sigmatau_i[1], v2sigmatau_i[2], - v2sigmatau_i[3], v2sigmatau_i[4], v2sigmatau_i[5], - v2lapl2_i[0], v2lapl2_i[1], v2lapl2_i[2], - v2lapltau_i[0], v2lapltau_i[1], v2lapltau_i[2], v2lapltau_i[3], - v2tau2_i[0], v2tau2_i[1], v2tau2_i[2] ); -} - -template -__attribute__((always_inline)) MGGA_EXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_inc_helper_unpolar_kernel ) { + if( tid < N ) { + + auto* rho_i = rho + 2 * tid; + auto* sigma_i = sigma + 3 * tid; + auto* tau_i = tau + 2 * tid; + auto* vrho_i = vrho + 2 * tid; + auto* vsigma_i = vsigma + 3 * tid; + auto* vtau_i = vtau + 2 * tid; + + auto* v2rho2_i = v2rho2 + 3 * tid; + auto* v2rhosigma_i = v2rhosigma + 6 * tid; + auto* v2rhotau_i = v2rhotau + 4 * tid; + auto* v2sigma2_i = v2sigma2 + 6 * tid; + auto* v2sigmatau_i = v2sigmatau + 6 * tid; + auto* v2tau2_i = v2tau2 + 3 * tid; + + auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; + auto* vlapl_i = traits::needs_laplacian ? (vlapl + 2 * tid) : dummy_vlapl; + auto* v2rholapl_i = traits::needs_laplacian ? (v2rholapl + 4 * tid) : dummy_v2rholapl; + auto* v2sigmalapl_i = traits::needs_laplacian ? (v2sigmalapl + 6 * tid) : dummy_v2sigmalapl; + auto* v2lapl2_i = traits::needs_laplacian ? (v2lapl2 + 3 * tid) : dummy_v2lapl2; + auto* v2lapltau_i = traits::needs_laplacian ? (v2lapltau + 4 * tid) : dummy_v2lapltau; + const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; + const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; + + traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], + lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], + vrho_i[0], vrho_i[1], vsigma_i[0], vsigma_i[1], vsigma_i[2], + vlapl_i[0], vlapl_i[1], vtau_i[0], vtau_i[1], + v2rho2_i[0], v2rho2_i[1], v2rho2_i[2], + v2rhosigma_i[0], v2rhosigma_i[1], v2rhosigma_i[2], + v2rhosigma_i[3], v2rhosigma_i[4], v2rhosigma_i[5], + v2rholapl_i[0], v2rholapl_i[1], v2rholapl_i[2], v2rholapl_i[3], + v2rhotau_i[0], v2rhotau_i[1], v2rhotau_i[2], v2rhotau_i[3], + v2sigma2_i[0], v2sigma2_i[1], v2sigma2_i[2], + v2sigma2_i[3], v2sigma2_i[4], v2sigma2_i[5], + v2sigmalapl_i[0], v2sigmalapl_i[1], v2sigmalapl_i[2], + v2sigmalapl_i[3], v2sigmalapl_i[4], v2sigmalapl_i[5], + v2sigmatau_i[0], v2sigmatau_i[1], v2sigmatau_i[2], + v2sigmatau_i[3], v2sigmatau_i[4], v2sigmatau_i[5], + v2lapl2_i[0], v2lapl2_i[1], v2lapl2_i[2], + v2lapltau_i[0], v2lapltau_i[1], v2lapltau_i[2], v2lapltau_i[3], + v2tau2_i[0], v2tau2_i[1], v2tau2_i[2] ); + + } +} + +template +__attribute__((always_inline)) MGGA_EXC_INC_GENERATOR( device_eval_exc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double e; + if( tid < N ) { + + const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; + traits::eval_exc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], e ); + eps[tid] += scal_fact * e; + - const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; - traits::eval_exc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], e ); - eps[tid] += scal_fact * e; + } } template -__attribute__((always_inline)) MGGA_EXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) MGGA_EXC_INC_GENERATOR( device_eval_exc_inc_helper_polar_kernel ) { + + using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { - using traits = kernel_traits; auto* rho_i = rho + 2*tid; auto* sigma_i = sigma + 3*tid; auto* lapl_i = traits::needs_laplacian ? (lapl + 2*tid) : lapl; @@ -727,135 +985,156 @@ __attribute__((always_inline)) MGGA_EXC_INC_GENERATOR_SYCL_KERNEL( device_eval_e double e; traits::eval_exc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], - tau_i[1], e ); + sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], + tau_i[1], e ); eps[tid] += scal_fact * e; + + } + } template -__attribute__((always_inline)) MGGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) MGGA_EXC_VXC_INC_GENERATOR( device_eval_exc_vxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double e, vr, vs, vl, vt; + if( tid < N ) { + + const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; - const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; + traits::eval_exc_vxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], + e, vr, vs, vl, vt ); + eps[tid] += scal_fact * e; + vrho[tid] += scal_fact * vr; + vsigma[tid] += scal_fact * vs; + vtau[tid] += scal_fact * vt; + if(traits::needs_laplacian) vlapl[tid] += scal_fact * vl; - traits::eval_exc_vxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], - e, vr, vs, vl, vt ); - eps[tid] += scal_fact * e; - vrho[tid] += scal_fact * vr; - vsigma[tid] += scal_fact * vs; - vtau[tid] += scal_fact * vt; - if(traits::needs_laplacian) vlapl[tid] += scal_fact * vl; + } } template -__attribute__((always_inline)) MGGA_EXC_VXC_INC_GENERATOR_SYCL_KERNEL( device_eval_exc_vxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) MGGA_EXC_VXC_INC_GENERATOR( device_eval_exc_vxc_inc_helper_polar_kernel ) { using traits = kernel_traits; + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); double dummy_vlapl[2]; + if( tid < N ) { + + auto* rho_i = rho + 2*tid; + auto* sigma_i = sigma + 3*tid; + auto* lapl_i = traits::needs_laplacian ? (lapl + 2*tid) : lapl; + auto* tau_i = tau + 2*tid; + + auto* vrho_i = vrho + 2*tid; + auto* vsigma_i = vsigma + 3*tid; + auto* vlapl_i = traits::needs_laplacian ? vlapl + 2*tid : dummy_vlapl; + auto* vtau_i = vtau + 2*tid; + + const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; + const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; + + + double e, vra, vrb, vsaa,vsab,vsbb, vla, vlb, vta, vtb; + traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], sigma_i[0], + sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], + tau_i[1], e, vra, vrb, vsaa, vsab, vsbb, vla, vlb, vta, vtb ); - auto* rho_i = rho + 2*tid; - auto* sigma_i = sigma + 3*tid; - auto* lapl_i = traits::needs_laplacian ? (lapl + 2*tid) : lapl; - auto* tau_i = tau + 2*tid; - - auto* vrho_i = vrho + 2*tid; - auto* vsigma_i = vsigma + 3*tid; - auto* vlapl_i = traits::needs_laplacian ? vlapl + 2*tid : dummy_vlapl; - auto* vtau_i = vtau + 2*tid; - - const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; - const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; - - - double e, vra, vrb, vsaa,vsab,vsbb, vla, vlb, vta, vtb; - traits::eval_exc_vxc_polar( rho_i[0], rho_i[1], sigma_i[0], - sigma_i[1], sigma_i[2], lapl_a_use, lapl_b_use, tau_i[0], - tau_i[1], e, vra, vrb, vsaa, vsab, vsbb, vla, vlb, vta, vtb ); - - eps[tid] += scal_fact * e; - vrho_i[0] += scal_fact * vra; - vrho_i[1] += scal_fact * vrb; - vsigma_i[0] += scal_fact * vsaa; - vsigma_i[1] += scal_fact * vsab; - vsigma_i[2] += scal_fact * vsbb; - vtau_i[0] += scal_fact * vta; - vtau_i[1] += scal_fact * vtb; - if(traits::needs_laplacian) { + eps[tid] += scal_fact * e; + vrho_i[0] += scal_fact * vra; + vrho_i[1] += scal_fact * vrb; + vsigma_i[0] += scal_fact * vsaa; + vsigma_i[1] += scal_fact * vsab; + vsigma_i[2] += scal_fact * vsbb; + vtau_i[0] += scal_fact * vta; + vtau_i[1] += scal_fact * vtb; + if(traits::needs_laplacian) { vlapl_i[0] += scal_fact * vla; vlapl_i[1] += scal_fact * vlb; + } + } } template -__attribute__((always_inline)) MGGA_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_fxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) MGGA_FXC_INC_GENERATOR( device_eval_fxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; - const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; - double f_rho2, f_rhosigma, f_rholapl, f_rhotau, f_sigma2, f_sigmalapl, f_sigmatau, f_lapl2, f_lapltau, f_tau2; - traits::eval_fxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], - f_rho2, f_rhosigma, f_rholapl, f_rhotau, - f_sigma2, f_sigmalapl, f_sigmatau, - f_lapl2, f_lapltau, f_tau2 ); - v2rho2[tid] += scal_fact * f_rho2; - v2rhosigma[tid] += scal_fact * f_rhosigma; - v2rhotau[tid] += scal_fact * f_rhotau; - v2sigma2[tid] += scal_fact * f_sigma2; - v2sigmatau[tid] += scal_fact * f_sigmatau; - v2tau2[tid] += scal_fact * f_tau2; - if(traits::needs_laplacian) { + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; + double f_rho2, f_rhosigma, f_rholapl, f_rhotau, f_sigma2, f_sigmalapl, f_sigmatau, f_lapl2, f_lapltau, f_tau2; + traits::eval_fxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], + f_rho2, f_rhosigma, f_rholapl, f_rhotau, + f_sigma2, f_sigmalapl, f_sigmatau, + f_lapl2, f_lapltau, f_tau2 ); + v2rho2[tid] += scal_fact * f_rho2; + v2rhosigma[tid] += scal_fact * f_rhosigma; + v2rhotau[tid] += scal_fact * f_rhotau; + v2sigma2[tid] += scal_fact * f_sigma2; + v2sigmatau[tid] += scal_fact * f_sigmatau; + v2tau2[tid] += scal_fact * f_tau2; + if(traits::needs_laplacian) { v2rholapl[tid] += scal_fact * f_rholapl; v2sigmalapl[tid] += scal_fact * f_sigmalapl; v2lapl2[tid] += scal_fact * f_lapl2; v2lapltau[tid] += scal_fact * f_lapltau; + } } } template -__attribute__((always_inline)) MGGA_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_fxc_inc_helper_polar_kernel ) { +__attribute__((always_inline)) MGGA_FXC_INC_GENERATOR( device_eval_fxc_inc_helper_polar_kernel ) { using traits = kernel_traits; - auto* rho_i = rho + 2 * tid; - auto* sigma_i = sigma + 3 * tid; - auto* tau_i = tau + 2 * tid; - auto* v2rho2_i = v2rho2 + 3 * tid; - auto* v2rhosigma_i = v2rhosigma + 6 * tid; - auto* v2rhotau_i = v2rhotau + 4 * tid; - auto* v2sigma2_i = v2sigma2 + 6 * tid; - auto* v2sigmatau_i = v2sigmatau + 6 * tid; - auto* v2tau2_i = v2tau2 + 3 * tid; - - auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; - const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; - const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; - - double f_rho2[3], f_rhosigma[6], f_rholapl[4], f_rhotau[4], f_sigma2[6], f_sigmalapl[6], f_sigmatau[6], f_lapl2[3], f_lapltau[4], f_tau2[3]; - - traits::eval_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], - f_rho2[0], f_rho2[1], f_rho2[2], - f_rhosigma[0], f_rhosigma[1], f_rhosigma[2], f_rhosigma[3], f_rhosigma[4], f_rhosigma[5], - f_rholapl[0], f_rholapl[1], f_rholapl[2], f_rholapl[3], - f_rhotau[0], f_rhotau[1], f_rhotau[2], f_rhotau[3], - f_sigma2[0], f_sigma2[1], f_sigma2[2], f_sigma2[3], f_sigma2[4], f_sigma2[5], - f_sigmalapl[0], f_sigmalapl[1], f_sigmalapl[2], f_sigmalapl[3], f_sigmalapl[4], f_sigmalapl[5], - f_sigmatau[0], f_sigmatau[1], f_sigmatau[2], f_sigmatau[3], f_sigmatau[4], f_sigmatau[5], - f_lapl2[0], f_lapl2[1], f_lapl2[2], - f_lapltau[0], f_lapltau[1], f_lapltau[2], f_lapltau[3], - f_tau2[0], f_tau2[1], f_tau2[2] ); - - for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f_rho2[i]; - for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f_rhosigma[i]; - for(int i=0;i<4;++i) v2rhotau_i[i] += scal_fact * f_rhotau[i]; - for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f_sigma2[i]; - for(int i=0;i<6;++i) v2sigmatau_i[i] += scal_fact * f_sigmatau[i]; - for(int i=0;i<3;++i) v2tau2_i[i] += scal_fact * f_tau2[i]; - - if(traits::needs_laplacian) { + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + auto* rho_i = rho + 2 * tid; + auto* sigma_i = sigma + 3 * tid; + auto* tau_i = tau + 2 * tid; + auto* v2rho2_i = v2rho2 + 3 * tid; + auto* v2rhosigma_i = v2rhosigma + 6 * tid; + auto* v2rhotau_i = v2rhotau + 4 * tid; + auto* v2sigma2_i = v2sigma2 + 6 * tid; + auto* v2sigmatau_i = v2sigmatau + 6 * tid; + auto* v2tau2_i = v2tau2 + 3 * tid; + + auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; + const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; + const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; + + double f_rho2[3], f_rhosigma[6], f_rholapl[4], f_rhotau[4], f_sigma2[6], f_sigmalapl[6], f_sigmatau[6], f_lapl2[3], f_lapltau[4], f_tau2[3]; + + traits::eval_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], + lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], + f_rho2[0], f_rho2[1], f_rho2[2], + f_rhosigma[0], f_rhosigma[1], f_rhosigma[2], f_rhosigma[3], f_rhosigma[4], f_rhosigma[5], + f_rholapl[0], f_rholapl[1], f_rholapl[2], f_rholapl[3], + f_rhotau[0], f_rhotau[1], f_rhotau[2], f_rhotau[3], + f_sigma2[0], f_sigma2[1], f_sigma2[2], f_sigma2[3], f_sigma2[4], f_sigma2[5], + f_sigmalapl[0], f_sigmalapl[1], f_sigmalapl[2], f_sigmalapl[3], f_sigmalapl[4], f_sigmalapl[5], + f_sigmatau[0], f_sigmatau[1], f_sigmatau[2], f_sigmatau[3], f_sigmatau[4], f_sigmatau[5], + f_lapl2[0], f_lapl2[1], f_lapl2[2], + f_lapltau[0], f_lapltau[1], f_lapltau[2], f_lapltau[3], + f_tau2[0], f_tau2[1], f_tau2[2] ); + + for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f_rho2[i]; + for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f_rhosigma[i]; + for(int i=0;i<4;++i) v2rhotau_i[i] += scal_fact * f_rhotau[i]; + for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f_sigma2[i]; + for(int i=0;i<6;++i) v2sigmatau_i[i] += scal_fact * f_sigmatau[i]; + for(int i=0;i<3;++i) v2tau2_i[i] += scal_fact * f_tau2[i]; + + if(traits::needs_laplacian) { auto* v2rholapl_i = v2rholapl + 4 * tid; auto* v2sigmalapl_i = v2sigmalapl + 6 * tid; auto* v2lapl2_i = v2lapl2 + 3 * tid; @@ -864,97 +1143,105 @@ __attribute__((always_inline)) MGGA_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_f for(int i=0;i<6;++i) v2sigmalapl_i[i] += scal_fact * f_sigmalapl[i]; for(int i=0;i<3;++i) v2lapl2_i[i] += scal_fact * f_lapl2[i]; for(int i=0;i<4;++i) v2lapltau_i[i] += scal_fact * f_lapltau[i]; + } } - } template -__attribute__((always_inline)) MGGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL( device_eval_vxc_fxc_inc_helper_unpolar_kernel ) { +__attribute__((always_inline)) MGGA_VXC_FXC_INC_GENERATOR( device_eval_vxc_fxc_inc_helper_unpolar_kernel ) { using traits = kernel_traits; - const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; - double f_rho2, f_rhosigma, f_rholapl, f_rhotau, f_sigma2, f_sigmalapl, f_sigmatau, f_lapl2, f_lapltau, f_tau2; - double vr, vs, vl, vt; - traits::eval_vxc_fxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], - vr, vs, vl, vt, - f_rho2, f_rhosigma, f_rholapl, f_rhotau, - f_sigma2, f_sigmalapl, f_sigmatau, - f_lapl2, f_lapltau, f_tau2); - - vrho[tid] += scal_fact * vr; - vsigma[tid] += scal_fact * vs; - vtau[tid] += scal_fact * vt; - v2rho2[tid] += scal_fact * f_rho2; - v2rhosigma[tid] += scal_fact * f_rhosigma; - v2rhotau[tid] += scal_fact * f_rhotau; - v2sigma2[tid] += scal_fact * f_sigma2; - v2sigmatau[tid] += scal_fact * f_sigmatau; - v2tau2[tid] += scal_fact * f_tau2; - - if(traits::needs_laplacian) { + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + if( tid < N ) { + const double lapl_use = traits::needs_laplacian ? lapl[tid] : 0.0; + double f_rho2, f_rhosigma, f_rholapl, f_rhotau, f_sigma2, f_sigmalapl, f_sigmatau, f_lapl2, f_lapltau, f_tau2; + double vr, vs, vl, vt; + traits::eval_vxc_fxc_unpolar( rho[tid], sigma[tid], lapl_use, tau[tid], + vr, vs, vl, vt, + f_rho2, f_rhosigma, f_rholapl, f_rhotau, + f_sigma2, f_sigmalapl, f_sigmatau, + f_lapl2, f_lapltau, f_tau2); + + vrho[tid] += scal_fact * vr; + vsigma[tid] += scal_fact * vs; + vtau[tid] += scal_fact * vt; + v2rho2[tid] += scal_fact * f_rho2; + v2rhosigma[tid] += scal_fact * f_rhosigma; + v2rhotau[tid] += scal_fact * f_rhotau; + v2sigma2[tid] += scal_fact * f_sigma2; + v2sigmatau[tid] += scal_fact * f_sigmatau; + v2tau2[tid] += scal_fact * f_tau2; + + if(traits::needs_laplacian) { vlapl[tid] += scal_fact * vl; v2rholapl[tid] += scal_fact * f_rholapl; v2sigmalapl[tid] += scal_fact * f_sigmalapl; v2lapl2[tid] += scal_fact * f_lapl2; v2lapltau[tid] += scal_fact * f_lapltau; + } } } template -__attribute__((always_inline)) MGGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(device_eval_vxc_fxc_inc_helper_polar_kernel) { +__attribute__((always_inline)) MGGA_VXC_FXC_INC_GENERATOR(device_eval_vxc_fxc_inc_helper_polar_kernel) { using traits = kernel_traits; - auto* rho_i = rho + 2 * tid; - auto* sigma_i = sigma + 3 * tid; - auto* tau_i = tau + 2 * tid; - auto* vrho_i = vrho + 2 * tid; - auto* vsigma_i = vsigma + 3 * tid; - auto* vtau_i = vtau + 2 * tid; - - auto* v2rho2_i = v2rho2 + 3 * tid; - auto* v2rhosigma_i = v2rhosigma + 6 * tid; - auto* v2rhotau_i = v2rhotau + 4 * tid; - auto* v2sigma2_i = v2sigma2 + 6 * tid; - auto* v2sigmatau_i = v2sigmatau + 6 * tid; - auto* v2tau2_i = v2tau2 + 3 * tid; - - auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; - const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; - const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; - - double frho[2], fsigma[3], flapl[2], ftau[2]; - double f_rho2[3], f_rhosigma[6], f_rholapl[4], f_rhotau[4], f_sigma2[6], f_sigmalapl[6], f_sigmatau[6], f_lapl2[3], f_lapltau[4], f_tau2[3]; - - traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], - lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], - frho[0], frho[1], fsigma[0], fsigma[1], fsigma[2], - flapl[0], flapl[1], ftau[0], ftau[1], - f_rho2[0], f_rho2[1], f_rho2[2], - f_rhosigma[0], f_rhosigma[1], f_rhosigma[2], - f_rhosigma[3], f_rhosigma[4], f_rhosigma[5], - f_rholapl[0], f_rholapl[1], f_rholapl[2], f_rholapl[3], - f_rhotau[0], f_rhotau[1], f_rhotau[2], f_rhotau[3], - f_sigma2[0], f_sigma2[1], f_sigma2[2], - f_sigma2[3], f_sigma2[4], f_sigma2[5], - f_sigmalapl[0], f_sigmalapl[1], f_sigmalapl[2], - f_sigmalapl[3], f_sigmalapl[4], f_sigmalapl[5], - f_sigmatau[0], f_sigmatau[1], f_sigmatau[2], - f_sigmatau[3], f_sigmatau[4], f_sigmatau[5], - f_lapl2[0], f_lapl2[1], f_lapl2[2], - f_lapltau[0], f_lapltau[1], f_lapltau[2], f_lapltau[3], - f_tau2[0], f_tau2[1], f_tau2[2] ); - - for(int i=0;i<2;++i) vrho_i[i] += scal_fact * frho[i]; - for(int i=0;i<3;++i) vsigma_i[i] += scal_fact * fsigma[i]; - for(int i=0;i<2;++i) vtau_i[i] += scal_fact * ftau[i]; - - for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f_rho2[i]; - for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f_rhosigma[i]; - for(int i=0;i<4;++i) v2rhotau_i[i] += scal_fact * f_rhotau[i]; - for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f_sigma2[i]; - for(int i=0;i<6;++i) v2sigmatau_i[i] += scal_fact * f_sigmatau[i]; - for(int i=0;i<3;++i) v2tau2_i[i] += scal_fact * f_tau2[i]; - - if(traits::needs_laplacian) { + auto item = syclex::this_work_item::get_nd_item<1>(); + int tid = item.get_global_id(0); + + if( tid < N ) { + auto* rho_i = rho + 2 * tid; + auto* sigma_i = sigma + 3 * tid; + auto* tau_i = tau + 2 * tid; + auto* vrho_i = vrho + 2 * tid; + auto* vsigma_i = vsigma + 3 * tid; + auto* vtau_i = vtau + 2 * tid; + + auto* v2rho2_i = v2rho2 + 3 * tid; + auto* v2rhosigma_i = v2rhosigma + 6 * tid; + auto* v2rhotau_i = v2rhotau + 4 * tid; + auto* v2sigma2_i = v2sigma2 + 6 * tid; + auto* v2sigmatau_i = v2sigmatau + 6 * tid; + auto* v2tau2_i = v2tau2 + 3 * tid; + + auto* lapl_i = traits::needs_laplacian ? (lapl + 2 * tid) : lapl; + const double lapl_a_use = traits::needs_laplacian ? lapl_i[0] : 0.0; + const double lapl_b_use = traits::needs_laplacian ? lapl_i[1] : 0.0; + + double frho[2], fsigma[3], flapl[2], ftau[2]; + double f_rho2[3], f_rhosigma[6], f_rholapl[4], f_rhotau[4], f_sigma2[6], f_sigmalapl[6], f_sigmatau[6], f_lapl2[3], f_lapltau[4], f_tau2[3]; + + traits::eval_vxc_fxc_polar( rho_i[0], rho_i[1], sigma_i[0], sigma_i[1], sigma_i[2], + lapl_a_use, lapl_b_use, tau_i[0], tau_i[1], + frho[0], frho[1], fsigma[0], fsigma[1], fsigma[2], + flapl[0], flapl[1], ftau[0], ftau[1], + f_rho2[0], f_rho2[1], f_rho2[2], + f_rhosigma[0], f_rhosigma[1], f_rhosigma[2], + f_rhosigma[3], f_rhosigma[4], f_rhosigma[5], + f_rholapl[0], f_rholapl[1], f_rholapl[2], f_rholapl[3], + f_rhotau[0], f_rhotau[1], f_rhotau[2], f_rhotau[3], + f_sigma2[0], f_sigma2[1], f_sigma2[2], + f_sigma2[3], f_sigma2[4], f_sigma2[5], + f_sigmalapl[0], f_sigmalapl[1], f_sigmalapl[2], + f_sigmalapl[3], f_sigmalapl[4], f_sigmalapl[5], + f_sigmatau[0], f_sigmatau[1], f_sigmatau[2], + f_sigmatau[3], f_sigmatau[4], f_sigmatau[5], + f_lapl2[0], f_lapl2[1], f_lapl2[2], + f_lapltau[0], f_lapltau[1], f_lapltau[2], f_lapltau[3], + f_tau2[0], f_tau2[1], f_tau2[2] ); + + for(int i=0;i<2;++i) vrho_i[i] += scal_fact * frho[i]; + for(int i=0;i<3;++i) vsigma_i[i] += scal_fact * fsigma[i]; + for(int i=0;i<2;++i) vtau_i[i] += scal_fact * ftau[i]; + + for(int i=0;i<3;++i) v2rho2_i[i] += scal_fact * f_rho2[i]; + for(int i=0;i<6;++i) v2rhosigma_i[i] += scal_fact * f_rhosigma[i]; + for(int i=0;i<4;++i) v2rhotau_i[i] += scal_fact * f_rhotau[i]; + for(int i=0;i<6;++i) v2sigma2_i[i] += scal_fact * f_sigma2[i]; + for(int i=0;i<6;++i) v2sigmatau_i[i] += scal_fact * f_sigmatau[i]; + for(int i=0;i<3;++i) v2tau2_i[i] += scal_fact * f_tau2[i]; + + if(traits::needs_laplacian) { auto* vlapl_i = vlapl + 2 * tid; auto* v2rholapl_i = v2rholapl + 4 * tid; auto* v2sigmalapl_i = v2sigmalapl + 6 * tid; @@ -965,8 +1252,8 @@ __attribute__((always_inline)) MGGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(device_eva for(int i=0;i<6;++i) v2sigmalapl_i[i] += scal_fact * f_sigmalapl[i]; for(int i=0;i<3;++i) v2lapl2_i[i] += scal_fact * f_lapl2[i]; for(int i=0;i<4;++i) v2lapltau_i[i] += scal_fact * f_lapltau[i]; + } } - } @@ -979,9 +1266,11 @@ __attribute__((always_inline)) MGGA_VXC_FXC_INC_GENERATOR_SYCL_KERNEL(device_eva template LDA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_unpolar_kernel( - N, rho, eps, tid); + N, rho, eps); }); } @@ -989,9 +1278,11 @@ LDA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { template LDA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_polar_kernel( - N, rho, eps, tid); + N, rho, eps); }); } @@ -999,9 +1290,11 @@ LDA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { template LDA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_unpolar_kernel( - N, rho, eps, vxc, tid); + N, rho, eps, vxc); }); } @@ -1009,9 +1302,11 @@ LDA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { template LDA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_polar_kernel( - N, rho, eps, vxc, tid); + N, rho, eps, vxc); }); } @@ -1019,9 +1314,11 @@ LDA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { template LDA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_unpolar_kernel( - N, rho, fxc, tid); + N, rho, fxc); }); } @@ -1029,18 +1326,22 @@ LDA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { template LDA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_polar_kernel( - N, rho, fxc, tid); + N, rho, fxc); }); } template LDA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_unpolar_kernel( - N, rho, vxc, fxc, tid); + N, rho, vxc, fxc); }); } @@ -1048,9 +1349,11 @@ LDA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { template LDA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_polar_kernel( - N, rho, vxc, fxc, tid); + N, rho, vxc, fxc); }); } @@ -1058,9 +1361,11 @@ LDA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { template LDA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_unpolar_kernel( - scal_fact, N, rho, eps, tid); + scal_fact, N, rho, eps); }); } @@ -1068,9 +1373,11 @@ LDA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { template LDA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_polar_kernel( - scal_fact, N, rho, eps, tid); + scal_fact, N, rho, eps); }); } @@ -1078,9 +1385,11 @@ LDA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { template LDA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_unpolar_kernel( - scal_fact, N, rho, eps, vxc, tid); + scal_fact, N, rho, eps, vxc); }); } @@ -1088,9 +1397,11 @@ LDA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { template LDA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_polar_kernel( - scal_fact, N, rho, eps, vxc, tid); + scal_fact, N, rho, eps, vxc); }); } @@ -1098,9 +1409,11 @@ LDA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { template LDA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_unpolar_kernel( - scal_fact, N, rho, fxc, tid); + scal_fact, N, rho, fxc); }); } @@ -1108,9 +1421,11 @@ LDA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { template LDA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_polar_kernel( - scal_fact, N, rho, fxc, tid); + scal_fact, N, rho, fxc); }); } @@ -1118,9 +1433,11 @@ LDA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { template LDA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_unpolar_kernel( - scal_fact, N, rho, vxc, fxc, tid); + scal_fact, N, rho, vxc, fxc); }); } @@ -1128,9 +1445,11 @@ LDA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { template LDA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_polar_kernel( - scal_fact, N, rho, vxc, fxc, tid); + scal_fact, N, rho, vxc, fxc); }); } @@ -1141,9 +1460,11 @@ LDA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { template GGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_unpolar_kernel( - N, rho, sigma, eps, tid); + N, rho, sigma, eps); }); } @@ -1151,9 +1472,11 @@ GGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { template GGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_polar_kernel( - N, rho, sigma, eps, tid); + N, rho, sigma, eps); }); } @@ -1161,9 +1484,11 @@ GGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { template GGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_unpolar_kernel( - N, rho, sigma, eps, vrho, vsigma, tid); + N, rho, sigma, eps, vrho, vsigma); }); } @@ -1171,9 +1496,11 @@ GGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { template GGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_polar_kernel( - N, rho, sigma, eps, vrho, vsigma, tid); + N, rho, sigma, eps, vrho, vsigma); }); } @@ -1181,9 +1508,11 @@ GGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { template GGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_unpolar_kernel( - N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, tid); + N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); } @@ -1191,9 +1520,11 @@ GGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { template GGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_polar_kernel( - N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, tid); + N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); } @@ -1201,9 +1532,11 @@ GGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { template GGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_unpolar_kernel( - N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2, tid); + N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); } @@ -1211,9 +1544,11 @@ GGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { template GGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_polar_kernel( - N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2, tid); + N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); } @@ -1221,9 +1556,11 @@ GGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { template GGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_unpolar_kernel( - scal_fact, N, rho, sigma, eps, tid); + scal_fact, N, rho, sigma, eps); }); } @@ -1231,9 +1568,11 @@ GGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { template GGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_polar_kernel( - scal_fact, N, rho, sigma, eps, tid); + scal_fact, N, rho, sigma, eps); }); } @@ -1241,9 +1580,11 @@ GGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { template GGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_unpolar_kernel( - scal_fact, N, rho, sigma, eps, vrho, vsigma, tid); + scal_fact, N, rho, sigma, eps, vrho, vsigma); }); } @@ -1251,9 +1592,11 @@ GGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { template GGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_polar_kernel( - scal_fact, N, rho, sigma, eps, vrho, vsigma, tid); + scal_fact, N, rho, sigma, eps, vrho, vsigma); }); } @@ -1262,36 +1605,44 @@ GGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { template GGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_unpolar_kernel( - scal_fact, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, tid); + scal_fact, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); } template GGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_polar_kernel( - scal_fact, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, tid); + scal_fact, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); } template GGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_unpolar_kernel( - scal_fact, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2, tid); + scal_fact, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); } template GGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_polar_kernel( - scal_fact, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2, tid); + scal_fact, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); } @@ -1299,9 +1650,11 @@ GGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { template MGGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_unpolar_kernel( - N, rho, sigma, lapl, tau, eps, tid); + N, rho, sigma, lapl, tau, eps); }); } @@ -1309,9 +1662,11 @@ MGGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { template MGGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_polar_kernel( - N, rho, sigma, lapl, tau, eps, tid); + N, rho, sigma, lapl, tau, eps); }); } @@ -1319,9 +1674,11 @@ MGGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { template MGGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_unpolar_kernel( - N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau, tid); + N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); } @@ -1329,9 +1686,11 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { template MGGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_polar_kernel( - N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau, tid); + N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); } @@ -1339,10 +1698,12 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { template MGGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_unpolar_kernel( N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, - v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2, tid); + v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2); }); } @@ -1350,10 +1711,12 @@ MGGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { template MGGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_polar_kernel( N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, - v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2, tid); + v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2); }); } @@ -1361,12 +1724,14 @@ MGGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { template MGGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_unpolar_kernel( N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, - v2lapl2, v2lapltau, v2tau2, tid); + v2lapl2, v2lapltau, v2tau2); }); } @@ -1374,12 +1739,14 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { template MGGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_polar_kernel( N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, - v2lapl2, v2lapltau, v2tau2, tid); + v2lapl2, v2lapltau, v2tau2); }); } @@ -1387,9 +1754,11 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { template MGGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_unpolar_kernel( - scal_fact, N, rho, sigma, lapl, tau, eps, tid); + scal_fact, N, rho, sigma, lapl, tau, eps); }); } @@ -1397,9 +1766,11 @@ MGGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { template MGGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_polar_kernel( - scal_fact, N, rho, sigma, lapl, tau, eps, tid); + scal_fact, N, rho, sigma, lapl, tau, eps); }); } @@ -1407,9 +1778,11 @@ MGGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { template MGGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_unpolar_kernel( - scal_fact, N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau, tid); + scal_fact, N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); } @@ -1417,9 +1790,11 @@ MGGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { template MGGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_polar_kernel( - scal_fact, N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau, tid); + scal_fact, N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); } @@ -1427,12 +1802,14 @@ MGGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { template MGGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, - v2lapl2, v2lapltau, v2tau2, tid); + v2lapl2, v2lapltau, v2tau2); }); } @@ -1440,12 +1817,14 @@ MGGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { template MGGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, - v2lapl2, v2lapltau, v2tau2, tid); + v2lapl2, v2lapltau, v2tau2); }); } @@ -1453,13 +1832,15 @@ MGGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { template MGGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, - v2lapl2, v2lapltau, v2tau2, tid); + v2lapl2, v2lapltau, v2tau2); }); } @@ -1467,13 +1848,15 @@ MGGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { template MGGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { - queue->parallel_for>( sycl::range<1>(N), [=](sycl::id<1> tid) { + sycl::range<1> threads(32); + sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, - v2lapl2, v2lapltau, v2tau2, tid); + v2lapl2, v2lapltau, v2tau2); }); } diff --git a/src/sycl/libxc_device.cxx b/src/sycl/libxc_device.cxx index 758485e..d525fae 100644 --- a/src/sycl/libxc_device.cxx +++ b/src/sycl/libxc_device.cxx @@ -1,7 +1,13 @@ /** - * ExchCXX Copyright (c) 2020-2022, The Regents of the University of California, + * ExchCXX + * + * Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * Portions Copyright (c) Microsoft Corporation. + * + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,10 +58,10 @@ void recv_from_device( void* dest, const void* src, const size_t len, - sycl::queue* queue ) { + sycl::queue* stream ) { try { - queue->memcpy( dest, src, len ); + stream->memcpy( dest, src, len ); } catch( sycl::exception const &ex ) { throw( std::runtime_error( "recv failed + " + std::string(ex.what()) ) ); } @@ -64,29 +70,23 @@ void recv_from_device( void* dest, const void* src, const size_t len, void send_to_device( void* dest, const void* src, const size_t len, - sycl::queue* queue ) { + sycl::queue* stream ) { try { - queue->memcpy( dest, src, len ); + stream->memcpy( dest, src, len ); } catch( sycl::exception const &ex ) { throw( std::runtime_error( "send failed + " + std::string(ex.what()) ) ); } } -void queue_sync( sycl::queue* queue ) { +void stream_sync( sycl::queue* stream ) { - queue->wait_and_throw(); + stream->wait_and_throw(); } -template -using sycl_host_allocator = sycl::usm_allocator; - -template -using sycl_host_vector = std::vector>; - namespace ExchCXX { namespace detail { @@ -105,13 +105,13 @@ LDA_EXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_exc_device_ ) const { std::vector rho_host( sz_rho ), eps_host( sz_exc ); - recv_from_device( rho_host.data(), rho, len_rho, queue ); + recv_from_device( rho_host.data(), rho, len_rho, stream ); - queue_sync( queue ); + stream_sync( stream ); xc_lda_exc( &kernel_, N, rho_host.data(), eps_host.data() ); - send_to_device( eps, eps_host.data(), len_eps, queue ); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( eps, eps_host.data(), len_eps, stream ); + stream_sync( stream ); // Lifetime of host vectors } @@ -131,14 +131,14 @@ LDA_EXC_VXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_exc_vxc_device_ ) const { std::vector rho_host( sz_rho ), eps_host( sz_exc ), vxc_host( sz_vxc ); - recv_from_device( rho_host.data(), rho, len_rho, queue ); + recv_from_device( rho_host.data(), rho, len_rho, stream ); - queue_sync( queue ); + stream_sync( stream ); xc_lda_exc_vxc( &kernel_, N, rho_host.data(), eps_host.data(), vxc_host.data() ); - send_to_device( eps, eps_host.data(), len_eps, queue ); - send_to_device( vxc, vxc_host.data(), len_vxc, queue ); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( eps, eps_host.data(), len_eps, stream ); + send_to_device( vxc, vxc_host.data(), len_vxc, stream ); + stream_sync( stream ); // Lifetime of host vectors } @@ -155,13 +155,13 @@ LDA_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_fxc_device_ ) const { std::vector rho_host( sz_rho ), fxc_host( sz_fxc ); - recv_from_device( rho_host.data(), rho, len_rho, queue ); + recv_from_device( rho_host.data(), rho, len_rho, stream ); - queue_sync( queue ); + stream_sync( stream ); xc_lda_fxc( &kernel_, N, rho_host.data(), fxc_host.data() ); - send_to_device( fxc, fxc_host.data(), len_fxc, queue ); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( fxc, fxc_host.data(), len_fxc, stream ); + stream_sync( stream ); // Lifetime of host vectors } @@ -181,14 +181,14 @@ LDA_VXC_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_vxc_fxc_device_ ) const { std::vector rho_host( sz_rho ), vxc_host( sz_vxc ), fxc_host( sz_fxc ); - recv_from_device( rho_host.data(), rho, len_rho, queue ); + recv_from_device( rho_host.data(), rho, len_rho, stream ); - queue_sync( queue ); + stream_sync( stream ); xc_lda_vxc_fxc( &kernel_, N, rho_host.data(), vxc_host.data(), fxc_host.data() ); - send_to_device( vxc, vxc_host.data(), len_vxc, queue ); - send_to_device( fxc, fxc_host.data(), len_fxc, queue ); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( vxc, vxc_host.data(), len_vxc, stream ); + send_to_device( fxc, fxc_host.data(), len_fxc, stream ); + stream_sync( stream ); // Lifetime of host vectors } @@ -210,14 +210,14 @@ GGA_EXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_exc_device_ ) const { std::vector rho_host( sz_rho ), eps_host( sz_eps ), sigma_host( sz_sigma ); - recv_from_device( rho_host.data(), rho, len_rho , queue ); - recv_from_device( sigma_host.data(), sigma, len_sigma, queue ); + recv_from_device( rho_host.data(), rho, len_rho , stream ); + recv_from_device( sigma_host.data(), sigma, len_sigma, stream ); - queue_sync( queue ); + stream_sync( stream ); xc_gga_exc( &kernel_, N, rho_host.data(), sigma_host.data(), eps_host.data() ); - send_to_device( eps, eps_host.data(), len_eps, queue ); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( eps, eps_host.data(), len_eps, stream ); + stream_sync( stream ); // Lifetime of host vectors } @@ -242,21 +242,21 @@ GGA_EXC_VXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_exc_vxc_device_ ) const { std::vector rho_host( sz_rho ), eps_host( sz_eps ), sigma_host( sz_sigma ), vrho_host( sz_vrho ), vsigma_host( sz_vsigma ); - recv_from_device( rho_host.data(), rho, len_rho , queue ); - recv_from_device( sigma_host.data(), sigma, len_sigma, queue ); + recv_from_device( rho_host.data(), rho, len_rho , stream ); + recv_from_device( sigma_host.data(), sigma, len_sigma, stream ); - queue_sync( queue ); + stream_sync( stream ); xc_gga_exc_vxc( &kernel_, N, rho_host.data(), sigma_host.data(), eps_host.data(), vrho_host.data(), vsigma_host.data() ); - send_to_device( eps, eps_host.data(), len_eps , queue); - send_to_device( vrho, vrho_host.data(), len_vrho , queue); - send_to_device( vsigma, vsigma_host.data(), len_vsigma, queue); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( eps, eps_host.data(), len_eps , stream); + send_to_device( vrho, vrho_host.data(), len_vrho , stream); + send_to_device( vsigma, vsigma_host.data(), len_vsigma, stream); + stream_sync( stream ); // Lifetime of host vectors } - GGA_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_fxc_device_ ) const { +GGA_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_fxc_device_ ) const { throw_if_uninitialized(); EXCHCXX_BOOL_CHECK("KERNEL IS NOT GGA", is_gga() ); @@ -277,18 +277,18 @@ GGA_EXC_VXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_exc_vxc_device_ ) const { v2rho2_host(sz_v2rho2), v2rhosigma_host(sz_v2rhosigma), v2sigma2_host(sz_v2sigma2); - recv_from_device(rho_host.data(), rho, len_rho, queue); - recv_from_device(sigma_host.data(), sigma, len_sigma, queue); + recv_from_device(rho_host.data(), rho, len_rho, stream); + recv_from_device(sigma_host.data(), sigma, len_sigma, stream); - queue_sync(queue); + stream_sync(stream); xc_gga_fxc(&kernel_, N, rho_host.data(), sigma_host.data(), v2rho2_host.data(), v2rhosigma_host.data(), v2sigma2_host.data()); - send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, queue); - send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, queue); - send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, queue); + send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, stream); + send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, stream); + send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, stream); - queue_sync(queue); // Lifetime of host vectors + stream_sync(stream); // Lifetime of host vectors } GGA_VXC_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_vxc_fxc_device_ ) const { @@ -319,21 +319,21 @@ GGA_VXC_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_vxc_fxc_device_ ) const { v2rho2_host(sz_v2rho2), v2rhosigma_host(sz_v2rhosigma), v2sigma2_host(sz_v2sigma2); - recv_from_device(rho_host.data(), rho, len_rho, queue); - recv_from_device(sigma_host.data(), sigma, len_sigma, queue); + recv_from_device(rho_host.data(), rho, len_rho, stream); + recv_from_device(sigma_host.data(), sigma, len_sigma, stream); - queue_sync(queue); + stream_sync(stream); xc_gga_vxc_fxc(&kernel_, N, rho_host.data(), sigma_host.data(), vrho_host.data(), vsigma_host.data(), v2rho2_host.data(), v2rhosigma_host.data(), v2sigma2_host.data()); - send_to_device(vrho, vrho_host.data(), len_vrho, queue); - send_to_device(vsigma, vsigma_host.data(), len_vsigma, queue); - send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, queue); - send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, queue); - send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, queue); - queue_sync(queue); // Lifetime of host vectors + send_to_device(vrho, vrho_host.data(), len_vrho, stream); + send_to_device(vsigma, vsigma_host.data(), len_vsigma, stream); + send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, stream); + send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, stream); + send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, stream); + stream_sync(stream); // Lifetime of host vectors } // TODO: GGA kxc interfaces @@ -361,17 +361,17 @@ MGGA_EXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_exc_device_ ) const { sigma_host( sz_sigma ), lapl_host( sz_lapl ), tau_host( sz_tau ); - recv_from_device( rho_host.data(), rho, len_rho , queue ); - recv_from_device( sigma_host.data(), sigma, len_sigma, queue ); - recv_from_device( lapl_host.data(), lapl, len_lapl , queue ); - recv_from_device( tau_host.data(), tau, len_tau , queue ); + recv_from_device( rho_host.data(), rho, len_rho , stream ); + recv_from_device( sigma_host.data(), sigma, len_sigma, stream ); + recv_from_device( lapl_host.data(), lapl, len_lapl , stream ); + recv_from_device( tau_host.data(), tau, len_tau , stream ); - queue_sync( queue ); + stream_sync( stream ); xc_mgga_exc( &kernel_, N, rho_host.data(), sigma_host.data(), lapl_host.data(), tau_host.data(), eps_host.data() ); - send_to_device( eps, eps_host.data(), len_eps, queue ); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( eps, eps_host.data(), len_eps, stream ); + stream_sync( stream ); // Lifetime of host vectors } @@ -406,22 +406,22 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_exc_vxc_device_ ) const { std::vector vrho_host( sz_vrho ), vsigma_host( sz_vsigma ), vlapl_host( sz_vlapl ), vtau_host( sz_vtau ); - recv_from_device( rho_host.data(), rho, len_rho , queue ); - recv_from_device( sigma_host.data(), sigma, len_sigma, queue ); - recv_from_device( lapl_host.data(), lapl, len_lapl , queue ); - recv_from_device( tau_host.data(), tau, len_tau , queue ); + recv_from_device( rho_host.data(), rho, len_rho , stream ); + recv_from_device( sigma_host.data(), sigma, len_sigma, stream ); + recv_from_device( lapl_host.data(), lapl, len_lapl , stream ); + recv_from_device( tau_host.data(), tau, len_tau , stream ); - queue_sync( queue ); + stream_sync( stream ); xc_mgga_exc_vxc( &kernel_, N, rho_host.data(), sigma_host.data(), lapl_host.data(), tau_host.data(), eps_host.data(), vrho_host.data(), vsigma_host.data(), vlapl_host.data(), vtau_host.data() ); - send_to_device( eps, eps_host.data(), len_eps , queue ); - send_to_device( vrho, vrho_host.data(), len_vrho , queue ); - send_to_device( vsigma, vsigma_host.data(), len_vsigma, queue ); - send_to_device( vlapl, vlapl_host.data(), len_vlapl , queue ); - send_to_device( vtau, vtau_host.data(), len_vtau , queue ); - queue_sync( queue ); // Lifetime of host vectors + send_to_device( eps, eps_host.data(), len_eps , stream ); + send_to_device( vrho, vrho_host.data(), len_vrho , stream ); + send_to_device( vsigma, vsigma_host.data(), len_vsigma, stream ); + send_to_device( vlapl, vlapl_host.data(), len_vlapl , stream ); + send_to_device( vtau, vtau_host.data(), len_vtau , stream ); + stream_sync( stream ); // Lifetime of host vectors } @@ -471,12 +471,12 @@ MGGA_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_fxc_device_ ) const { v2sigmatau_host(sz_v2sigmatau), v2lapl2_host(sz_v2lapl2), v2lapltau_host(sz_v2lapltau), v2tau2_host(sz_v2tau2); - recv_from_device(rho_host.data(), rho, len_rho, queue); - recv_from_device(sigma_host.data(), sigma, len_sigma, queue); - recv_from_device(lapl_host.data(), lapl, len_lapl, queue); - recv_from_device(tau_host.data(), tau, len_tau, queue); + recv_from_device(rho_host.data(), rho, len_rho, stream); + recv_from_device(sigma_host.data(), sigma, len_sigma, stream); + recv_from_device(lapl_host.data(), lapl, len_lapl, stream); + recv_from_device(tau_host.data(), tau, len_tau, stream); - queue_sync(queue); + stream_sync(stream); xc_mgga_fxc(&kernel_, N, rho_host.data(), sigma_host.data(), lapl_host.data(), tau_host.data(), v2rho2_host.data(), v2rhosigma_host.data(), v2rholapl_host.data(), @@ -484,18 +484,18 @@ MGGA_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_fxc_device_ ) const { v2sigmatau_host.data(), v2lapl2_host.data(), v2lapltau_host.data(), v2tau2_host.data()); - send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, queue); - send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, queue); - send_to_device(v2rholapl, v2rholapl_host.data(), len_v2rholapl, queue); - send_to_device(v2rhotau, v2rhotau_host.data(), len_v2rhotau, queue); - send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, queue); - send_to_device(v2sigmalapl, v2sigmalapl_host.data(), len_v2sigmalapl, queue); - send_to_device(v2sigmatau, v2sigmatau_host.data(), len_v2sigmatau, queue); - send_to_device(v2lapl2, v2lapl2_host.data(), len_v2lapl2, queue); - send_to_device(v2lapltau, v2lapltau_host.data(), len_v2lapltau, queue); - send_to_device(v2tau2, v2tau2_host.data(), len_v2tau2, queue); - - queue_sync(queue); // Lifetime of host vectors + send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, stream); + send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, stream); + send_to_device(v2rholapl, v2rholapl_host.data(), len_v2rholapl, stream); + send_to_device(v2rhotau, v2rhotau_host.data(), len_v2rhotau, stream); + send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, stream); + send_to_device(v2sigmalapl, v2sigmalapl_host.data(), len_v2sigmalapl, stream); + send_to_device(v2sigmatau, v2sigmatau_host.data(), len_v2sigmatau, stream); + send_to_device(v2lapl2, v2lapl2_host.data(), len_v2lapl2, stream); + send_to_device(v2lapltau, v2lapltau_host.data(), len_v2lapltau, stream); + send_to_device(v2tau2, v2tau2_host.data(), len_v2tau2, stream); + + stream_sync(stream); // Lifetime of host vectors } @@ -558,12 +558,12 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_vxc_fxc_device_ ) const { v2sigmatau_host(sz_v2sigmatau), v2lapl2_host(sz_v2lapl2), v2lapltau_host(sz_v2lapltau), v2tau2_host(sz_v2tau2); - recv_from_device(rho_host.data(), rho, len_rho, queue); - recv_from_device(sigma_host.data(), sigma, len_sigma, queue); - recv_from_device(lapl_host.data(), lapl, len_lapl, queue); - recv_from_device(tau_host.data(), tau, len_tau, queue); + recv_from_device(rho_host.data(), rho, len_rho, stream); + recv_from_device(sigma_host.data(), sigma, len_sigma, stream); + recv_from_device(lapl_host.data(), lapl, len_lapl, stream); + recv_from_device(tau_host.data(), tau, len_tau, stream); - queue_sync(queue); + stream_sync(stream); xc_mgga_vxc_fxc(&kernel_, N, rho_host.data(), sigma_host.data(), lapl_host.data(), tau_host.data(), vrho_host.data(), vsigma_host.data(), vlapl_host.data(), vtau_host.data(), @@ -572,23 +572,23 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( LibxcKernelImpl::eval_vxc_fxc_device_ ) const { v2sigmatau_host.data(), v2lapl2_host.data(), v2lapltau_host.data(), v2tau2_host.data()); - send_to_device(vrho, vrho_host.data(), len_vrho, queue); - send_to_device(vsigma, vsigma_host.data(), len_vsigma, queue); - send_to_device(vlapl, vlapl_host.data(), len_vlapl, queue); - send_to_device(vtau, vtau_host.data(), len_vtau, queue); - - send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, queue); - send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, queue); - send_to_device(v2rholapl, v2rholapl_host.data(), len_v2rholapl, queue); - send_to_device(v2rhotau, v2rhotau_host.data(), len_v2rhotau, queue); - send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, queue); - send_to_device(v2sigmalapl, v2sigmalapl_host.data(), len_v2sigmalapl, queue); - send_to_device(v2sigmatau, v2sigmatau_host.data(), len_v2sigmatau, queue); - send_to_device(v2lapl2, v2lapl2_host.data(), len_v2lapl2, queue); - send_to_device(v2lapltau, v2lapltau_host.data(), len_v2lapltau, queue); - send_to_device(v2tau2, v2tau2_host.data(), len_v2tau2, queue); - - queue_sync(queue); // Lifetime of host vectors + send_to_device(vrho, vrho_host.data(), len_vrho, stream); + send_to_device(vsigma, vsigma_host.data(), len_vsigma, stream); + send_to_device(vlapl, vlapl_host.data(), len_vlapl, stream); + send_to_device(vtau, vtau_host.data(), len_vtau, stream); + + send_to_device(v2rho2, v2rho2_host.data(), len_v2rho2, stream); + send_to_device(v2rhosigma, v2rhosigma_host.data(), len_v2rhosigma, stream); + send_to_device(v2rholapl, v2rholapl_host.data(), len_v2rholapl, stream); + send_to_device(v2rhotau, v2rhotau_host.data(), len_v2rhotau, stream); + send_to_device(v2sigma2, v2sigma2_host.data(), len_v2sigma2, stream); + send_to_device(v2sigmalapl, v2sigmalapl_host.data(), len_v2sigmalapl, stream); + send_to_device(v2sigmatau, v2sigmatau_host.data(), len_v2sigmatau, stream); + send_to_device(v2lapl2, v2lapl2_host.data(), len_v2lapl2, stream); + send_to_device(v2lapltau, v2lapltau_host.data(), len_v2lapltau, stream); + send_to_device(v2tau2, v2tau2_host.data(), len_v2tau2, stream); + + stream_sync(stream); // Lifetime of host vectors } diff --git a/src/sycl/xc_functional_device.cxx b/src/sycl/xc_functional_device.cxx index 42cce84..35ed90a 100644 --- a/src/sycl/xc_functional_device.cxx +++ b/src/sycl/xc_functional_device.cxx @@ -1,24 +1,30 @@ /** - * ExchCXX Copyright (c) 2020-2022, The Regents of the University of California, + * ExchCXX + * + * Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. - * + * any required approvals from the U.S. Dept. of Energy). + * + * Portions Copyright (c) Microsoft Corporation. + * + * All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: - * + * * (1) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. - * + * * (2) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * + * * (3) Neither the name of the University of California, Lawrence Berkeley * National Laboratory, U.S. Dept. of Energy nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. - * - * + * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -30,7 +36,7 @@ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. - * + * * You are under no obligation whatsoever to provide any bug fixes, patches, * or upgrades to the features, functionality or performance of the source * code ("Enhancements") to anyone; however, if you choose to make your @@ -44,31 +50,40 @@ */ #include +#include #include template class scal_device_tag; template class add_scal_device_tag; -void scal_device( const int N, const double fact, const double* X_device, double* Y_device, sycl::queue* queue ) { - queue->parallel_for>( sycl::range<1>(N), - [=]( sycl::id<1> idx ) { Y_device[idx] = fact * X_device[idx]; }); +void scal_device( const int N, const double fact, const double* X_device, double* Y_device, sycl::queue* stream ) { + sycl::range<1> threads(512); + sycl::range<1> blocks(ExchCXX::util::div_ceil(N,512)); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) { + int tid = item.get_global_id(0); + if( tid < N ) Y_device[tid] = X_device[tid] * fact; + }); } -void add_scal_device( const int N, const double fact, const double* X_device, double* Y_device, sycl::queue* queue ) { - queue->parallel_for>( sycl::range<1>(N), - [=]( sycl::id<1> idx ) { Y_device[idx] += fact * X_device[idx]; }); +void add_scal_device( const int N, const double fact, const double* X_device, double* Y_device, sycl::queue* stream ) { + sycl::range<1> threads(512); + sycl::range<1> blocks(ExchCXX::util::div_ceil(N,512)); + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) { + int tid = item.get_global_id(0); + if( tid < N ) Y_device[tid] += X_device[tid] * fact; + }); } template -T* safe_sycl_malloc( size_t N, sycl::queue* queue ) { - return sycl::malloc_device( N, *queue ); +T* safe_cuda_malloc( size_t N, sycl::queue* stream ) { + return sycl::malloc_device( N, *stream ); } template -void safe_zero( size_t len, T* ptr, sycl::queue* queue ) { - queue->memset( ptr, 0, len*sizeof(T) ); +void safe_zero( size_t len, T* ptr, sycl::queue* stream ) { + stream->memset( ptr, 0, len*sizeof(T) ); } namespace ExchCXX { @@ -87,9 +102,9 @@ LDA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { double* eps_scr = nullptr; if( kernels_.size() > 1 and not supports_inc_interface() ) - eps_scr = safe_sycl_malloc( len_exc_buffer, queue ); + eps_scr = safe_cuda_malloc( len_exc_buffer, stream ); - safe_zero( len_exc_buffer, eps, queue ); + safe_zero( len_exc_buffer, eps, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { @@ -97,24 +112,25 @@ LDA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { if( supports_inc_interface() ) { kernels_[i].second.eval_exc_inc_device( - kernels_[i].first, N, rho, eps, queue + kernels_[i].first, N, rho, eps, stream ); } else { double* eps_eval = i ? eps_scr : eps; - kernels_[i].second.eval_exc_device(N, rho, eps_eval, queue); + kernels_[i].second.eval_exc_device(N, rho, eps_eval, stream); if( i ) - add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); + add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); else - scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); + scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); } } - if( eps_scr ) sycl::free( eps_scr, *queue ); + stream->wait(); + if( eps_scr ) sycl::free( eps_scr, *stream ); } @@ -129,36 +145,36 @@ LDA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { double* eps_scr(nullptr), *vxc_scr(nullptr); if( kernels_.size() > 1 and not supports_inc_interface() ) { - eps_scr = safe_sycl_malloc( len_exc_buffer, queue ); - vxc_scr = safe_sycl_malloc( len_vxc_buffer, queue ); + eps_scr = safe_cuda_malloc( len_exc_buffer, stream ); + vxc_scr = safe_cuda_malloc( len_vxc_buffer, stream ); } - safe_zero( len_exc_buffer, eps, queue ); - safe_zero( len_vxc_buffer, vxc, queue ); + safe_zero( len_exc_buffer, eps, stream ); + safe_zero( len_vxc_buffer, vxc, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { if( supports_inc_interface() ) { kernels_[i].second.eval_exc_vxc_inc_device( - kernels_[i].first, N, rho, eps, vxc, queue + kernels_[i].first, N, rho, eps, vxc, stream ); } else { double* eps_eval = i ? eps_scr : eps; double* vxc_eval = i ? vxc_scr : vxc; - kernels_[i].second.eval_exc_vxc_device(N, rho, eps_eval, vxc_eval, queue); + kernels_[i].second.eval_exc_vxc_device(N, rho, eps_eval, vxc_eval, stream); if( i ) { - add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); - add_scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, queue ); + add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); + add_scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, stream ); } else { - scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); - scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, queue ); + scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); + scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, stream ); } @@ -166,8 +182,9 @@ LDA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { } - if( eps_scr ) sycl::free( eps_scr, *queue ); - if( vxc_scr ) sycl::free( vxc_scr, *queue ); + stream->wait(); + if( eps_scr ) sycl::free( eps_scr, *stream ); + if( vxc_scr ) sycl::free( vxc_scr, *stream ); } @@ -182,27 +199,28 @@ LDA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { double* fxc_scr = nullptr; bool use_inc = supports_inc_interface(); if( kernels_.size() > 1 && !use_inc ) - fxc_scr = safe_sycl_malloc( len_fxc_buffer, queue ); + fxc_scr = safe_cuda_malloc( len_fxc_buffer, stream ); - safe_zero( len_fxc_buffer, fxc, queue ); + safe_zero( len_fxc_buffer, fxc, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { if (use_inc) { kernels_[i].second.eval_fxc_inc_device( - kernels_[i].first, N, rho, fxc, queue + kernels_[i].first, N, rho, fxc, stream ); } else { double* fxc_eval = i ? fxc_scr : fxc; - kernels_[i].second.eval_fxc_device(N, rho, fxc_eval, queue); + kernels_[i].second.eval_fxc_device(N, rho, fxc_eval, stream); if( i ) - add_scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, queue ); + add_scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, stream ); else - scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, queue ); + scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, stream ); } } - if( fxc_scr ) sycl::free( fxc_scr, *queue ); + stream->wait(); + if( fxc_scr ) sycl::free( fxc_scr, *stream ); } LDA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { @@ -216,35 +234,36 @@ LDA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { double* vxc_scr(nullptr), *fxc_scr(nullptr); bool use_inc = supports_inc_interface(); if( kernels_.size() > 1 && !use_inc ) { - vxc_scr = safe_sycl_malloc( len_vxc_buffer, queue ); - fxc_scr = safe_sycl_malloc( len_fxc_buffer, queue ); + vxc_scr = safe_cuda_malloc( len_vxc_buffer, stream ); + fxc_scr = safe_cuda_malloc( len_fxc_buffer, stream ); } - safe_zero( len_vxc_buffer, vxc, queue ); - safe_zero( len_fxc_buffer, fxc, queue ); + safe_zero( len_vxc_buffer, vxc, stream ); + safe_zero( len_fxc_buffer, fxc, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { if (use_inc) { kernels_[i].second.eval_vxc_fxc_inc_device( - kernels_[i].first, N, rho, vxc, fxc, queue + kernels_[i].first, N, rho, vxc, fxc, stream ); } else { double* vxc_eval = i ? vxc_scr : vxc; double* fxc_eval = i ? fxc_scr : fxc; - kernels_[i].second.eval_vxc_fxc_device(N, rho, vxc_eval, fxc_eval, queue); + kernels_[i].second.eval_vxc_fxc_device(N, rho, vxc_eval, fxc_eval, stream); if( i ) { - add_scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, queue ); - add_scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, queue ); + add_scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, stream ); + add_scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, stream ); } else { - scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, queue ); - scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, queue ); + scal_device( len_vxc_buffer, kernels_[i].first, vxc_eval, vxc, stream ); + scal_device( len_fxc_buffer, kernels_[i].first, fxc_eval, fxc, stream ); } } } - if( vxc_scr ) sycl::free( vxc_scr, *queue ); - if( fxc_scr ) sycl::free( fxc_scr, *queue ); + stream->wait(); + if( vxc_scr ) sycl::free( vxc_scr, *stream ); + if( fxc_scr ) sycl::free( fxc_scr, *stream ); } @@ -259,9 +278,9 @@ GGA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { double* eps_scr = nullptr; if( kernels_.size() > 1 and not supports_inc_interface() ) - eps_scr = safe_sycl_malloc( len_exc_buffer, queue ); + eps_scr = safe_cuda_malloc( len_exc_buffer, stream ); - safe_zero( len_exc_buffer, eps, queue ); + safe_zero( len_exc_buffer, eps, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { @@ -269,11 +288,11 @@ GGA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_exc_inc_device( - kernels_[i].first, N, rho, sigma, eps, queue + kernels_[i].first, N, rho, sigma, eps, stream ); else kernels_[i].second.eval_exc_inc_device( - kernels_[i].first, N, rho, eps, queue + kernels_[i].first, N, rho, eps, stream ); } else { @@ -281,19 +300,20 @@ GGA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { double* eps_eval = i ? eps_scr : eps; if( kernels_[i].second.is_gga() ) - kernels_[i].second.eval_exc_device(N, rho, sigma, eps_eval, queue); + kernels_[i].second.eval_exc_device(N, rho, sigma, eps_eval, stream); else - kernels_[i].second.eval_exc_device(N, rho, eps_eval, queue); + kernels_[i].second.eval_exc_device(N, rho, eps_eval, stream); if( i ) - add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); + add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); else - scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); + scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); } } - if( eps_scr ) sycl::free( eps_scr, *queue ); + stream->wait(); + if( eps_scr ) sycl::free( eps_scr, *stream ); } @@ -309,14 +329,14 @@ GGA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { double* eps_scr(nullptr), *vrho_scr(nullptr), *vsigma_scr(nullptr); if( kernels_.size() > 1 and not supports_inc_interface() ) { - eps_scr = safe_sycl_malloc( len_exc_buffer, queue ); - vrho_scr = safe_sycl_malloc( len_vrho_buffer, queue ); - vsigma_scr = safe_sycl_malloc( len_vsigma_buffer, queue ); + eps_scr = safe_cuda_malloc( len_exc_buffer, stream ); + vrho_scr = safe_cuda_malloc( len_vrho_buffer, stream ); + vsigma_scr = safe_cuda_malloc( len_vsigma_buffer, stream ); } - safe_zero( len_exc_buffer, eps, queue ); - safe_zero( len_vrho_buffer, vrho, queue ); - safe_zero( len_vsigma_buffer, vsigma, queue ); + safe_zero( len_exc_buffer, eps, stream ); + safe_zero( len_vrho_buffer, vrho, stream ); + safe_zero( len_vsigma_buffer, vsigma, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { @@ -325,11 +345,11 @@ GGA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_exc_vxc_inc_device( kernels_[i].first, N, rho, sigma, eps, vrho, - vsigma, queue + vsigma, stream ); else kernels_[i].second.eval_exc_vxc_inc_device( - kernels_[i].first, N, rho, eps, vrho, queue + kernels_[i].first, N, rho, eps, vrho, stream ); } else { @@ -340,32 +360,34 @@ GGA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_exc_vxc_device(N, rho, sigma, eps_eval, vrho_eval, - vsigma_eval, queue ); + vsigma_eval, stream ); else - kernels_[i].second.eval_exc_vxc_device(N, rho, eps_eval, vrho_eval, queue); + kernels_[i].second.eval_exc_vxc_device(N, rho, eps_eval, vrho_eval, stream); if( i ) { - add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); - add_scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue); + add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); + add_scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream); if( kernels_[i].second.is_gga() ) - add_scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue ); + add_scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream ); } else { - scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); - scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue ); + scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); + scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream ); if( kernels_[i].second.is_gga() ) - scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue ); + scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream ); } } } - if( eps_scr ) sycl::free( eps_scr, *queue ); - if( vrho_scr ) sycl::free( vrho_scr, *queue ); - if( vsigma_scr ) sycl::free( vsigma_scr, *queue ); + stream->wait(); + if( eps_scr ) sycl::free( eps_scr, *stream ); + if( vrho_scr ) sycl::free( vrho_scr, *stream ); + if( vsigma_scr ) sycl::free( vsigma_scr, *stream ); + } GGA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { @@ -380,24 +402,24 @@ GGA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { double* v2rho2_scr(nullptr), *v2rhosigma_scr(nullptr), *v2sigma2_scr(nullptr); bool use_inc = supports_inc_interface(); if( kernels_.size() > 1 && !use_inc ) { - v2rho2_scr = safe_sycl_malloc( len_v2rho2_buffer, queue ); - v2rhosigma_scr = safe_sycl_malloc( len_v2rhosigma_buffer, queue ); - v2sigma2_scr = safe_sycl_malloc( len_v2sigma2_buffer, queue ); + v2rho2_scr = safe_cuda_malloc( len_v2rho2_buffer, stream ); + v2rhosigma_scr = safe_cuda_malloc( len_v2rhosigma_buffer, stream ); + v2sigma2_scr = safe_cuda_malloc( len_v2sigma2_buffer, stream ); } - safe_zero( len_v2rho2_buffer, v2rho2, queue ); - safe_zero( len_v2rhosigma_buffer, v2rhosigma, queue ); - safe_zero( len_v2sigma2_buffer, v2sigma2, queue ); + safe_zero( len_v2rho2_buffer, v2rho2, stream ); + safe_zero( len_v2rhosigma_buffer, v2rhosigma, stream ); + safe_zero( len_v2sigma2_buffer, v2sigma2, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { if (use_inc) { if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_fxc_inc_device( - kernels_[i].first, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, queue + kernels_[i].first, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, stream ); else kernels_[i].second.eval_fxc_inc_device( - kernels_[i].first, N, rho, v2rho2, queue + kernels_[i].first, N, rho, v2rho2, stream ); } else { double* v2rho2_eval = i ? v2rho2_scr : v2rho2; @@ -406,30 +428,31 @@ GGA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_fxc_device(N, rho, sigma, v2rho2_eval, - v2rhosigma_eval, v2sigma2_eval, queue ); + v2rhosigma_eval, v2sigma2_eval, stream ); else - kernels_[i].second.eval_fxc_device(N, rho, v2rho2_eval, queue); + kernels_[i].second.eval_fxc_device(N, rho, v2rho2_eval, stream); if( i ) { - add_scal_device( len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue ); + add_scal_device( len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream ); if( kernels_[i].second.is_gga() ){ - add_scal_device( len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue ); - add_scal_device( len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue ); + add_scal_device( len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream ); + add_scal_device( len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream ); } } else { - scal_device( len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue ); + scal_device( len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream ); if( kernels_[i].second.is_gga() ){ - scal_device( len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue ); - scal_device( len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue ); + scal_device( len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream ); + scal_device( len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream ); } } } } - if( v2rho2_scr ) sycl::free( v2rho2_scr, *queue); - if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *queue); - if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *queue); + stream->wait(); + if( v2rho2_scr ) sycl::free( v2rho2_scr, *stream ); + if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *stream ); + if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *stream ); } GGA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { @@ -447,28 +470,28 @@ GGA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { double* v2rho2_scr(nullptr), *v2rhosigma_scr(nullptr), *v2sigma2_scr(nullptr); bool use_inc = supports_inc_interface(); if( kernels_.size() > 1 && !use_inc ) { - vrho_scr = safe_sycl_malloc( len_vrho_buffer, queue ); - vsigma_scr = safe_sycl_malloc( len_vsigma_buffer, queue ); - v2rho2_scr = safe_sycl_malloc( len_v2rho2_buffer, queue ); - v2rhosigma_scr = safe_sycl_malloc( len_v2rhosigma_buffer, queue ); - v2sigma2_scr = safe_sycl_malloc( len_v2sigma2_buffer, queue ); + vrho_scr = safe_cuda_malloc( len_vrho_buffer, stream ); + vsigma_scr = safe_cuda_malloc( len_vsigma_buffer, stream ); + v2rho2_scr = safe_cuda_malloc( len_v2rho2_buffer, stream ); + v2rhosigma_scr = safe_cuda_malloc( len_v2rhosigma_buffer, stream ); + v2sigma2_scr = safe_cuda_malloc( len_v2sigma2_buffer, stream ); } - safe_zero( len_vrho_buffer, vrho, queue ); - safe_zero( len_vsigma_buffer, vsigma, queue ); - safe_zero( len_v2rho2_buffer, v2rho2, queue ); - safe_zero( len_v2rhosigma_buffer, v2rhosigma, queue ); - safe_zero( len_v2sigma2_buffer, v2sigma2, queue ); + safe_zero( len_vrho_buffer, vrho, stream ); + safe_zero( len_vsigma_buffer, vsigma, stream ); + safe_zero( len_v2rho2_buffer, v2rho2, stream ); + safe_zero( len_v2rhosigma_buffer, v2rhosigma, stream ); + safe_zero( len_v2sigma2_buffer, v2sigma2, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { if (use_inc) { if (kernels_[i].second.is_gga()) { kernels_[i].second.eval_vxc_fxc_inc_device( - kernels_[i].first, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2, queue + kernels_[i].first, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2, stream ); } else { kernels_[i].second.eval_vxc_fxc_inc_device( - kernels_[i].first, N, rho, vrho, v2rho2, queue + kernels_[i].first, N, rho, vrho, v2rho2, stream ); } } else { @@ -480,38 +503,39 @@ GGA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { if (kernels_[i].second.is_gga()) { kernels_[i].second.eval_vxc_fxc_device( - N, rho, sigma, vrho_eval, vsigma_eval, v2rho2_eval, v2rhosigma_eval, v2sigma2_eval, queue); + N, rho, sigma, vrho_eval, vsigma_eval, v2rho2_eval, v2rhosigma_eval, v2sigma2_eval, stream); } else { - kernels_[i].second.eval_vxc_fxc_device(N, rho, vrho_eval, v2rho2_eval, queue); + kernels_[i].second.eval_vxc_fxc_device(N, rho, vrho_eval, v2rho2_eval, stream); } if (i) { - add_scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue); - add_scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue); + add_scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream); + add_scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream); if (kernels_[i].second.is_gga()) { - add_scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue); - add_scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue); - add_scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue); + add_scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream); + add_scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream); + add_scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream); } } else { - scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue); - scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue); + scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream); + scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream); if (kernels_[i].second.is_gga()) { - scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue); - scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue); - scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue); + scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream); + scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream); + scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream); } } } } - if( vrho_scr ) sycl::free( vrho_scr, *queue); - if( vsigma_scr ) sycl::free( vsigma_scr, *queue); - if( v2rho2_scr ) sycl::free( v2rho2_scr, *queue); - if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *queue); - if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *queue); + stream->wait(); + if( vrho_scr ) sycl::free( vrho_scr, *stream ); + if( vsigma_scr ) sycl::free( vsigma_scr, *stream ); + if( v2rho2_scr ) sycl::free( v2rho2_scr, *stream ); + if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *stream ); + if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *stream ); } // mGGA Interfaces @@ -525,9 +549,9 @@ MGGA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { double* eps_scr = nullptr; if( kernels_.size() > 1 and not supports_inc_interface() ) - eps_scr = safe_sycl_malloc( len_exc_buffer, queue ); + eps_scr = safe_cuda_malloc( len_exc_buffer, stream ); - safe_zero( len_exc_buffer, eps, queue ); + safe_zero( len_exc_buffer, eps, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { @@ -535,15 +559,15 @@ MGGA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { if( kernels_[i].second.is_mgga() ) kernels_[i].second.eval_exc_inc_device( - kernels_[i].first, N, rho, sigma, lapl, tau, eps, queue + kernels_[i].first, N, rho, sigma, lapl, tau, eps, stream ); else if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_exc_inc_device( - kernels_[i].first, N, rho, sigma, eps, queue + kernels_[i].first, N, rho, sigma, eps, stream ); else kernels_[i].second.eval_exc_inc_device( - kernels_[i].first, N, rho, eps, queue + kernels_[i].first, N, rho, eps, stream ); } else { @@ -551,21 +575,22 @@ MGGA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { double* eps_eval = i ? eps_scr : eps; if( kernels_[i].second.is_mgga() ) - kernels_[i].second.eval_exc_device(N, rho, sigma, lapl, tau, eps_eval, queue); + kernels_[i].second.eval_exc_device(N, rho, sigma, lapl, tau, eps_eval, stream); else if( kernels_[i].second.is_gga() ) - kernels_[i].second.eval_exc_device(N, rho, sigma, eps_eval, queue); + kernels_[i].second.eval_exc_device(N, rho, sigma, eps_eval, stream); else - kernels_[i].second.eval_exc_device(N, rho, eps_eval, queue); + kernels_[i].second.eval_exc_device(N, rho, eps_eval, stream); if( i ) - add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); + add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); else - scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); + scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); } } - if( eps_scr ) sycl::free( eps_scr, *queue ); + stream->wait(); + if( eps_scr ) sycl::free( eps_scr, *stream ); } @@ -573,7 +598,7 @@ MGGA_EXC_GENERATOR_DEVICE( XCFunctional::eval_exc_device ) const { MGGA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { throw_if_not_sane(); - EXCHCXX_BOOL_CHECK("KERNEL IS NOT GGA", is_gga() ); + EXCHCXX_BOOL_CHECK("KERNEL IS NOT MGGA", is_mgga() ); size_t len_exc_buffer = exc_buffer_len(N); size_t len_vrho_buffer = vrho_buffer_len(N); @@ -584,19 +609,21 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { double* eps_scr(nullptr), *vrho_scr(nullptr), *vsigma_scr(nullptr), *vlapl_scr(nullptr), *vtau_scr(nullptr); if( kernels_.size() > 1 and not supports_inc_interface() ) { - eps_scr = safe_sycl_malloc( len_exc_buffer, queue ); - vrho_scr = safe_sycl_malloc( len_vrho_buffer, queue ); - vsigma_scr = safe_sycl_malloc( len_vsigma_buffer, queue ); - vlapl_scr = safe_sycl_malloc( len_vlapl_buffer, queue ); - vtau_scr = safe_sycl_malloc( len_vtau_buffer, queue ); + eps_scr = safe_cuda_malloc( len_exc_buffer, stream ); + vrho_scr = safe_cuda_malloc( len_vrho_buffer, stream ); + vsigma_scr = safe_cuda_malloc( len_vsigma_buffer, stream ); + vtau_scr = safe_cuda_malloc( len_vtau_buffer, stream ); + if(needs_laplacian()) + vlapl_scr = safe_cuda_malloc( len_vlapl_buffer, stream ); } - safe_zero( len_exc_buffer, eps, queue ); - safe_zero( len_vrho_buffer, vrho, queue ); - safe_zero( len_vsigma_buffer, vsigma, queue ); - safe_zero( len_vlapl_buffer, vlapl, queue ); - safe_zero( len_vtau_buffer, vtau, queue ); + safe_zero( len_exc_buffer, eps, stream ); + safe_zero( len_vrho_buffer, vrho, stream ); + safe_zero( len_vsigma_buffer, vsigma, stream ); + safe_zero( len_vtau_buffer, vtau, stream ); + if(needs_laplacian()) + safe_zero( len_vlapl_buffer, vlapl, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { @@ -605,16 +632,16 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { if( kernels_[i].second.is_mgga() ) kernels_[i].second.eval_exc_vxc_inc_device( kernels_[i].first, N, rho, sigma, lapl, tau, eps, - vrho, vsigma, vlapl, vtau, queue + vrho, vsigma, vlapl, vtau, stream ); else if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_exc_vxc_inc_device( kernels_[i].first, N, rho, sigma, eps, vrho, - vsigma, queue + vsigma, stream ); else kernels_[i].second.eval_exc_vxc_inc_device( - kernels_[i].first, N, rho, eps, vrho, queue + kernels_[i].first, N, rho, eps, vrho, stream ); } else { @@ -627,48 +654,57 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( XCFunctional::eval_exc_vxc_device ) const { if( kernels_[i].second.is_mgga() ) kernels_[i].second.eval_exc_vxc_device(N, rho, sigma, lapl, tau, eps_eval, - vrho_eval, vsigma_eval, vlapl_eval, vtau_eval, queue ); + vrho_eval, vsigma_eval, vlapl_eval, vtau_eval, stream ); else if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_exc_vxc_device(N, rho, sigma, eps_eval, vrho_eval, - vsigma_eval, queue ); + vsigma_eval, stream ); else - kernels_[i].second.eval_exc_vxc_device(N, rho, eps_eval, vrho_eval, queue); + kernels_[i].second.eval_exc_vxc_device(N, rho, eps_eval, vrho_eval, stream); if( i ) { - add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); - add_scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue ); + add_scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); + add_scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream ); - if( kernels_[i].second.is_gga() ) - add_scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue ); + if( kernels_[i].second.is_gga() or kernels_[i].second.is_mgga() ) { + add_scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream ); + } if( kernels_[i].second.is_mgga() ) { - add_scal_device( len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, queue ); - add_scal_device( len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, queue ); + add_scal_device( len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, stream ); + } + + if( kernels_[i].second.needs_laplacian() ) { + add_scal_device( len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, stream ); } } else { - scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, queue ); - scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue ); + scal_device( len_exc_buffer, kernels_[i].first, eps_eval, eps, stream ); + scal_device( len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream ); - if( kernels_[i].second.is_gga() or kernels_[i].second.is_mgga() ) - scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue ); + if( kernels_[i].second.is_gga() or kernels_[i].second.is_mgga() ) { + scal_device( len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream ); + } if( kernels_[i].second.is_mgga() ) { - scal_device( len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, queue ); - scal_device( len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, queue ); + scal_device( len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, stream ); + } + + if( kernels_[i].second.needs_laplacian() ) { + scal_device( len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, stream ); } } } } - if( eps_scr ) sycl::free( eps_scr, *queue ); - if( vrho_scr ) sycl::free( vrho_scr, *queue ); - if( vsigma_scr ) sycl::free( vsigma_scr, *queue ); - if( vlapl_scr ) sycl::free( vlapl_scr, *queue ); - if( vtau_scr ) sycl::free( vtau_scr, *queue ); + stream->wait(); + if( eps_scr ) sycl::free( eps_scr, *stream ); + if( vrho_scr ) sycl::free( vrho_scr, *stream ); + if( vsigma_scr ) sycl::free( vsigma_scr, *stream ); + if( vlapl_scr ) sycl::free( vlapl_scr, *stream ); + if( vtau_scr ) sycl::free( vtau_scr, *stream ); } MGGA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { @@ -693,28 +729,28 @@ MGGA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { bool use_inc = supports_inc_interface(); if( kernels_.size() > 1 && !use_inc ) { - v2rho2_scr = safe_sycl_malloc( len_v2rho2_buffer, queue ); - v2rhosigma_scr = safe_sycl_malloc( len_v2rhosigma_buffer, queue ); - v2rholapl_scr = safe_sycl_malloc( len_v2rholapl_buffer, queue ); - v2rhotau_scr = safe_sycl_malloc( len_v2rhotau_buffer, queue ); - v2sigma2_scr = safe_sycl_malloc( len_v2sigma2_buffer, queue ); - v2sigmalapl_scr = safe_sycl_malloc( len_v2sigmalapl_buffer, queue ); - v2sigmatau_scr = safe_sycl_malloc( len_v2sigmatau_buffer, queue ); - v2lapl2_scr = safe_sycl_malloc( len_v2lapl2_buffer, queue ); - v2lapltau_scr = safe_sycl_malloc( len_v2lapltau_buffer, queue ); - v2tau2_scr = safe_sycl_malloc( len_v2tau2_buffer, queue ); + v2rho2_scr = safe_cuda_malloc( len_v2rho2_buffer, stream ); + v2rhosigma_scr = safe_cuda_malloc( len_v2rhosigma_buffer, stream ); + v2rholapl_scr = safe_cuda_malloc( len_v2rholapl_buffer, stream ); + v2rhotau_scr = safe_cuda_malloc( len_v2rhotau_buffer, stream ); + v2sigma2_scr = safe_cuda_malloc( len_v2sigma2_buffer, stream ); + v2sigmalapl_scr = safe_cuda_malloc( len_v2sigmalapl_buffer, stream ); + v2sigmatau_scr = safe_cuda_malloc( len_v2sigmatau_buffer, stream ); + v2lapl2_scr = safe_cuda_malloc( len_v2lapl2_buffer, stream ); + v2lapltau_scr = safe_cuda_malloc( len_v2lapltau_buffer, stream ); + v2tau2_scr = safe_cuda_malloc( len_v2tau2_buffer, stream ); } - safe_zero( len_v2rho2_buffer, v2rho2, queue ); - safe_zero( len_v2rhosigma_buffer, v2rhosigma, queue ); - safe_zero( len_v2rholapl_buffer, v2rholapl, queue ); - safe_zero( len_v2rhotau_buffer, v2rhotau, queue ); - safe_zero( len_v2sigma2_buffer, v2sigma2, queue ); - safe_zero( len_v2sigmalapl_buffer, v2sigmalapl, queue ); - safe_zero( len_v2sigmatau_buffer, v2sigmatau, queue ); - safe_zero( len_v2lapl2_buffer, v2lapl2, queue ); - safe_zero( len_v2lapltau_buffer, v2lapltau, queue ); - safe_zero( len_v2tau2_buffer, v2tau2, queue ); + safe_zero( len_v2rho2_buffer, v2rho2, stream ); + safe_zero( len_v2rhosigma_buffer, v2rhosigma, stream ); + safe_zero( len_v2rholapl_buffer, v2rholapl, stream ); + safe_zero( len_v2rhotau_buffer, v2rhotau, stream ); + safe_zero( len_v2sigma2_buffer, v2sigma2, stream ); + safe_zero( len_v2sigmalapl_buffer, v2sigmalapl, stream ); + safe_zero( len_v2sigmatau_buffer, v2sigmatau, stream ); + safe_zero( len_v2lapl2_buffer, v2lapl2, stream ); + safe_zero( len_v2lapltau_buffer, v2lapltau, stream ); + safe_zero( len_v2tau2_buffer, v2tau2, stream ); for( auto i = 0ul; i < kernels_.size(); ++i ) { @@ -722,15 +758,15 @@ MGGA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { if( kernels_[i].second.is_mgga() ) kernels_[i].second.eval_fxc_inc_device( kernels_[i].first, N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, - v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2, queue + v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2, stream ); else if( kernels_[i].second.is_gga() ) kernels_[i].second.eval_fxc_inc_device( - kernels_[i].first, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, queue + kernels_[i].first, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2, stream ); else kernels_[i].second.eval_fxc_inc_device( - kernels_[i].first, N, rho, v2rho2, queue + kernels_[i].first, N, rho, v2rho2, stream ); } else { double* v2rho2_eval = i ? v2rho2_scr : v2rho2; @@ -747,74 +783,75 @@ MGGA_FXC_GENERATOR_DEVICE( XCFunctional::eval_fxc_device ) const { if( kernels_[i].second.is_mgga() ) kernels_[i].second.eval_fxc_device(N, rho, sigma, lapl, tau, v2rho2_eval, v2rhosigma_eval, v2rholapl_eval, v2rhotau_eval, v2sigma2_eval, v2sigmalapl_eval, - v2sigmatau_eval, v2lapl2_eval, v2lapltau_eval, v2tau2_eval, queue); + v2sigmatau_eval, v2lapl2_eval, v2lapltau_eval, v2tau2_eval, stream); else if( kernels_[i].second.is_gga() ) - kernels_[i].second.eval_fxc_device(N, rho, sigma, v2rho2_eval, v2rhosigma_eval, v2sigma2_eval, queue); + kernels_[i].second.eval_fxc_device(N, rho, sigma, v2rho2_eval, v2rhosigma_eval, v2sigma2_eval, stream); else - kernels_[i].second.eval_fxc_device(N, rho, v2rho2_eval, queue); + kernels_[i].second.eval_fxc_device(N, rho, v2rho2_eval, stream); if (i) { - add_scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue); + add_scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream); if( kernels_[i].second.is_gga() or kernels_[i].second.is_mgga() ){ - add_scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue); - add_scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue); + add_scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream); + add_scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream); } if( kernels_[i].second.needs_laplacian() ) { - add_scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, queue); - add_scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, queue); - add_scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, queue); + add_scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, stream); + add_scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, stream); + add_scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, stream); } if( kernels_[i].second.is_mgga() ) { - add_scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, queue); - add_scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, queue); - add_scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, queue); + add_scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, stream); + add_scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, stream); + add_scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, stream); } if ( kernels_[i].second.needs_laplacian() && kernels_[i].second.is_mgga() ) { - add_scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, queue); + add_scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, stream); } } else{ - scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue); + scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream); if (kernels_[i].second.is_gga() or kernels_[i].second.is_mgga()) { - scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue); - scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue); + scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream); + scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream); } if (kernels_[i].second.needs_laplacian()) { - scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, queue); - scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, queue); - scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, queue); + scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, stream); + scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, stream); + scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, stream); } if (kernels_[i].second.is_mgga()) { - scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, queue); - scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, queue); - scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, queue); + scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, stream); + scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, stream); + scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, stream); } if (kernels_[i].second.needs_laplacian() && kernels_[i].second.is_mgga()) { - scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, queue); + scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, stream); } } } } - if( v2rho2_scr ) sycl::free( v2rho2_scr, *queue); - if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *queue); - if( v2rholapl_scr ) sycl::free( v2rholapl_scr, *queue); - if( v2rhotau_scr ) sycl::free( v2rhotau_scr, *queue); - if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *queue); - if( v2sigmalapl_scr ) sycl::free( v2sigmalapl_scr, *queue); - if( v2sigmatau_scr ) sycl::free( v2sigmatau_scr, *queue); - if( v2lapl2_scr ) sycl::free( v2lapl2_scr, *queue); - if( v2lapltau_scr ) sycl::free( v2lapltau_scr, *queue); - if( v2tau2_scr ) sycl::free( v2tau2_scr, *queue); + stream->wait(); + if( v2rho2_scr ) sycl::free( v2rho2_scr, *stream ); + if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *stream ); + if( v2rholapl_scr ) sycl::free( v2rholapl_scr, *stream ); + if( v2rhotau_scr ) sycl::free( v2rhotau_scr, *stream ); + if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *stream ); + if( v2sigmalapl_scr ) sycl::free( v2sigmalapl_scr, *stream ); + if( v2sigmatau_scr ) sycl::free( v2sigmatau_scr, *stream ); + if( v2lapl2_scr ) sycl::free( v2lapl2_scr, *stream ); + if( v2lapltau_scr ) sycl::free( v2lapltau_scr, *stream ); + if( v2tau2_scr ) sycl::free( v2tau2_scr, *stream ); } MGGA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { @@ -844,36 +881,36 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { bool use_inc = supports_inc_interface(); if( kernels_.size() > 1 && !use_inc ) { - vrho_scr = safe_sycl_malloc( len_vrho_buffer, queue ); - vsigma_scr = safe_sycl_malloc( len_vsigma_buffer, queue ); - vlapl_scr = safe_sycl_malloc( len_vlapl_buffer, queue ); - vtau_scr = safe_sycl_malloc( len_vtau_buffer, queue ); - v2rho2_scr = safe_sycl_malloc( len_v2rho2_buffer, queue ); - v2rhosigma_scr = safe_sycl_malloc( len_v2rhosigma_buffer, queue ); - v2rholapl_scr = safe_sycl_malloc( len_v2rholapl_buffer, queue ); - v2rhotau_scr = safe_sycl_malloc(len_v2rhotau_buffer, queue); - v2sigma2_scr = safe_sycl_malloc(len_v2sigma2_buffer, queue); - v2sigmalapl_scr = safe_sycl_malloc(len_v2sigmalapl_buffer, queue); - v2sigmatau_scr = safe_sycl_malloc(len_v2sigmatau_buffer, queue); - v2lapl2_scr = safe_sycl_malloc(len_v2lapl2_buffer, queue); - v2lapltau_scr = safe_sycl_malloc(len_v2lapltau_buffer, queue); - v2tau2_scr = safe_sycl_malloc(len_v2tau2_buffer, queue); + vrho_scr = safe_cuda_malloc( len_vrho_buffer, stream ); + vsigma_scr = safe_cuda_malloc( len_vsigma_buffer, stream ); + vlapl_scr = safe_cuda_malloc( len_vlapl_buffer, stream ); + vtau_scr = safe_cuda_malloc( len_vtau_buffer, stream ); + v2rho2_scr = safe_cuda_malloc( len_v2rho2_buffer, stream ); + v2rhosigma_scr = safe_cuda_malloc( len_v2rhosigma_buffer, stream ); + v2rholapl_scr = safe_cuda_malloc( len_v2rholapl_buffer, stream ); + v2rhotau_scr = safe_cuda_malloc(len_v2rhotau_buffer, stream ); + v2sigma2_scr = safe_cuda_malloc(len_v2sigma2_buffer, stream ); + v2sigmalapl_scr = safe_cuda_malloc(len_v2sigmalapl_buffer, stream ); + v2sigmatau_scr = safe_cuda_malloc(len_v2sigmatau_buffer, stream ); + v2lapl2_scr = safe_cuda_malloc(len_v2lapl2_buffer, stream ); + v2lapltau_scr = safe_cuda_malloc(len_v2lapltau_buffer, stream ); + v2tau2_scr = safe_cuda_malloc(len_v2tau2_buffer, stream ); } - safe_zero(len_vrho_buffer, vrho, queue); - safe_zero(len_vsigma_buffer, vsigma, queue); - safe_zero(len_vlapl_buffer, vlapl, queue); - safe_zero(len_vtau_buffer, vtau, queue); - safe_zero(len_v2rho2_buffer, v2rho2, queue); - safe_zero(len_v2rhosigma_buffer, v2rhosigma, queue); - safe_zero(len_v2rholapl_buffer, v2rholapl, queue); - safe_zero(len_v2rhotau_buffer, v2rhotau, queue); - safe_zero(len_v2sigma2_buffer, v2sigma2, queue); - safe_zero(len_v2sigmalapl_buffer, v2sigmalapl, queue); - safe_zero(len_v2sigmatau_buffer, v2sigmatau, queue); - safe_zero(len_v2lapl2_buffer, v2lapl2, queue); - safe_zero(len_v2lapltau_buffer, v2lapltau, queue); - safe_zero(len_v2tau2_buffer, v2tau2, queue); + safe_zero(len_vrho_buffer, vrho, stream); + safe_zero(len_vsigma_buffer, vsigma, stream); + safe_zero(len_vlapl_buffer, vlapl, stream); + safe_zero(len_vtau_buffer, vtau, stream); + safe_zero(len_v2rho2_buffer, v2rho2, stream); + safe_zero(len_v2rhosigma_buffer, v2rhosigma, stream); + safe_zero(len_v2rholapl_buffer, v2rholapl, stream); + safe_zero(len_v2rhotau_buffer, v2rhotau, stream); + safe_zero(len_v2sigma2_buffer, v2sigma2, stream); + safe_zero(len_v2sigmalapl_buffer, v2sigmalapl, stream); + safe_zero(len_v2sigmatau_buffer, v2sigmatau, stream); + safe_zero(len_v2lapl2_buffer, v2lapl2, stream); + safe_zero(len_v2lapltau_buffer, v2lapltau, stream); + safe_zero(len_v2tau2_buffer, v2tau2, stream); for (auto i = 0ul; i < kernels_.size(); ++i) { if( use_inc ) { @@ -882,14 +919,14 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { kernels_[i].first, N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, - v2lapltau, v2tau2, queue); + v2lapltau, v2tau2, stream); } else if (kernels_[i].second.is_gga()) { kernels_[i].second.eval_vxc_fxc_inc_device( kernels_[i].first, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, - v2sigma2, queue); + v2sigma2, stream); } else { kernels_[i].second.eval_vxc_fxc_inc_device( - kernels_[i].first, N, rho, vrho, v2rho2, queue); + kernels_[i].first, N, rho, vrho, v2rho2, stream); } } else { double* vrho_eval = i ? vrho_scr : vrho; @@ -912,87 +949,88 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( XCFunctional::eval_vxc_fxc_device ) const { N, rho, sigma, lapl, tau, vrho_eval, vsigma_eval, vlapl_eval, vtau_eval, v2rho2_eval, v2rhosigma_eval, v2rholapl_eval, v2rhotau_eval, v2sigma2_eval, v2sigmalapl_eval, v2sigmatau_eval, v2lapl2_eval, - v2lapltau_eval, v2tau2_eval, queue); + v2lapltau_eval, v2tau2_eval, stream); } else if (kernels_[i].second.is_gga()) { kernels_[i].second.eval_vxc_fxc_device( N, rho, sigma, vrho_eval, vsigma_eval, v2rho2_eval, v2rhosigma_eval, - v2sigma2_eval, queue); + v2sigma2_eval, stream); } else { - kernels_[i].second.eval_vxc_fxc_device(N, rho, vrho_eval, v2rho2_eval, queue); + kernels_[i].second.eval_vxc_fxc_device(N, rho, vrho_eval, v2rho2_eval, stream); } if (i) { - add_scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue); - add_scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue); + add_scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream); + add_scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream); if (kernels_[i].second.is_gga() || kernels_[i].second.is_mgga()) { - add_scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue); - add_scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue); - add_scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue); + add_scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream); + add_scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream); + add_scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream); } if (kernels_[i].second.needs_laplacian()) { - add_scal_device(len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, queue); - add_scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, queue); - add_scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, queue); - add_scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, queue); + add_scal_device(len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, stream); + add_scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, stream); + add_scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, stream); + add_scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, stream); } if (kernels_[i].second.is_mgga()) { - add_scal_device(len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, queue); - add_scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, queue); - add_scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, queue); - add_scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, queue); + add_scal_device(len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, stream); + add_scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, stream); + add_scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, stream); + add_scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, stream); } if (kernels_[i].second.needs_laplacian() && kernels_[i].second.is_mgga()) { - add_scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, queue); + add_scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, stream); } } else { - scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, queue); - scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, queue); + scal_device(len_vrho_buffer, kernels_[i].first, vrho_eval, vrho, stream); + scal_device(len_v2rho2_buffer, kernels_[i].first, v2rho2_eval, v2rho2, stream); if (kernels_[i].second.is_gga() || kernels_[i].second.is_mgga()) { - scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, queue); - scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, queue); - scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, queue); + scal_device(len_vsigma_buffer, kernels_[i].first, vsigma_eval, vsigma, stream); + scal_device(len_v2rhosigma_buffer, kernels_[i].first, v2rhosigma_eval, v2rhosigma, stream); + scal_device(len_v2sigma2_buffer, kernels_[i].first, v2sigma2_eval, v2sigma2, stream); } if (kernels_[i].second.needs_laplacian()) { - scal_device(len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, queue); - scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, queue); - scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, queue); - scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, queue); + scal_device(len_vlapl_buffer, kernels_[i].first, vlapl_eval, vlapl, stream); + scal_device(len_v2rholapl_buffer, kernels_[i].first, v2rholapl_eval, v2rholapl, stream); + scal_device(len_v2sigmalapl_buffer, kernels_[i].first, v2sigmalapl_eval, v2sigmalapl, stream); + scal_device(len_v2lapl2_buffer, kernels_[i].first, v2lapl2_eval, v2lapl2, stream); } if (kernels_[i].second.is_mgga()) { - scal_device(len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, queue); - scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, queue); - scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, queue); - scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, queue); + scal_device(len_vtau_buffer, kernels_[i].first, vtau_eval, vtau, stream); + scal_device(len_v2rhotau_buffer, kernels_[i].first, v2rhotau_eval, v2rhotau, stream); + scal_device(len_v2sigmatau_buffer, kernels_[i].first, v2sigmatau_eval, v2sigmatau, stream); + scal_device(len_v2tau2_buffer, kernels_[i].first, v2tau2_eval, v2tau2, stream); } if (kernels_[i].second.needs_laplacian() && kernels_[i].second.is_mgga()) { - scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, queue); + scal_device(len_v2lapltau_buffer, kernels_[i].first, v2lapltau_eval, v2lapltau, stream); } } } } - if( vrho_scr ) sycl::free( vrho_scr, *queue); - if( vsigma_scr ) sycl::free( vsigma_scr, *queue); - if( vlapl_scr ) sycl::free( vlapl_scr, *queue); - if( vtau_scr ) sycl::free( vtau_scr, *queue); - if( v2rho2_scr ) sycl::free( v2rho2_scr, *queue); - if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *queue); - if( v2rholapl_scr ) sycl::free( v2rholapl_scr, *queue); - if( v2rhotau_scr ) sycl::free( v2rhotau_scr, *queue); - if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *queue); - if( v2sigmalapl_scr ) sycl::free( v2sigmalapl_scr, *queue); - if( v2sigmatau_scr ) sycl::free( v2sigmatau_scr, *queue); - if( v2lapl2_scr ) sycl::free( v2lapl2_scr, *queue); - if( v2lapltau_scr ) sycl::free( v2lapltau_scr, *queue); - if( v2tau2_scr ) sycl::free( v2tau2_scr, *queue); + stream->wait(); + if( vrho_scr ) sycl::free( vrho_scr, *stream ); + if( vsigma_scr ) sycl::free( vsigma_scr, *stream ); + if( vlapl_scr ) sycl::free( vlapl_scr, *stream ); + if( vtau_scr ) sycl::free( vtau_scr, *stream ); + if( v2rho2_scr ) sycl::free( v2rho2_scr, *stream ); + if( v2rhosigma_scr ) sycl::free( v2rhosigma_scr, *stream ); + if( v2rholapl_scr ) sycl::free( v2rholapl_scr, *stream ); + if( v2rhotau_scr ) sycl::free( v2rhotau_scr, *stream ); + if( v2sigma2_scr ) sycl::free( v2sigma2_scr, *stream ); + if( v2sigmalapl_scr ) sycl::free( v2sigmalapl_scr, *stream ); + if( v2sigmatau_scr ) sycl::free( v2sigmatau_scr, *stream ); + if( v2lapl2_scr ) sycl::free( v2lapl2_scr, *stream ); + if( v2lapltau_scr ) sycl::free( v2lapltau_scr, *stream ); + if( v2tau2_scr ) sycl::free( v2tau2_scr, *stream ); } diff --git a/test/xc_kernel_test.cxx b/test/xc_kernel_test.cxx index 71a099c..8956d86 100644 --- a/test/xc_kernel_test.cxx +++ b/test/xc_kernel_test.cxx @@ -1257,7 +1257,7 @@ void compare_libxc_builtin( TestInterface interface, EvalType evaltype, } -TEST_CASE( "Builtin Corectness Test", "[xc-builtin]" ) { +TEST_CASE( "Builtin Correctness Test", "[xc-builtin]" ) { SECTION( "Unpolarized Regular Eval : EXC" ) { for( auto kern : builtin_supported_kernels ) { @@ -2183,7 +2183,7 @@ void sycl_free_all( sycl::queue& q, T* ptr, Args&&... args ) { } void device_synchronize( sycl::queue& q ) { -q.wait_and_throw(); + q.wait_and_throw(); } From 64122afb4f46cee9cab0f68aa4d5742d13e58004 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Wed, 11 Mar 2026 19:46:18 +0000 Subject: [PATCH 2/3] cmake fix for AoT mode in SYCL --- src/sycl/exchcxx_sycl.cmake | 7 +++++++ test/CMakeLists.txt | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/sycl/exchcxx_sycl.cmake b/src/sycl/exchcxx_sycl.cmake index 9ce1e4d..a9a3767 100644 --- a/src/sycl/exchcxx_sycl.cmake +++ b/src/sycl/exchcxx_sycl.cmake @@ -16,6 +16,7 @@ target_link_libraries( exchcxx PUBLIC SYCL::SYCL ) # --- AoT-builds SYCL target alias pass-through --- set(_EXCHCXX_SYCL_ALLOWED + spir64_gen intel_gpu_pvc nvidia_gpu_sm_80 nvidia_gpu_sm_90 @@ -60,3 +61,9 @@ if( EXCHCXX_SYCL_HAS_NO_EARLY_OPTIMIZATIONS ) $<$: -fno-sycl-early-optimizations> ) endif() + + +target_link_options(exchcxx PRIVATE + "SHELL:-fsycl-targets=spir64_gen" + "SHELL:-Xsycl-target-backend \"-device pvc\"" +) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fef6bea..a0be1a4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -28,5 +28,15 @@ add_executable( xc_functional_test xc_functional_test.cxx reference_values.cxx ) target_link_libraries( xc_functional_test PUBLIC exchcxx catch2_main ) target_compile_features( xc_functional_test PRIVATE cxx_std_17 ) +target_link_options(xc_kernel_test PRIVATE + "SHELL:-fsycl-targets=spir64_gen" + "SHELL:-Xsycl-target-backend \"-device pvc\"" +) +target_link_options(xc_functional_test PRIVATE + "SHELL:-fsycl-targets=spir64_gen" + "SHELL:-Xsycl-target-backend \"-device pvc\"" +) + + add_test( NAME XC_KERNEL COMMAND xc_kernel_test ) add_test( NAME XC_FUNCTIONAL COMMAND xc_functional_test ) From d7ed4af53ce0499da62d38e54e100713ee565e8f Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 13 Mar 2026 00:08:56 +0000 Subject: [PATCH 3/3] Clean up SYCL compiler and linker flags, arg warnings --- CMakeLists.txt | 8 ++++ src/sycl/builtin_sycl.cxx | 96 ++++++++++++++++++------------------- src/sycl/exchcxx_sycl.cmake | 83 +++++++++++++++++++++++--------- test/CMakeLists.txt | 10 ---- 4 files changed, 117 insertions(+), 80 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92eca2a..70e5adb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,14 @@ else() set( BUILD_TESTING OFF CACHE BOOL "" FORCE ) FetchContent_MakeAvailable( libxc ) + + #Note: Missing this would cause test failures + if(CMAKE_C_COMPILER_ID STREQUAL "IntelLLVM") + target_compile_options(xc PRIVATE + $<$:-fp-model=precise> + ) + endif() + add_library( Libxc::xc ALIAS xc ) target_include_directories( xc PUBLIC diff --git a/src/sycl/builtin_sycl.cxx b/src/sycl/builtin_sycl.cxx index 09f324a..31a6d24 100644 --- a/src/sycl/builtin_sycl.cxx +++ b/src/sycl/builtin_sycl.cxx @@ -1268,7 +1268,7 @@ LDA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_unpolar_kernel( N, rho, eps); }); @@ -1280,7 +1280,7 @@ LDA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_polar_kernel( N, rho, eps); }); @@ -1292,7 +1292,7 @@ LDA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_unpolar_kernel( N, rho, eps, vxc); }); @@ -1304,7 +1304,7 @@ LDA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_polar_kernel( N, rho, eps, vxc); }); @@ -1316,7 +1316,7 @@ LDA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_unpolar_kernel( N, rho, fxc); }); @@ -1328,7 +1328,7 @@ LDA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_polar_kernel( N, rho, fxc); }); @@ -1339,7 +1339,7 @@ LDA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_unpolar_kernel( N, rho, vxc, fxc); }); @@ -1351,7 +1351,7 @@ LDA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_polar_kernel( N, rho, vxc, fxc); }); @@ -1363,7 +1363,7 @@ LDA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_unpolar_kernel( scal_fact, N, rho, eps); }); @@ -1375,7 +1375,7 @@ LDA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_polar_kernel( scal_fact, N, rho, eps); }); @@ -1387,7 +1387,7 @@ LDA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_unpolar_kernel( scal_fact, N, rho, eps, vxc); }); @@ -1399,7 +1399,7 @@ LDA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_polar_kernel( scal_fact, N, rho, eps, vxc); }); @@ -1411,7 +1411,7 @@ LDA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, fxc); }); @@ -1423,7 +1423,7 @@ LDA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_polar_kernel( scal_fact, N, rho, fxc); }); @@ -1435,7 +1435,7 @@ LDA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, vxc, fxc); }); @@ -1447,7 +1447,7 @@ LDA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_polar_kernel( scal_fact, N, rho, vxc, fxc); }); @@ -1462,7 +1462,7 @@ GGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_unpolar_kernel( N, rho, sigma, eps); }); @@ -1474,7 +1474,7 @@ GGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_polar_kernel( N, rho, sigma, eps); }); @@ -1486,7 +1486,7 @@ GGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_unpolar_kernel( N, rho, sigma, eps, vrho, vsigma); }); @@ -1498,7 +1498,7 @@ GGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_polar_kernel( N, rho, sigma, eps, vrho, vsigma); }); @@ -1510,7 +1510,7 @@ GGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_unpolar_kernel( N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1522,7 +1522,7 @@ GGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_polar_kernel( N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1534,7 +1534,7 @@ GGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_unpolar_kernel( N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1546,7 +1546,7 @@ GGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_polar_kernel( N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1558,7 +1558,7 @@ GGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, eps); }); @@ -1570,7 +1570,7 @@ GGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, eps); }); @@ -1582,7 +1582,7 @@ GGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, eps, vrho, vsigma); }); @@ -1594,7 +1594,7 @@ GGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, eps, vrho, vsigma); }); @@ -1607,7 +1607,7 @@ GGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1618,7 +1618,7 @@ GGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1629,7 +1629,7 @@ GGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1640,7 +1640,7 @@ GGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, vrho, vsigma, v2rho2, v2rhosigma, v2sigma2); }); @@ -1652,7 +1652,7 @@ MGGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_unpolar_kernel( N, rho, sigma, lapl, tau, eps); }); @@ -1664,7 +1664,7 @@ MGGA_EXC_GENERATOR_DEVICE( device_eval_exc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_helper_polar_kernel( N, rho, sigma, lapl, tau, eps); }); @@ -1676,7 +1676,7 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_unpolar_kernel( N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); @@ -1688,7 +1688,7 @@ MGGA_EXC_VXC_GENERATOR_DEVICE( device_eval_exc_vxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_helper_polar_kernel( N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); @@ -1700,7 +1700,7 @@ MGGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_unpolar_kernel( N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2); @@ -1713,7 +1713,7 @@ MGGA_FXC_GENERATOR_DEVICE( device_eval_fxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_helper_polar_kernel( N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2, v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2); @@ -1726,7 +1726,7 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_unpolar_kernel( N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, @@ -1741,7 +1741,7 @@ MGGA_VXC_FXC_GENERATOR_DEVICE( device_eval_vxc_fxc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_helper_polar_kernel( N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, @@ -1756,7 +1756,7 @@ MGGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, lapl, tau, eps); }); @@ -1768,7 +1768,7 @@ MGGA_EXC_INC_GENERATOR_DEVICE( device_eval_exc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, lapl, tau, eps); }); @@ -1780,7 +1780,7 @@ MGGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); @@ -1792,7 +1792,7 @@ MGGA_EXC_VXC_INC_GENERATOR_DEVICE( device_eval_exc_vxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_exc_vxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, lapl, tau, eps, vrho, vsigma, vlapl, vtau); }); @@ -1804,7 +1804,7 @@ MGGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, @@ -1819,7 +1819,7 @@ MGGA_FXC_INC_GENERATOR_DEVICE( device_eval_fxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_fxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, lapl, tau, v2rho2, v2rhosigma, v2rholapl, v2rhotau, @@ -1834,7 +1834,7 @@ MGGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_unpolar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_unpolar_kernel( scal_fact, N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, @@ -1850,7 +1850,7 @@ MGGA_VXC_FXC_INC_GENERATOR_DEVICE( device_eval_vxc_fxc_inc_helper_polar ) { sycl::range<1> threads(32); sycl::range<1> blocks( util::div_ceil( N, threads[0]) ); - stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=](auto item) [[sycl::reqd_work_group_size(32)]] { + stream->parallel_for>( sycl::nd_range<1>(blocks * threads, threads), [=]([[maybe_unused]] auto item) [[sycl::reqd_work_group_size(32)]] { device_eval_vxc_fxc_inc_helper_polar_kernel( scal_fact, N, rho, sigma, lapl, tau, vrho, vsigma, vlapl, vtau, diff --git a/src/sycl/exchcxx_sycl.cmake b/src/sycl/exchcxx_sycl.cmake index a9a3767..d167b3d 100644 --- a/src/sycl/exchcxx_sycl.cmake +++ b/src/sycl/exchcxx_sycl.cmake @@ -15,55 +15,94 @@ target_link_libraries( exchcxx PUBLIC SYCL::SYCL ) # --- AoT-builds SYCL target alias pass-through --- +# User-facing aliases set(_EXCHCXX_SYCL_ALLOWED - spir64_gen intel_gpu_pvc nvidia_gpu_sm_80 nvidia_gpu_sm_90 amd_gpu_gfx90a amd_gpu_gfx942 ) + if(DEFINED EXCHCXX_SYCL_TARGET AND NOT EXCHCXX_SYCL_TARGET STREQUAL "") list(FIND _EXCHCXX_SYCL_ALLOWED "${EXCHCXX_SYCL_TARGET}" _exchcxx_sycl_idx) if(_exchcxx_sycl_idx EQUAL -1) - message(FATAL_ERROR "Invalid EXCHCXX_SYCL_TARGET='${EXCHCXX_SYCL_TARGET}'. " "Allowed values: ${_EXCHCXX_SYCL_ALLOWED}") + message(FATAL_ERROR + "Invalid EXCHCXX_SYCL_TARGET='${EXCHCXX_SYCL_TARGET}'. " + "Allowed values: ${_EXCHCXX_SYCL_ALLOWED}") endif() - target_compile_options( exchcxx PRIVATE -fsycl-targets=${EXCHCXX_SYCL_TARGET} ) - target_link_options( exchcxx PRIVATE -fsycl-targets=${EXCHCXX_SYCL_TARGET} ) + unset(_exchcxx_sycl_compile_opts) + unset(_exchcxx_sycl_link_opts) + + if(EXCHCXX_SYCL_TARGET STREQUAL "intel_gpu_pvc") + list(APPEND _exchcxx_sycl_compile_opts + -fsycl-default-sub-group-size=32 + -fsycl-targets=spir64_gen + "SHELL:-Xsycl-target-backend \"-device pvc\"" + ) + list(APPEND _exchcxx_sycl_link_opts + "SHELL:-ftarget-register-alloc-mode=pvc:large" + "SHELL:-fsycl-targets=spir64_gen" + "SHELL:-Xsycl-target-backend \"-device pvc\"" + ) + endif() + + target_compile_options(exchcxx PRIVATE + $<$:${_exchcxx_sycl_compile_opts}> + ) + target_link_options(exchcxx PRIVATE + ${_exchcxx_sycl_link_opts} + ) + message(STATUS "ExchCXX SYCL AoT enabled for target: ${EXCHCXX_SYCL_TARGET}") endif() -target_compile_options(exchcxx PRIVATE $<$:-ffp-model=precise>) -target_link_options(exchcxx PRIVATE -fsycl-max-parallel-link-jobs=20) - include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-fno-sycl-id-queries-fit-in-int" EXCHCXX_SYCL_ID_QUERIES_FIT_IN_INT ) -check_cxx_compiler_flag("-fsycl-device-code-split=per_source" EXCHCXX_SYCL_DEVICE_CODE_SPLIT_PER_SOURCE ) -check_cxx_compiler_flag("-fno-sycl-early-optimizations" EXCHCXX_SYCL_HAS_NO_EARLY_OPTIMIZATIONS ) +check_cxx_compiler_flag("-fsycl-device-code-split=per_kernel" EXCHCXX_SYCL_DEVICE_CODE_SPLIT_PER_KERNEL ) +check_cxx_compiler_flag("-Xsycl-target-frontend \"-fp-model=precise\"" EXCHCXX_HAVE_SYCL_TARGET_FRONTEND_FP_MODEL_PRECISE ) + +include(CheckLinkerFlag) +check_linker_flag(CXX "-flink-huge-device-code" EXCHCXX_SYCL_LINK_HUGE_DEVICE_CODE) +check_linker_flag(CXX "--offload-compress" EXCHCXX_SYCL_OFFLOAD_COMPRESS) +check_linker_flag(CXX "-fsycl-max-parallel-link-jobs=16" EXCHCXX_SYCL_MAX_PARALLEL_LINK_JOBS) -if( EXCHCXX_SYCL_ID_QUERIES_FIT_IN_INT ) - target_compile_options( exchcxx PRIVATE - $<$: -fno-sycl-id-queries-fit-in-int> + +if(EXCHCXX_SYCL_ID_QUERIES_FIT_IN_INT) + target_compile_options(exchcxx PRIVATE + $<$:-fno-sycl-id-queries-fit-in-int> ) endif() -if( EXCHCXX_SYCL_DEVICE_CODE_SPLIT_PER_SOURCE ) - target_compile_options( exchcxx PRIVATE - $<$: -fsycl-device-code-split=per_source> +if(EXCHCXX_SYCL_DEVICE_CODE_SPLIT_PER_KERNEL) + target_compile_options(exchcxx PRIVATE + $<$:-fsycl-device-code-split=per_kernel> ) endif() -if( EXCHCXX_SYCL_HAS_NO_EARLY_OPTIMIZATIONS ) - target_compile_options( exchcxx PRIVATE - $<$: -fno-sycl-early-optimizations> +if(EXCHCXX_HAVE_SYCL_TARGET_FRONTEND_FP_MODEL_PRECISE) + target_compile_options(exchcxx PRIVATE + "$<$:SHELL:-Xsycl-target-frontend -fp-model=precise>" ) endif() +if(EXCHCXX_SYCL_LINK_HUGE_DEVICE_CODE) + target_link_options(exchcxx PRIVATE + $<$:-flink-huge-device-code> + ) +endif() -target_link_options(exchcxx PRIVATE - "SHELL:-fsycl-targets=spir64_gen" - "SHELL:-Xsycl-target-backend \"-device pvc\"" -) +if(EXCHCXX_SYCL_OFFLOAD_COMPRESS) + target_link_options(exchcxx PRIVATE + $<$:--offload-compress> + ) +endif() + +if(EXCHCXX_SYCL_MAX_PARALLEL_LINK_JOBS) + target_link_options(exchcxx PRIVATE + $<$:-fsycl-max-parallel-link-jobs=16> + ) +endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a0be1a4..fef6bea 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -28,15 +28,5 @@ add_executable( xc_functional_test xc_functional_test.cxx reference_values.cxx ) target_link_libraries( xc_functional_test PUBLIC exchcxx catch2_main ) target_compile_features( xc_functional_test PRIVATE cxx_std_17 ) -target_link_options(xc_kernel_test PRIVATE - "SHELL:-fsycl-targets=spir64_gen" - "SHELL:-Xsycl-target-backend \"-device pvc\"" -) -target_link_options(xc_functional_test PRIVATE - "SHELL:-fsycl-targets=spir64_gen" - "SHELL:-Xsycl-target-backend \"-device pvc\"" -) - - add_test( NAME XC_KERNEL COMMAND xc_kernel_test ) add_test( NAME XC_FUNCTIONAL COMMAND xc_functional_test )