From daa4b6283f01858bffb956a488a380342a5bcce0 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 24 Feb 2026 12:14:28 +0100 Subject: [PATCH 1/2] Add support for detection of GPU software --- .github/workflows/docs.yml | 5 ++ .github/workflows/prs.yml | 5 ++ scripts/generate_data_files.py | 81 +++++++++++++++------- scripts/process_eessi_software_metadata.py | 40 +++++++++-- 4 files changed, 100 insertions(+), 31 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index d32e903..8196839 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -33,6 +33,11 @@ jobs: module purge module unuse $MODULEPATH module use /cvmfs/software.eessi.io/init/modules/ + # Set CPU override for the stack to query + export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="x86_64/intel/icelake" + # Set GPU overrides for the stack to query (we have no GPU, so also override the GPU check) + export EESSI_ACCELERATOR_TARGET_OVERRIDE=accel/nvidia/cc90 + export EESSI_OVERRIDE_GPU_CHECK=1 # First do 2023.06 for EB 4 ( module load EESSI/2023.06 && module load EasyBuild/4 && module load EESSI-extend && python scripts/generate_data_files.py --eessi-version=2023.06 ) & # then 2023.06 for EB 5 diff --git a/.github/workflows/prs.yml b/.github/workflows/prs.yml index 6250365..c6187ca 100644 --- a/.github/workflows/prs.yml +++ b/.github/workflows/prs.yml @@ -28,6 +28,11 @@ jobs: module purge module unuse $MODULEPATH module use /cvmfs/software.eessi.io/init/modules/ + # Set CPU override for the stack to query + export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="x86_64/intel/icelake" + # Set GPU overrides for the stack to query (we have no GPU, so also override the GPU check) + export EESSI_ACCELERATOR_TARGET_OVERRIDE=accel/nvidia/cc90 + export EESSI_OVERRIDE_GPU_CHECK=1 # Only do 2023.06 for EB 5 since this is just a test ( module load EESSI/2023.06 && module load EasyBuild/5 && module load EESSI-extend && python scripts/generate_data_files.py --eessi-version=2023.06 ) & # 
Merge all these results together diff --git a/scripts/generate_data_files.py b/scripts/generate_data_files.py index 9107f31..b16e405 100644 --- a/scripts/generate_data_files.py +++ b/scripts/generate_data_files.py @@ -10,15 +10,16 @@ from collections import defaultdict, OrderedDict from datetime import datetime, timezone from easybuild.tools.version import VERSION as EASYBUILD_VERSION -from easybuild.framework.easyconfig.easyconfig import process_easyconfig, get_toolchain_hierarchy +from easybuild.framework.easyconfig.easyconfig import ( + process_easyconfig, + get_toolchain_hierarchy, +) from easybuild.tools.options import set_up_configuration from easybuild.tools.include import include_easyblocks from contextlib import contextmanager VALID_EESSI_VERSIONS = ["2025.06", "2023.06"] -EESSI_REFERENCE_ARCHITECTURE = "x86_64/intel/icelake" - # Give order to my toolchains so I can easily figure out what "latest" means EESSI_SUPPORTED_TOP_LEVEL_TOOLCHAINS = OrderedDict( { @@ -49,7 +50,11 @@ def suppress_stdout(): def module_dict_from_module_string(module): module_name, module_version = module.split("/", 1) - module_dict = {"module_name": module_name, "module_version": module_version, "full_module_name": module} + module_dict = { + "module_name": module_name, + "module_version": module_version, + "full_module_name": module, + } return module_dict @@ -182,6 +187,16 @@ def collect_eb_files(base_path): return dict(eb_files_by_version) +def merge_dicts(d1, d2): + merged = defaultdict(list) + + for d in (d1, d2): + for key, value in d.items(): + merged[key].extend(value) + + return dict(merged) + + if __name__ == "__main__": # The EESSI version is provided as an argument parser = argparse.ArgumentParser(description="EESSI version to scan.") @@ -199,10 +214,23 @@ def collect_eb_files(base_path): print(f"Using EESSI version: {eessi_version}") # We use a single architecture path to gather information about the software versions - base_path = ( - 
f"/cvmfs/software.eessi.io/versions/{eessi_version}/software/linux/{EESSI_REFERENCE_ARCHITECTURE}/software/" - ) - result = collect_eb_files(base_path) + eessi_reference_architecture = os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE", False) + if not eessi_reference_architecture: + print("You must have selected a CPU architecture via EESSI_ARCHDETECT_OPTIONS_OVERRIDE") + exit() + base_path = f"/cvmfs/software.eessi.io/versions/{eessi_version}/software/linux/{eessi_reference_architecture}" + cpu_easyconfig_files_dict = collect_eb_files(os.path.join(base_path, "software")) + # We also gather all the accelerator installations for NVIDIA-enabled packages + # We're not typically running this script on a node with a GPU so an override must have been set + eessi_reference_nvidia_architecture = os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE", False) + if not eessi_reference_nvidia_architecture: + print("You must have selected a GPU architecture via EESSI_ACCELERATOR_TARGET_OVERRIDE") + exit() + accel_base_path = os.path.join(base_path, eessi_reference_nvidia_architecture) + accel_easyconfig_files_dict = collect_eb_files(os.path.join(accel_base_path, "software")) + + # Merge the easyconfig files + easyconfig_files_dict = merge_dicts(cpu_easyconfig_files_dict, accel_easyconfig_files_dict) set_up_configuration(args="") tmpdir = tempfile.mkdtemp() @@ -224,23 +252,23 @@ def collect_eb_files(base_path): {"name": "system", "version": "system"} ] + get_toolchain_hierarchy(top_level_toolchain) - for eb_version_of_install, files in sorted(result.items()): + for eb_version_of_install, easyconfigs in sorted(easyconfig_files_dict.items()): print(f"Major version {eb_version_of_install}:") if eb_version_of_install == str(EASYBUILD_VERSION.version[0]): - total_files = len(files) - for i, file in enumerate(files, start=1): - percent = (i / total_files) * 100 - print(f"{percent:.1f}% - {file}") + total_easyconfigs = len(easyconfigs) + for i, easyconfig in enumerate(easyconfigs, start=1): + 
percent = (i / total_easyconfigs) * 100 + print(f"{percent:.1f}% - {easyconfig}") # Don't try to parse an EasyBuild easyconfig that is not the same major release - if "/software/EasyBuild/" in file and f"/EasyBuild/{eb_version_of_install}" not in file: + if "/software/EasyBuild/" in easyconfig and f"/EasyBuild/{eb_version_of_install}" not in easyconfig: continue # print(process_easyconfig(path)[0]['ec'].asdict()) - eb_hooks_path = use_timestamped_reprod_if_exists(f"{os.path.dirname(file)}/reprod/easyblocks") + eb_hooks_path = use_timestamped_reprod_if_exists(f"{os.path.dirname(easyconfig)}/reprod/easyblocks") easyblocks_dir = include_easyblocks(tmpdir, [eb_hooks_path + "/*.py"]) with suppress_stdout(): - parsed_ec = process_easyconfig(file)[0] + parsed_ec = process_easyconfig(easyconfig)[0] # included easyblocks are the first entry in sys.path, so just pop them but keep a list of what was used sys.path.pop(0) easyblocks_used = [ @@ -252,26 +280,29 @@ def collect_eb_files(base_path): # Store everything we now know about the installation as a dict # Use the path as the key since we know it is unique - eessi_software["eessi_version"][eessi_version][file] = parsed_ec["ec"].asdict() - eessi_software["eessi_version"][eessi_version][file]["mtime"] = os.path.getmtime(file) + eessi_software["eessi_version"][eessi_version][easyconfig] = parsed_ec["ec"].asdict() + eessi_software["eessi_version"][eessi_version][easyconfig]["mtime"] = os.path.getmtime(easyconfig) # Make sure we can load the module before adding it's information to the main dict try: - eessi_software["eessi_version"][eessi_version][file]["required_modules"] = load_and_list_modules( - parsed_ec["full_mod_name"] + eessi_software["eessi_version"][eessi_version][easyconfig]["required_modules"] = ( + load_and_list_modules(parsed_ec["full_mod_name"]) ) except RuntimeError as e: - print(f"Ignoring {file} due to error processing module: {e}") - eessi_software["eessi_version"][eessi_version].pop(file) + print(f"Ignoring 
{easyconfig} due to error processing module: {e}") + eessi_software["eessi_version"][eessi_version].pop(easyconfig) continue # Add important data that is related to the module environment - eessi_software["eessi_version"][eessi_version][file]["module"] = module_dict_from_module_string( + eessi_software["eessi_version"][eessi_version][easyconfig]["module"] = module_dict_from_module_string( parsed_ec["full_mod_name"] ) # Retain the easyblocks used so we can use a heuristic to figure out the type of extensions (R, Python, Perl) - eessi_software["eessi_version"][eessi_version][file]["easyblocks"] = easyblocks_used + eessi_software["eessi_version"][eessi_version][easyconfig]["easyblocks"] = easyblocks_used # Store the result - with open(f"eessi_software_{eessi_version}-eb{str(EASYBUILD_VERSION.version[0])}.yaml", "w") as f: + with open( + f"eessi_software_{eessi_version}-eb{str(EASYBUILD_VERSION.version[0])}.yaml", + "w", + ) as f: yaml.dump(eessi_software, f) diff --git a/scripts/process_eessi_software_metadata.py b/scripts/process_eessi_software_metadata.py index 9778419..dfe23d2 100644 --- a/scripts/process_eessi_software_metadata.py +++ b/scripts/process_eessi_software_metadata.py @@ -24,6 +24,14 @@ "x86_64/intel/cascadelake", ] +NVIDIA_ARCHITECTURES = [ + "accel/nvidia/cc70", + "accel/nvidia/cc80", + "accel/nvidia/cc90", + "accel/nvidia/cc100", + "accel/nvidia/cc120", +] + TOOLCHAIN_FAMILIES = [ "2025b_foss", "2025a_foss", @@ -59,13 +67,25 @@ def get_software_information_by_filename(file_metadata, original_path=None, tool if f"/{arch}/" in original_path: detected_arch = arch break - if detected_arch is None: raise RuntimeError("No known architecture matched in the input path.") + # also detect the GPU arch (this one may not exist) + # needs to be a dict as we can filter on associated cpu arch + base_version_dict["gpu_arch"] = {} + detected_accel_arch = None + for accel_arch in NVIDIA_ARCHITECTURES: + if f"/{accel_arch}/" in original_path: + detected_accel_arch = 
accel_arch + break + if detected_accel_arch is None: + # Not having a GPU is not an error (we can just leave it empty, which is falsey) + detected_accel_arch = "" + # 2) Construct the modulefile path before_arch, _, _ = original_path.partition(detected_arch) - modulefile = before_arch + detected_arch + "/modules/all/" + file_metadata["module"]["full_module_name"] + ".lua" + # Remember, detected_accel_arch can be an empty string + modulefile = os.path.join(before_arch, detected_arch, detected_accel_arch, "modules/all", file_metadata["module"]["full_module_name"] + ".lua") spider_cache = before_arch + detected_arch + "/.lmod/cache/spiderT.lua" # 3) Substitute each architecture and test module file existence in spider cache @@ -76,13 +96,21 @@ def get_software_information_by_filename(file_metadata, original_path=None, tool found = subprocess.run(["grep", "-q", substituted_modulefile, substituted_spider_cache]).returncode == 0 if found: base_version_dict["cpu_arch"].append(arch) + # If we have an accelerator module let's check which architectures are supported + if detected_accel_arch: + base_version_dict["gpu_arch"][arch] = [] + for accel_arch in NVIDIA_ARCHITECTURES: + accel_substituted_modulefile = substituted_modulefile.replace(detected_accel_arch, accel_arch) + found = subprocess.run(["grep", "-q", accel_substituted_modulefile, substituted_spider_cache]).returncode == 0 + if found: + base_version_dict["gpu_arch"][arch].append(accel_arch) + else: + print(f"No module {accel_substituted_modulefile}...not adding software for architecture {arch}/{accel_arch}") + continue else: - print(f"No module {substituted_modulefile}...not adding software for archtecture {arch}") + print(f"No module {substituted_modulefile}...not adding software for architecture {arch}") continue - # TODO: Handle GPU arch later, but it is going to need to be a dict as we will filter on cpu arch - base_version_dict["gpu_arch"] = {} - # Now we can cycle throught the possibilities # - software 
application itself software = {} From cffc123fa0b617a95cbeeef9d8053aa92d41ba06 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 24 Feb 2026 12:45:48 +0100 Subject: [PATCH 2/2] quote strings in CI --- .github/workflows/docs.yml | 2 +- .github/workflows/prs.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 8196839..01a0e27 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -36,7 +36,7 @@ jobs: # Set CPU override for the stack to query export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="x86_64/intel/icelake" # Set GPU overrides for the stack to query (we have no GPU, so also override the GPU check) - export EESSI_ACCELERATOR_TARGET_OVERRIDE=accel/nvidia/cc90 + export EESSI_ACCELERATOR_TARGET_OVERRIDE="accel/nvidia/cc90" export EESSI_OVERRIDE_GPU_CHECK=1 # First do 2023.06 for EB 4 ( module load EESSI/2023.06 && module load EasyBuild/4 && module load EESSI-extend && python scripts/generate_data_files.py --eessi-version=2023.06 ) & diff --git a/.github/workflows/prs.yml b/.github/workflows/prs.yml index c6187ca..da0d590 100644 --- a/.github/workflows/prs.yml +++ b/.github/workflows/prs.yml @@ -31,7 +31,7 @@ jobs: # Set CPU override for the stack to query export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="x86_64/intel/icelake" # Set GPU overrides for the stack to query (we have no GPU, so also override the GPU check) - export EESSI_ACCELERATOR_TARGET_OVERRIDE=accel/nvidia/cc90 + export EESSI_ACCELERATOR_TARGET_OVERRIDE="accel/nvidia/cc90" export EESSI_OVERRIDE_GPU_CHECK=1 # Only do 2023.06 for EB 5 since this is just a test ( module load EESSI/2023.06 && module load EasyBuild/5 && module load EESSI-extend && python scripts/generate_data_files.py --eessi-version=2023.06 ) &