From 24a85ce1f984aa2e8367f9fb59a056a1052c4929 Mon Sep 17 00:00:00 2001 From: nnasiri Date: Tue, 28 Apr 2026 14:28:22 -0700 Subject: [PATCH 1/4] Added more fields to the schema of mlperf training summary results --- .../result_summarizer/result_summarizer.py | 288 +++++++----------- 1 file changed, 107 insertions(+), 181 deletions(-) diff --git a/mlperf_logging/result_summarizer/result_summarizer.py b/mlperf_logging/result_summarizer/result_summarizer.py index 1198236..9e3eb31 100644 --- a/mlperf_logging/result_summarizer/result_summarizer.py +++ b/mlperf_logging/result_summarizer/result_summarizer.py @@ -1,6 +1,9 @@ ''' Summarizes a set of results. ''' + +from __future__ import print_function + import argparse import glob import json @@ -10,15 +13,12 @@ import itertools import pandas as pd import yaml -import hashlib -import math -import operator -import uuid as uuidlib +import numpy as np from ..compliance_checker import mlp_compliance from ..compliance_checker.mlp_compliance import usage_choices, rule_choices from ..compliance_checker.mlp_parser import parse_file - +from ..rcp_checker import rcp_checker from ..benchmark_meta import get_allowed_benchmarks, get_result_file_counts @@ -262,6 +262,14 @@ def _get_weak_scaling_metric_schema(): 'time_to_train_all': float, } +def _get_strong_scaling_metric_schema(): + return { + 'time_to_train': float, + 'Energy': float, + 'GBS': float, + 'epochs': float, + 'RCP': str, + } def _get_empty_summary(usage, ruleset, weak_scaling=False): return Summary( @@ -281,7 +289,6 @@ def _get_column_schema(usage, ruleset, weak_scaling=False): 'accelerators_count': int, 'framework': str, 'notes': str, - 'private_id': str } if weak_scaling == True: benchmarks = get_allowed_benchmarks(usage, ruleset) @@ -289,10 +296,14 @@ def _get_column_schema(usage, ruleset, weak_scaling=False): for metric, dtype in _get_weak_scaling_metric_schema().items(): schema['{}:{}'.format(benchmark, metric)] = dtype else: - schema.update( - {b: float - for b in get_allowed_benchmarks(usage, ruleset)}) - schema.update({'details_url': str, 'code_url': str}) + #schema.update( + # {b: float + # for b in get_allowed_benchmarks(usage, ruleset)}) + benchmarks = get_allowed_benchmarks(usage, ruleset) + for benchmark in benchmarks: + for metric, dtype in _get_strong_scaling_metric_schema().items(): + schema['{}:{}'.format(benchmark, metric)] = dtype + schema.update({'notes': str, 'details_url': str, 'code_url': str}) return schema @@ -338,6 +349,8 @@ def _compute_strong_score_standalone( ): pattern = "{folder}/result_*.txt".format(folder=benchmark_folder) result_files = glob.glob(pattern, recursive=True) + benchmark_scores = {} + benchmark_power_scores = {} scores = [] scores_track = {} power_scores = [] @@ -474,7 +487,7 @@ def _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folde -def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset): +def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division, rcp_bypass=False): # Collect scores for benchmarks. benchmark_scores = {} benchmark_power_scores = {} @@ -489,16 +502,82 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset): has_power = _has_power(benchmark_folder) benchmark = _benchmark_alias(folder_parts[-1]) system = folder_parts[-3] if usage == 'hpc' else folder_parts[-2] - # Read scores from result files. - score, power_score = _compute_strong_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc) + # Compute base perf/power scores + score, power_score = _compute_strong_score_standalone( + benchmark, system, has_power, benchmark_folder, usage, ruleset, desc + ) + + # RCP/GBS/Epochs additions for closed division + benchmark_gbs = None + benchmark_epochs = None + benchmark_rcp = None + if division == 'closed': + pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder) + result_files = glob.glob(pattern, recursive=True) + try: + # RCP check + verbose = False + bert_train_samples = False + rcp_pass, rcp_msg, rcp_norm_factor = rcp_checker.check_directory( + benchmark_folder, + usage, + ruleset, + verbose, + bert_train_samples, + rcp_file=None, + rcp_pass='pruned_rcps', + rcp_bypass=rcp_bypass, + set_scaling=True, + ) + if not rcp_pass: + print( + 'ERROR: RCP Test Failed on {}/{}/{} with message: {}.'.format( + desc['submitter'], system, benchmark, rcp_msg + ) + ) + if rcp_msg == 'RCP found': + benchmark_rcp = 'Fail' + elif rcp_msg == 'RCP Interpolation': + benchmark_rcp = 'Interp. Fail' + elif 'Missing' in rcp_msg: + benchmark_rcp = 'Missing' + elif rcp_msg == 'Cannot find any RCPs': + benchmark_rcp = 'No RCP' + else: + benchmark_rcp = 'Unknown state' + else: + benchmark_rcp = 'Pass' + + # GBS and epochs + benchmark_gbs, subm_epochs, _ = rcp_checker.get_submission_epochs( + result_files, ruleset, bert_train_samples=False + ) + subm_epochs.sort() + samples_rejected = 4 if benchmark == 'unet3d' else 1 + if len(subm_epochs) >= 2 * samples_rejected + 1: + benchmark_epochs = float( + np.mean( + subm_epochs[ + samples_rejected : len(subm_epochs) - samples_rejected + ] + ) + ) + except Exception as e: + print(f"WARNING: RCP/GBS computation failed for {benchmark_folder}: {e}") + + # Map into metric-suffixed keys for schema if score is not None: - benchmark_scores[benchmark] = score + benchmark_scores[f"{benchmark}:time_to_train"] = score + if benchmark_gbs is not None: + benchmark_scores[f"{benchmark}:GBS"] = float(benchmark_gbs) + if benchmark_epochs is not None: + benchmark_scores[f"{benchmark}:epochs"] = float(benchmark_epochs) + if benchmark_rcp is not None: + benchmark_scores[f"{benchmark}:RCP"] = benchmark_rcp if power_score is not None: - benchmark_power_scores[benchmark] = power_score + benchmark_scores[f"{benchmark}:Energy"] = power_score _fill_empty_benchmark_scores(benchmark_scores, usage, ruleset) - if len(benchmark_power_scores) > 0: - _fill_empty_benchmark_scores(benchmark_power_scores, usage, ruleset) - return benchmark_scores, benchmark_power_scores + return benchmark_scores, {} def _compute_weak_scaling_scores(desc, system_folder, usage, ruleset): @@ -674,19 +753,6 @@ def _load_system_desc(folder, system): raise FileNotFoundError('ERROR: Missing {}'.format(system_file)) return _read_json_file(system_file) -def _get_id_file(folder, system): - systems_folder = os.path.join(folder, 'results/',system) - system_file = os.path.join(systems_folder, 'privateid.json'.format(system)) - if not os.path.exists(system_file): - return {} - return _read_json_file(system_file) - -def _update_id_file(folder, system, id): - systems_folder = os.path.join(folder, 'results/',system) - system_file = os.path.join(systems_folder, 'privateid.json'.format(system)) - id = {'private_id': id} - with open(system_file, 'w') as f: - json.dump(id, f, indent=4) def _fill_empty_benchmark_scores( benchmark_scores, @@ -702,130 +768,10 @@ def _fill_empty_benchmark_scores( benchmark_scores[k] = None else: - if benchmark not in benchmark_scores: - benchmark_scores[benchmark] = None - - -def _get_id_from_sysinfo(summary): - """Generate private id from system information. - - Args: - summary (dictionary): Sysinfo Dictionary - """ - - - # Code from humanhash3, which is public domain. - DEFAULT_WORDLIST = ( - 'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april', - 'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn', - 'avocado', 'bacon', 'bakerloo', 'batman', 'beer', 'berlin', 'beryllium', - 'black', 'blossom', 'blue', 'bluebird', 'bravo', 'bulldog', 'burger', - 'butter', 'california', 'carbon', 'cardinal', 'carolina', 'carpet', 'cat', - 'ceiling', 'charlie', 'chicken', 'coffee', 'cola', 'cold', 'colorado', - 'comet', 'connecticut', 'crazy', 'cup', 'dakota', 'december', 'delaware', - 'delta', 'diet', 'don', 'double', 'early', 'earth', 'east', 'echo', - 'edward', 'eight', 'eighteen', 'eleven', 'emma', 'enemy', 'equal', - 'failed', 'fanta', 'fifteen', 'fillet', 'finch', 'fish', 'five', 'fix', - 'floor', 'florida', 'football', 'four', 'fourteen', 'foxtrot', 'freddie', - 'friend', 'fruit', 'gee', 'georgia', 'glucose', 'golf', 'green', 'grey', - 'hamper', 'happy', 'harry', 'hawaii', 'helium', 'high', 'hot', 'hotel', - 'hydrogen', 'idaho', 'illinois', 'india', 'indigo', 'ink', 'iowa', - 'island', 'item', 'jersey', 'jig', 'johnny', 'juliet', 'july', 'jupiter', - 'kansas', 'kentucky', 'kilo', 'king', 'kitten', 'lactose', 'lake', 'lamp', - 'lemon', 'leopard', 'lima', 'lion', 'lithium', 'london', 'louisiana', - 'low', 'magazine', 'magnesium', 'maine', 'mango', 'march', 'mars', - 'maryland', 'massachusetts', 'may', 'mexico', 'michigan', 'mike', - 'minnesota', 'mirror', 'mississippi', 'missouri', 'mobile', 'mockingbird', - 'monkey', 'montana', 'moon', 'mountain', 'muppet', 'music', 'nebraska', - 'neptune', 'network', 'nevada', 'nine', 'nineteen', 'nitrogen', 'north', - 'november', 'nuts', 'october', 'ohio', 'oklahoma', 'one', 'orange', - 'oranges', 'oregon', 'oscar', 'oven', 'oxygen', 'papa', 'paris', 'pasta', - 'pennsylvania', 'pip', 'pizza', 'pluto', 'potato', 'princess', 'purple', - 'quebec', 'queen', 'quiet', 'red', 'river', 'robert', 'robin', 'romeo', - 'rugby', 'sad', 'salami', 'saturn', 'september', 'seven', 'seventeen', - 'shade', 'sierra', 'single', 'sink', 'six', 'sixteen', 'skylark', 'snake', - 'social', 'sodium', 'solar', 'south', 'spaghetti', 'speaker', 'spring', - 'stairway', 'steak', 'stream', 'summer', 'sweet', 'table', 'tango', 'ten', - 'tennessee', 'tennis', 'texas', 'thirteen', 'three', 'timing', 'triple', - 'twelve', 'twenty', 'two', 'uncle', 'undress', 'uniform', 'uranus', 'utah', - 'vegan', 'venus', 'vermont', 'victor', 'video', 'violet', 'virginia', - 'washington', 'west', 'whiskey', 'white', 'william', 'winner', 'winter', - 'wisconsin', 'wolfram', 'wyoming', 'xray', 'yankee', 'yellow', 'zebra', - 'zulu') - - class HumanHasher(object): - - def __init__(self, wordlist=DEFAULT_WORDLIST): - self.wordlist = wordlist - - def humanize_list(self, hexdigest, words=4): - # Gets a list of byte values between 0-255. - bytes_ = map(lambda x: int(x, 16), - map(''.join, zip(hexdigest[::2], hexdigest[1::2]))) - # Compress an arbitrary number of bytes to `words`. - compressed = self.compress(bytes_, words) - - return [str(self.wordlist[byte]) for byte in compressed] - - def humanize(self, hexdigest, words=4, separator='-'): - # Map the compressed byte values through the word list. - return separator.join(self.humanize_list(hexdigest, words)) - - @staticmethod - def compress(bytes_, target): - bytes_list = list(bytes_) - - length = len(bytes_list) - # If there are less than the target number bytes, return input bytes - if target >= length: - return bytes_ - - # Split `bytes` evenly into `target` segments - # Each segment hashes `seg_size` bytes, rounded down for some - seg_size = float(length) / float(target) - # Initialize `target` number of segments - segments = [0] * target - seg_num = 0 - - # Use a simple XOR checksum-like function for compression - for i, byte in enumerate(bytes_list): - # Divide the byte index by the segment size to assign its segment - # Floor to create a valid segment index - # Min to ensure the index is within `target` - seg_num = min(int(math.floor(i / seg_size)), target-1) - # Apply XOR to the existing segment and the byte - segments[seg_num] = operator.xor(segments[seg_num], byte) - - return segments - - def uuid(self, **params): - digest = str(uuidlib.uuid4()).replace('-', '') - return self.humanize(digest, **params), digest - - - - def get_hash(row): - columns_for_hashing = [ - 'division', - 'submitter', - 'system_name', - 'number_of_nodes', - 'host_processor_model_name', - 'host_processors_per_node', - 'accelerator_model_name', - 'accelerators_per_node', - 'framework' - ] - to_hash = ''.join(str(row[c]) for c in columns_for_hashing) - return hashlib.sha256(to_hash.encode('utf-8')).hexdigest() - - hash = get_hash(summary) - humanhasha = HumanHasher() - summary = humanhasha.humanize(hash) - - return summary - - + for metric in _get_strong_scaling_metric_schema().keys(): + k = '{}:{}'.format(benchmark, metric) + if k not in benchmark_scores: + benchmark_scores[k] = None def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs): @@ -843,22 +789,13 @@ def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs): weak_scaling=True) power_summary = _get_empty_summary(usage, ruleset) power_weak_scaling_summary = _get_empty_summary(usage, ruleset, weak_scaling=True) + for system_folder in _get_sub_folders(results_folder): folder_parts = system_folder.split('/') system = folder_parts[-1] # Load corresponding system description. try: desc = _load_system_desc(folder, system) - id = _get_id_file(folder, system) - # Generate private id and update system desc to match - if kwargs.get('generate_private_ids') and 'private_id' not in id: - id['private_id'] = _get_id_from_sysinfo(desc) - _update_id_file(folder, system, desc['private_id']) - elif 'private_id' not in id: - # Ensure private_id field exists in desc for consistent processing later, even if it's empty - id['private_id'] = '' - desc['private_id'] = id['private_id'] - except (json.JSONDecodeError, FileNotFoundError) as e: print(e) continue @@ -875,7 +812,6 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): # Construct prefix portion of the row. try: _check_and_update_system_specs('division', 'division') - _check_and_update_system_specs('private_id', 'private_id') # Map availability if requested if "availability" in kwargs: _check_and_update_system_specs('status', 'availability', lambda desc: _map_availability(desc["status"], kwargs["availability"])) @@ -925,7 +861,7 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): # Compute the scores. strong_scaling_scores, power_scores = _compute_strong_scaling_scores( - desc, system_folder, usage, ruleset) + desc, system_folder, usage, ruleset, system_specs["division"], rcp_bypass=False) if usage == 'hpc': weak_scaling_scores, power_scores_weak_scaling = _compute_weak_scaling_scores( desc, system_folder, usage, ruleset) @@ -984,8 +920,6 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): return strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary - - def get_parser(): parser = argparse.ArgumentParser( prog='mlperf_logging.result_summarizer', @@ -1006,11 +940,6 @@ def get_parser(): type=str, choices=rule_choices(), help='the ruleset such as 0.6.0, 0.7.0, or 1.0.0') - - parser.add_argument('--generate_private_ids', - action='store_true', - help='Generate private IDs for each run.') - parser.add_argument('--werror', action='store_true', help='Treat warnings as errors') @@ -1028,7 +957,6 @@ def get_parser(): '--xlsx', type=str, help='Exports a xlsx of the results to the path specified') - return parser @@ -1051,15 +979,13 @@ def _update_summaries(folder): folder, args.usage, args.ruleset, - availability = config["availability"], - generate_private_ids = args.generate_private_ids, + availability = config["availability"] ) else: strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary = summarize_results( folder, args.usage, args.ruleset, - generate_private_ids = args.generate_private_ids, ) strong_scaling_summaries.append(strong_scaling_summary) if len(weak_scaling_summary) > 0: @@ -1178,7 +1104,7 @@ def _summaries_to_xlsx(summaries: pd.DataFrame, path, version): start += len(section) index += len(section) - writer.save() + writer.close() # Print and write back results. def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): if len(summaries) > 0: @@ -1199,7 +1125,7 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): # Sort rows by their values summaries = summaries.sort_values(by=cols) - + print(summaries) if args.csv is not None: csv = args.csv assert csv.endswith(".csv") @@ -1229,4 +1155,4 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): if __name__ == '__main__': - main() + main() \ No newline at end of file From 2ebc543c6b41467eefbcf5abfc835b4fc5bb0532 Mon Sep 17 00:00:00 2001 From: nnasiri Date: Tue, 28 Apr 2026 14:58:43 -0700 Subject: [PATCH 2/4] Added more fields to the schema of mlperf training summary results --- .../result_summarizer/result_summarizer.py | 229 +++++++++++++++--- 1 file changed, 198 insertions(+), 31 deletions(-) diff --git a/mlperf_logging/result_summarizer/result_summarizer.py b/mlperf_logging/result_summarizer/result_summarizer.py index 9e3eb31..9636911 100644 --- a/mlperf_logging/result_summarizer/result_summarizer.py +++ b/mlperf_logging/result_summarizer/result_summarizer.py @@ -1,23 +1,26 @@ ''' Summarizes a set of results. ''' - -from __future__ import print_function - import argparse import glob import json import os import re import sys +import traceback import itertools import pandas as pd import yaml import numpy as np +import hashlib +import math +import operator +import uuid as uuidlib from ..compliance_checker import mlp_compliance from ..compliance_checker.mlp_compliance import usage_choices, rule_choices from ..compliance_checker.mlp_parser import parse_file + from ..rcp_checker import rcp_checker from ..benchmark_meta import get_allowed_benchmarks, get_result_file_counts @@ -262,6 +265,7 @@ def _get_weak_scaling_metric_schema(): 'time_to_train_all': float, } + def _get_strong_scaling_metric_schema(): return { 'time_to_train': float, @@ -269,8 +273,10 @@ def _get_strong_scaling_metric_schema(): 'GBS': float, 'epochs': float, 'RCP': str, + 'rcp_scaling_factor': float, } + def _get_empty_summary(usage, ruleset, weak_scaling=False): return Summary( _get_column_schema(usage, ruleset, weak_scaling=weak_scaling).keys()) @@ -289,6 +295,7 @@ def _get_column_schema(usage, ruleset, weak_scaling=False): 'accelerators_count': int, 'framework': str, 'notes': str, + 'private_id': str } if weak_scaling == True: benchmarks = get_allowed_benchmarks(usage, ruleset) @@ -296,14 +303,11 @@ def _get_column_schema(usage, ruleset, weak_scaling=False): for metric, dtype in _get_weak_scaling_metric_schema().items(): schema['{}:{}'.format(benchmark, metric)] = dtype else: - #schema.update( - # {b: float - # for b in get_allowed_benchmarks(usage, ruleset)}) benchmarks = get_allowed_benchmarks(usage, ruleset) for benchmark in benchmarks: for metric, dtype in _get_strong_scaling_metric_schema().items(): schema['{}:{}'.format(benchmark, metric)] = dtype - schema.update({'notes': str, 'details_url': str, 'code_url': str}) + schema.update({'details_url': str, 'code_url': str}) return schema @@ -349,8 +353,6 @@ def _compute_strong_score_standalone( ): pattern = "{folder}/result_*.txt".format(folder=benchmark_folder) result_files = glob.glob(pattern, recursive=True) - benchmark_scores = {} - benchmark_power_scores = {} scores = [] scores_track = {} power_scores = [] @@ -417,8 +419,8 @@ def _compute_strong_score_standalone( power_score = olympic_avg power_score *= scaling_factor if return_full_scores: - return scores_track, power_scores_track, score, power_score - return score, power_score + return scores_track, power_scores_track, score, power_score, scaling_factor + return score, power_score, scaling_factor def _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc = {"submitter": None}): @@ -490,12 +492,10 @@ def _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folde def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division, rcp_bypass=False): # Collect scores for benchmarks. benchmark_scores = {} - benchmark_power_scores = {} - has_power = None benchmark_folder_parent = os.path.join( system_folder, 'strong') if usage == 'hpc' else system_folder if not os.path.isdir(benchmark_folder_parent): - return benchmark_scores, benchmark_power_scores + return benchmark_scores, {} for benchmark_folder in _get_sub_folders(benchmark_folder_parent): folder_parts = benchmark_folder.split('/') # Check if this benchmark has power results @@ -503,7 +503,7 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division benchmark = _benchmark_alias(folder_parts[-1]) system = folder_parts[-3] if usage == 'hpc' else folder_parts[-2] # Compute base perf/power scores - score, power_score = _compute_strong_score_standalone( + score, power_score, rcp_scaling_factor = _compute_strong_score_standalone( benchmark, system, has_power, benchmark_folder, usage, ruleset, desc ) @@ -518,7 +518,7 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division # RCP check verbose = False bert_train_samples = False - rcp_pass, rcp_msg, rcp_norm_factor = rcp_checker.check_directory( + rcp_pass, rcp_msg, _ = rcp_checker.check_directory( benchmark_folder, usage, ruleset, @@ -553,7 +553,7 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division result_files, ruleset, bert_train_samples=False ) subm_epochs.sort() - samples_rejected = 4 if benchmark == 'unet3d' else 1 + samples_rejected = 1 if len(subm_epochs) >= 2 * samples_rejected + 1: benchmark_epochs = float( np.mean( @@ -563,9 +563,15 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division ) ) except Exception as e: - print(f"WARNING: RCP/GBS computation failed for {benchmark_folder}: {e}") + print( + f"WARNING: RCP/GBS computation failed for {benchmark_folder}: {e}" + ) + traceback.print_exc() # Map into metric-suffixed keys for schema + benchmark_scores[f"{benchmark}:rcp_scaling_factor"] = float( + rcp_scaling_factor + ) if score is not None: benchmark_scores[f"{benchmark}:time_to_train"] = score if benchmark_gbs is not None: @@ -753,6 +759,19 @@ def _load_system_desc(folder, system): raise FileNotFoundError('ERROR: Missing {}'.format(system_file)) return _read_json_file(system_file) +def _get_id_file(folder, system): + systems_folder = os.path.join(folder, 'results/',system) + system_file = os.path.join(systems_folder, 'privateid.json'.format(system)) + if not os.path.exists(system_file): + return {} + return _read_json_file(system_file) + +def _update_id_file(folder, system, id): + systems_folder = os.path.join(folder, 'results/',system) + system_file = os.path.join(systems_folder, 'privateid.json'.format(system)) + id = {'private_id': id} + with open(system_file, 'w') as f: + json.dump(id, f, indent=4) def _fill_empty_benchmark_scores( benchmark_scores, @@ -768,10 +787,137 @@ def _fill_empty_benchmark_scores( benchmark_scores[k] = None else: - for metric in _get_strong_scaling_metric_schema().keys(): + strong_schema = _get_strong_scaling_metric_schema() + for metric, dtype in strong_schema.items(): k = '{}:{}'.format(benchmark, metric) - if k not in benchmark_scores: - benchmark_scores[k] = None + if dtype is str: + if k not in benchmark_scores or benchmark_scores[k] is None: + benchmark_scores[k] = '' + else: + if k not in benchmark_scores: + benchmark_scores[k] = None + + +def _get_id_from_sysinfo(summary): + """Generate private id from system information. + + Args: + summary (dictionary): Sysinfo Dictionary + """ + + + # Code from humanhash3, which is public domain. + DEFAULT_WORDLIST = ( + 'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april', + 'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn', + 'avocado', 'bacon', 'bakerloo', 'batman', 'beer', 'berlin', 'beryllium', + 'black', 'blossom', 'blue', 'bluebird', 'bravo', 'bulldog', 'burger', + 'butter', 'california', 'carbon', 'cardinal', 'carolina', 'carpet', 'cat', + 'ceiling', 'charlie', 'chicken', 'coffee', 'cola', 'cold', 'colorado', + 'comet', 'connecticut', 'crazy', 'cup', 'dakota', 'december', 'delaware', + 'delta', 'diet', 'don', 'double', 'early', 'earth', 'east', 'echo', + 'edward', 'eight', 'eighteen', 'eleven', 'emma', 'enemy', 'equal', + 'failed', 'fanta', 'fifteen', 'fillet', 'finch', 'fish', 'five', 'fix', + 'floor', 'florida', 'football', 'four', 'fourteen', 'foxtrot', 'freddie', + 'friend', 'fruit', 'gee', 'georgia', 'glucose', 'golf', 'green', 'grey', + 'hamper', 'happy', 'harry', 'hawaii', 'helium', 'high', 'hot', 'hotel', + 'hydrogen', 'idaho', 'illinois', 'india', 'indigo', 'ink', 'iowa', + 'island', 'item', 'jersey', 'jig', 'johnny', 'juliet', 'july', 'jupiter', + 'kansas', 'kentucky', 'kilo', 'king', 'kitten', 'lactose', 'lake', 'lamp', + 'lemon', 'leopard', 'lima', 'lion', 'lithium', 'london', 'louisiana', + 'low', 'magazine', 'magnesium', 'maine', 'mango', 'march', 'mars', + 'maryland', 'massachusetts', 'may', 'mexico', 'michigan', 'mike', + 'minnesota', 'mirror', 'mississippi', 'missouri', 'mobile', 'mockingbird', + 'monkey', 'montana', 'moon', 'mountain', 'muppet', 'music', 'nebraska', + 'neptune', 'network', 'nevada', 'nine', 'nineteen', 'nitrogen', 'north', + 'november', 'nuts', 'october', 'ohio', 'oklahoma', 'one', 'orange', + 'oranges', 'oregon', 'oscar', 'oven', 'oxygen', 'papa', 'paris', 'pasta', + 'pennsylvania', 'pip', 'pizza', 'pluto', 'potato', 'princess', 'purple', + 'quebec', 'queen', 'quiet', 'red', 'river', 'robert', 'robin', 'romeo', + 'rugby', 'sad', 'salami', 'saturn', 'september', 'seven', 'seventeen', + 'shade', 'sierra', 'single', 'sink', 'six', 'sixteen', 'skylark', 'snake', + 'social', 'sodium', 'solar', 'south', 'spaghetti', 'speaker', 'spring', + 'stairway', 'steak', 'stream', 'summer', 'sweet', 'table', 'tango', 'ten', + 'tennessee', 'tennis', 'texas', 'thirteen', 'three', 'timing', 'triple', + 'twelve', 'twenty', 'two', 'uncle', 'undress', 'uniform', 'uranus', 'utah', + 'vegan', 'venus', 'vermont', 'victor', 'video', 'violet', 'virginia', + 'washington', 'west', 'whiskey', 'white', 'william', 'winner', 'winter', + 'wisconsin', 'wolfram', 'wyoming', 'xray', 'yankee', 'yellow', 'zebra', + 'zulu') + + class HumanHasher(object): + + def __init__(self, wordlist=DEFAULT_WORDLIST): + self.wordlist = wordlist + + def humanize_list(self, hexdigest, words=4): + # Gets a list of byte values between 0-255. + bytes_ = map(lambda x: int(x, 16), + map(''.join, zip(hexdigest[::2], hexdigest[1::2]))) + # Compress an arbitrary number of bytes to `words`. + compressed = self.compress(bytes_, words) + + return [str(self.wordlist[byte]) for byte in compressed] + + def humanize(self, hexdigest, words=4, separator='-'): + # Map the compressed byte values through the word list. + return separator.join(self.humanize_list(hexdigest, words)) + + @staticmethod + def compress(bytes_, target): + bytes_list = list(bytes_) + + length = len(bytes_list) + # If there are less than the target number bytes, return input bytes + if target >= length: + return bytes_ + + # Split `bytes` evenly into `target` segments + # Each segment hashes `seg_size` bytes, rounded down for some + seg_size = float(length) / float(target) + # Initialize `target` number of segments + segments = [0] * target + seg_num = 0 + + # Use a simple XOR checksum-like function for compression + for i, byte in enumerate(bytes_list): + # Divide the byte index by the segment size to assign its segment + # Floor to create a valid segment index + # Min to ensure the index is within `target` + seg_num = min(int(math.floor(i / seg_size)), target-1) + # Apply XOR to the existing segment and the byte + segments[seg_num] = operator.xor(segments[seg_num], byte) + + return segments + + def uuid(self, **params): + digest = str(uuidlib.uuid4()).replace('-', '') + return self.humanize(digest, **params), digest + + + + def get_hash(row): + columns_for_hashing = [ + 'division', + 'submitter', + 'system_name', + 'number_of_nodes', + 'host_processor_model_name', + 'host_processors_per_node', + 'accelerator_model_name', + 'accelerators_per_node', + 'framework' + ] + to_hash = ''.join(str(row[c]) for c in columns_for_hashing) + return hashlib.sha256(to_hash.encode('utf-8')).hexdigest() + + hash = get_hash(summary) + humanhasha = HumanHasher() + summary = humanhasha.humanize(hash) + + return summary + + def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs): @@ -789,13 +935,22 @@ def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs): weak_scaling=True) power_summary = _get_empty_summary(usage, ruleset) power_weak_scaling_summary = _get_empty_summary(usage, ruleset, weak_scaling=True) - for system_folder in _get_sub_folders(results_folder): folder_parts = system_folder.split('/') system = folder_parts[-1] # Load corresponding system description. try: desc = _load_system_desc(folder, system) + id = _get_id_file(folder, system) + # Generate private id and update system desc to match + if kwargs.get('generate_private_ids') and 'private_id' not in id: + id['private_id'] = _get_id_from_sysinfo(desc) + _update_id_file(folder, system, desc['private_id']) + elif 'private_id' not in id: + # Ensure private_id field exists in desc for consistent processing later, even if it's empty + id['private_id'] = '' + desc['private_id'] = id['private_id'] + except (json.JSONDecodeError, FileNotFoundError) as e: print(e) continue @@ -812,6 +967,7 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): # Construct prefix portion of the row. try: _check_and_update_system_specs('division', 'division') + _check_and_update_system_specs('private_id', 'private_id') # Map availability if requested if "availability" in kwargs: _check_and_update_system_specs('status', 'availability', lambda desc: _map_availability(desc["status"], kwargs["availability"])) @@ -892,11 +1048,12 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): power_scores.items(), urls.items(), ): - power_summary.push(column_name, value) - if column_name in strong_scaling_scores: - power_summary.push(column_name, strong_scaling_scores[column_name]) - else: - power_summary.push(column_name, value) + merged = ( + strong_scaling_scores[column_name] + if column_name in strong_scaling_scores + else value + ) + power_summary.push(column_name, merged) if usage == 'hpc' and len(power_scores_weak_scaling) > 0: for column_name, value in itertools.chain( system_specs.items(), @@ -920,6 +1077,8 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): return strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary + + def get_parser(): parser = argparse.ArgumentParser( prog='mlperf_logging.result_summarizer', @@ -940,6 +1099,11 @@ def get_parser(): type=str, choices=rule_choices(), help='the ruleset such as 0.6.0, 0.7.0, or 1.0.0') + + parser.add_argument('--generate_private_ids', + action='store_true', + help='Generate private IDs for each run.') + parser.add_argument('--werror', action='store_true', help='Treat warnings as errors') @@ -957,6 +1121,7 @@ def get_parser(): '--xlsx', type=str, help='Exports a xlsx of the results to the path specified') + return parser @@ -979,13 +1144,15 @@ def _update_summaries(folder): folder, args.usage, args.ruleset, - availability = config["availability"] + availability = config["availability"], + generate_private_ids = args.generate_private_ids, ) else: strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary = summarize_results( folder, args.usage, args.ruleset, + generate_private_ids = args.generate_private_ids, ) strong_scaling_summaries.append(strong_scaling_summary) if len(weak_scaling_summary) > 0: @@ -1104,7 +1271,7 @@ def _summaries_to_xlsx(summaries: pd.DataFrame, path, version): start += len(section) index += len(section) - writer.close() + writer.save() # Print and write back results. def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): if len(summaries) > 0: @@ -1125,7 +1292,7 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): # Sort rows by their values summaries = summaries.sort_values(by=cols) - print(summaries) + if args.csv is not None: csv = args.csv assert csv.endswith(".csv") @@ -1155,4 +1322,4 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): if __name__ == '__main__': - main() \ No newline at end of file + main() From 12d733d0edc0e945c1861ae87f596c2ec183134b Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 6 May 2026 19:42:44 -0500 Subject: [PATCH 3/4] Output both summary and summary_detailed --- .../result_summarizer/result_summarizer.py | 104 +++++++++++------- 1 file changed, 63 insertions(+), 41 deletions(-) diff --git a/mlperf_logging/result_summarizer/result_summarizer.py b/mlperf_logging/result_summarizer/result_summarizer.py index 9636911..1947d2b 100644 --- a/mlperf_logging/result_summarizer/result_summarizer.py +++ b/mlperf_logging/result_summarizer/result_summarizer.py @@ -16,6 +16,7 @@ import math import operator import uuid as uuidlib +import copy from ..compliance_checker import mlp_compliance from ..compliance_checker.mlp_compliance import usage_choices, rule_choices @@ -277,12 +278,12 @@ def _get_strong_scaling_metric_schema(): } -def _get_empty_summary(usage, ruleset, weak_scaling=False): +def _get_empty_summary(usage, ruleset, weak_scaling=False, detailed=False): return Summary( - _get_column_schema(usage, ruleset, weak_scaling=weak_scaling).keys()) + _get_column_schema(usage, ruleset, weak_scaling=weak_scaling, detailed=detailed).keys()) -def _get_column_schema(usage, ruleset, weak_scaling=False): +def _get_column_schema(usage, ruleset, weak_scaling=False, detailed=False): schema = { 'division': str, 'availability': str, @@ -303,10 +304,17 @@ def _get_column_schema(usage, ruleset, weak_scaling=False): for metric, dtype in _get_weak_scaling_metric_schema().items(): schema['{}:{}'.format(benchmark, metric)] = dtype else: - benchmarks = get_allowed_benchmarks(usage, ruleset) - for benchmark in benchmarks: - for metric, dtype in _get_strong_scaling_metric_schema().items(): - schema['{}:{}'.format(benchmark, metric)] = dtype + if detailed: + benchmarks = get_allowed_benchmarks(usage, ruleset) + for benchmark in benchmarks: + for metric, dtype in _get_strong_scaling_metric_schema().items(): + schema['{}:{}'.format(benchmark, metric)] = dtype + else: + schema.update( + { + b: float for b in get_allowed_benchmarks(usage, ruleset) + } + ) schema.update({'details_url': str, 'code_url': str}) return schema @@ -492,6 +500,7 @@ def _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folde def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division, rcp_bypass=False): # Collect scores for benchmarks. benchmark_scores = {} + detailed_bechmark_scores = {} benchmark_folder_parent = os.path.join( system_folder, 'strong') if usage == 'hpc' else system_folder if not os.path.isdir(benchmark_folder_parent): @@ -569,21 +578,25 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division traceback.print_exc() # Map into metric-suffixed keys for schema - benchmark_scores[f"{benchmark}:rcp_scaling_factor"] = float( + detailed_bechmark_scores[f"{benchmark}:rcp_scaling_factor"] = float( rcp_scaling_factor ) if score is not None: - benchmark_scores[f"{benchmark}:time_to_train"] = score + detailed_bechmark_scores[f"{benchmark}:time_to_train"] = score if benchmark_gbs is not None: - benchmark_scores[f"{benchmark}:GBS"] = float(benchmark_gbs) + detailed_bechmark_scores[f"{benchmark}:GBS"] = float(benchmark_gbs) if benchmark_epochs is not None: - benchmark_scores[f"{benchmark}:epochs"] = float(benchmark_epochs) + detailed_bechmark_scores[f"{benchmark}:epochs"] = float(benchmark_epochs) if benchmark_rcp is not None: - benchmark_scores[f"{benchmark}:RCP"] = benchmark_rcp + detailed_bechmark_scores[f"{benchmark}:RCP"] = benchmark_rcp if power_score is not None: - benchmark_scores[f"{benchmark}:Energy"] = power_score - _fill_empty_benchmark_scores(benchmark_scores, usage, ruleset) - return benchmark_scores, {} + detailed_bechmark_scores[f"{benchmark}:Energy"] = power_score + benchmark_scores[f"{benchmark}"] = float( + rcp_scaling_factor + ) + _fill_empty_benchmark_scores(benchmark_scores, usage, ruleset, detailed=False) + _fill_empty_benchmark_scores(detailed_bechmark_scores, usage, ruleset, detailed=True) + return benchmark_scores, detailed_bechmark_scores def _compute_weak_scaling_scores(desc, system_folder, usage, ruleset): @@ -778,6 +791,7 @@ def _fill_empty_benchmark_scores( usage, ruleset, weak_scaling=False, + detailed=False, ): for benchmark in get_allowed_benchmarks(usage, ruleset): if weak_scaling: @@ -787,15 +801,19 @@ def _fill_empty_benchmark_scores( benchmark_scores[k] = None else: - strong_schema = _get_strong_scaling_metric_schema() - for metric, dtype in strong_schema.items(): - k = '{}:{}'.format(benchmark, metric) - if dtype is str: - if k not in benchmark_scores or benchmark_scores[k] is None: - benchmark_scores[k] = '' - else: - if k not in benchmark_scores: - benchmark_scores[k] = None + if detailed: + strong_schema = _get_strong_scaling_metric_schema() + for metric, dtype in strong_schema.items(): + k = '{}:{}'.format(benchmark, metric) + if dtype is str: + if k not in benchmark_scores or benchmark_scores[k] is None: + benchmark_scores[k] = '' + else: + if k not in benchmark_scores: + benchmark_scores[k] = None + else: + if benchmark not in benchmark_scores: + benchmark_scores[benchmark] = None def _get_id_from_sysinfo(summary): @@ -933,7 +951,7 @@ def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs): weak_scaling_summary = _get_empty_summary(usage, ruleset, weak_scaling=True) - power_summary = _get_empty_summary(usage, ruleset) + detailed_strong_scaling_summary = _get_empty_summary(usage, ruleset, detailed=True) power_weak_scaling_summary = _get_empty_summary(usage, ruleset, weak_scaling=True) for system_folder in _get_sub_folders(results_folder): folder_parts = system_folder.split('/') @@ -1016,7 +1034,7 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): continue # Compute the scores. - strong_scaling_scores, power_scores = _compute_strong_scaling_scores( + strong_scaling_scores, detailed_strong_scaling_scores = _compute_strong_scaling_scores( desc, system_folder, usage, ruleset, system_specs["division"], rcp_bypass=False) if usage == 'hpc': weak_scaling_scores, power_scores_weak_scaling = _compute_weak_scaling_scores( @@ -1042,18 +1060,18 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): urls.items(), ): weak_scaling_summary.push(column_name, value) - if len(power_scores) > 0: + if len(detailed_strong_scaling_scores) > 0: for column_name, value in itertools.chain( system_specs.items(), - power_scores.items(), + detailed_strong_scaling_scores.items(), urls.items(), ): merged = ( - strong_scaling_scores[column_name] - if column_name in strong_scaling_scores + detailed_strong_scaling_scores[column_name] + if column_name in detailed_strong_scaling_scores else value ) - power_summary.push(column_name, merged) + detailed_strong_scaling_summary.push(column_name, merged) if usage == 'hpc' and len(power_scores_weak_scaling) > 0: for column_name, value in itertools.chain( system_specs.items(), @@ -1068,13 +1086,13 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None): if len(weak_scaling_summary) > 0: weak_scaling_summary = weak_scaling_summary.to_dataframe().sort_values( _get_sort_by_column_names()).reset_index(drop=True) - if len(power_summary) > 0: - power_summary = power_summary.to_dataframe().sort_values( + if len(detailed_strong_scaling_summary) > 0: + detailed_strong_scaling_summary = detailed_strong_scaling_summary.to_dataframe().sort_values( _get_sort_by_column_names()).reset_index(drop=True) if len(power_weak_scaling_summary) > 0: power_weak_scaling_summary = power_weak_scaling_summary.to_dataframe().sort_values( _get_sort_by_column_names()).reset_index(drop=True) - return strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary + return strong_scaling_summary, weak_scaling_summary, detailed_strong_scaling_summary, power_weak_scaling_summary @@ -1132,7 +1150,7 @@ def main(): strong_scaling_summaries = [] weak_scaling_summaries = [] - power_summaries = [] + detailed_strong_scaling_summaries = [] power_weak_scaling_summaries = [] def _update_summaries(folder): @@ -1140,7 +1158,7 @@ def _update_summaries(folder): config_path = os.path.join(os.path.dirname(__file__), "config.yaml") with open(config_path, "r") as f: config = yaml.safe_load(f) - strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary = summarize_results( + strong_scaling_summary, weak_scaling_summary, detailed_strong_scaling_summary, power_weak_scaling_summary = summarize_results( folder, args.usage, args.ruleset, @@ -1148,7 +1166,7 @@ def _update_summaries(folder): generate_private_ids = args.generate_private_ids, ) else: - strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary = summarize_results( + strong_scaling_summary, weak_scaling_summary, detailed_strong_scaling_summary, power_weak_scaling_summary = summarize_results( folder, args.usage, args.ruleset, @@ -1157,8 +1175,8 @@ def _update_summaries(folder): strong_scaling_summaries.append(strong_scaling_summary) if len(weak_scaling_summary) > 0: weak_scaling_summaries.append(weak_scaling_summary) - if len(power_summary) > 0: - power_summaries.append(power_summary) + if len(detailed_strong_scaling_summary) > 0: + detailed_strong_scaling_summaries.append(detailed_strong_scaling_summary) if len(power_weak_scaling_summary) > 0: power_weak_scaling_summaries.append(power_weak_scaling_summary) @@ -1273,13 +1291,14 @@ def _summaries_to_xlsx(summaries: pd.DataFrame, path, version): writer.save() # Print and write back results. - def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): + def _print_and_write(summaries, weak_scaling=False, mode='w', power = False, detailed = False): if len(summaries) > 0: summaries = pd.concat(summaries).astype( _get_column_schema( args.usage, args.ruleset, weak_scaling=weak_scaling, + detailed=detailed ) ) if weak_scaling: @@ -1301,6 +1320,9 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): specs_and_notes = [c for c in summaries.columns if c not in benchmarks] csv = csv.replace(".csv", "_power.csv") summaries.groupby(specs_and_notes).apply(lambda x: agg_columns_fn(x, benchmarks)).to_csv(csv, mode=mode) + elif detailed: + csv = csv.replace(".csv", "_detailed.csv") + summaries.to_csv(csv, index=False, mode=mode) else: summaries.to_csv(csv, index=False, mode=mode) json_path = "summary.json" if args.csv is None else f"""{csv.replace(".csv", ".json")}""" @@ -1317,7 +1339,7 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False): None, 'display.max_colwidth', None): _print_and_write(strong_scaling_summaries) _print_and_write(weak_scaling_summaries, weak_scaling=True, mode='a') - _print_and_write(power_summaries, mode='a', power=True) + _print_and_write(detailed_strong_scaling_summaries, mode='a', detailed=True) _print_and_write(power_weak_scaling_summaries, weak_scaling=True, mode='a', power=True) From d9250f60e8af0e8bd994459384c06bcabc164531 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 7 May 2026 10:57:52 -0500 Subject: [PATCH 4/4] Rename column: epochs -> samples_to_converge --- mlperf_logging/result_summarizer/result_summarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/result_summarizer/result_summarizer.py b/mlperf_logging/result_summarizer/result_summarizer.py index 1947d2b..131d208 100644 --- a/mlperf_logging/result_summarizer/result_summarizer.py +++ b/mlperf_logging/result_summarizer/result_summarizer.py @@ -586,7 +586,7 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset, division if benchmark_gbs is not None: detailed_bechmark_scores[f"{benchmark}:GBS"] = float(benchmark_gbs) if benchmark_epochs is not None: - detailed_bechmark_scores[f"{benchmark}:epochs"] = float(benchmark_epochs) + detailed_bechmark_scores[f"{benchmark}:samples_to_converge"] = float(benchmark_epochs) if benchmark_rcp is not None: detailed_bechmark_scores[f"{benchmark}:RCP"] = benchmark_rcp if power_score is not None: