diff --git a/PyTorch/Deploying_OH/Benchmark.py b/PyTorch/Deploying_OH/Benchmark.py
new file mode 100644
index 0000000..19d6750
--- /dev/null
+++ b/PyTorch/Deploying_OH/Benchmark.py
@@ -0,0 +1,412 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+import unittest
+import subprocess
+import requests
+
# Turn off the loader's comparison hook for ordering test methods.
# NOTE(review): the test methods below carry numeric prefixes (test_0_...,
# test_1_...), presumably so they still run in the intended sequence --
# confirm against the loader's default ordering.
unittest.TestLoader.sortTestMethodsUsing = None
class RunCmd:
    """Run an external command and capture its exit status and stdout."""

    def run(self, cmd, env_vars=None):
        """Execute ``cmd`` without a shell and wait for it to finish.

        Args:
            cmd: Command as an argument list, or a single string that is
                split into arguments with ``shlex.split``.
            env_vars: Optional mapping of extra environment variables. They
                are layered on top of the current process environment, so the
                child still inherits ``PATH`` and every other existing
                setting.

        Returns:
            A ``(status, output)`` tuple: the process exit code and the raw
            bytes the process wrote to standard output.
        """
        import shlex

        if isinstance(cmd, str):
            cmd = shlex.split(cmd)

        # Print the command and environment variables for debugging
        print("Running command:", cmd)
        if env_vars:
            print("With environment variables:", env_vars)

        # Popen's ``env`` argument replaces the whole environment, so passing
        # only the overrides would hide everything inherited from the parent.
        # Merge the overrides into a copy of os.environ instead.
        child_env = None
        if env_vars:
            child_env = dict(os.environ)
            child_env.update({str(k): str(v) for k, v in env_vars.items()})

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=False,
            env=child_env,
        )
        # communicate() drains both pipes and waits for the process to exit.
        output, err = proc.communicate()
        if proc.returncode != 0 and err:
            # Surface the error stream so a failed run can be diagnosed.
            print(err.decode("utf-8", errors="replace"))

        return proc.returncode, output
+
class PerfUtility:
    """Load benchmark configurations from a JSON file and execute them.

    The JSON file is expected to hold one top-level section per device
    generation, keyed ``<class_name><GAUDI_VER>``. Each section is a list of
    dicts; this class reads the keys ``model``, ``num_cards``, ``input_len``,
    ``output_len``, ``bs``, ``run_cmd``, ``env_vars`` and ``ref_perf``.
    """

    def __init__(self, filename):
        # The text before the first "." is the prefix of the section name.
        self.class_name = filename.split(".")[0]
        self.filename = filename

    def load_input_data(self, model_name):
        """Return every configuration for ``model_name``.

        The section is selected with the ``GAUDI_VER`` environment variable,
        which defaults to ``"3"`` when unset or empty.

        Args:
            model_name: Value compared against each entry's ``"model"`` key.

        Returns:
            A list of matching configuration dicts (empty when none match).

        Raises:
            ValueError: The file does not contain valid JSON.
            KeyError: The file has no section for the selected version.
        """
        gaudi_version = os.getenv("GAUDI_VER") or "3"
        section = self.class_name + gaudi_version
        with open(self.filename, "r") as file:
            try:
                data = json.load(file)
            except ValueError:
                # json.JSONDecodeError is a ValueError; report which file was
                # bad and let the caller see the real parse error.
                print("json load failed: " + self.filename)
                raise
        if section not in data:
            raise KeyError(
                "section '" + section + "' not found in " + self.filename
            )
        return [item for item in data[section] if item["model"] == model_name]

    def model_test(self, data, perf_report):
        """Run one configuration and record its result in ``perf_report``.

        An entry whose ``ref_perf`` is the empty string is treated as a tensor
        measurement step: the command is run only when no stored measurement
        directory exists yet, and its ``hqt_output`` files are moved under
        ``HQT/``. Any other entry is a benchmark run: the output is written to
        a log file, parsed, and appended as a row to
        ``perf_report.perf_report_df``.

        Args:
            data: One configuration dict as returned by ``load_input_data``.
            perf_report: Report object providing ``dump_log_to_file``,
                ``parse_run_log``, ``report_level`` and ``perf_report_df``.

        Returns:
            The command's exit status; 0 when a measurement step was skipped
            because its output already exists; -1 when the command exited
            with 0 but no usable throughput could be read from its output.
        """
        import math
        import shutil

        # 0. Point the "hqt_output" link at the stored measurement, if any.
        src = "HQT" + os.sep + "hqt_output_" + data["model"] + "_" + data["num_cards"] + "c"
        dst = "hqt_output"
        if os.path.islink(dst):
            os.unlink(dst)
        hqt_output_exist = os.path.exists(src)
        if hqt_output_exist:
            os.symlink(src, dst)
        else:
            print("No Tensor measurement output, Need redo the Tensor Measurement")

        cmd = data["run_cmd"]

        if data["ref_perf"] == "":
            status = 0
            if not hqt_output_exist:
                # 0.1 Run the tensor measurement instruction
                status, _ = RunCmd().run(cmd, data.get("env_vars"))
                print(cmd)
                # 0.1.1 Move the generated hqt_output files into the HQT folder
                if os.path.exists(dst):
                    os.makedirs(src, exist_ok=True)
                    for file_name in os.listdir(dst):
                        shutil.move(os.path.join(dst, file_name), src)
                    os.rmdir(dst)
            return status

        # 1. Run the instruction
        status, raw_output = RunCmd().run(cmd, data.get("env_vars"))
        output = raw_output.decode("utf-8", errors="replace")

        # 2. Save and parse the run log
        filename = "_".join(
            [
                data["model"],
                data["input_len"],
                data["output_len"],
                data["num_cards"] + "c",
                "log.txt",
            ]
        )
        perf_report.dump_log_to_file(output, filename)
        throughput, _, max_mem_allocated, _ = perf_report.parse_run_log(output)

        # 3. Add a new row to the report
        try:
            perf_ratio = float(throughput) / float(data["ref_perf"])
        except (ValueError, ZeroDivisionError):
            # No usable throughput (or reference): keep the row so the failed
            # run is visible in the report, and flag the run as failed.
            print("Could not compute perf ratio for " + filename)
            perf_ratio = math.nan
            if status == 0:
                status = -1

        new_row = {
            "Model": data["model"],
            "#cards": data["num_cards"],
            "InputLen": data["input_len"],
            "OutputLen": data["output_len"],
            "BS": data["bs"],
            "ref_perf_number": data["ref_perf"],
            "perf_number": throughput,
            "perf_ratio": perf_ratio,
            "max_mem_allocated": max_mem_allocated,
        }
        if perf_report.report_level >= 3:
            # Detailed reports also carry the exact command line.
            new_row["cmd"] = data["run_cmd"]

        df_len = len(perf_report.perf_report_df)
        perf_report.perf_report_df.loc[df_len + 1] = new_row
        return status
+
+
class PerfReport:
    """Collect system and benchmark results and render them as an HTML report.

    Results are stored in a folder named ``<name>_<MM-DD_HH-MM>`` that is
    created when the object is constructed.
    """

    def __init__(self, name, report_level):
        """Create the result folder and initialise empty report sections.

        Args:
            name: Report name; used for the result folder and the HTML file.
            report_level: Integer detail level. A level of 3 or more adds the
                ``cmd`` column to the performance table.
        """
        import datetime

        self.name = name
        self.report_level = report_level
        self.env_vars_df = None
        self.system_info_df = None
        self.gaudi_info_df = None
        self.docker_ps = ""
        self.docker_ps_df = None
        self.perf_report_df = None

        dateinfo = datetime.datetime.now().strftime("%m-%d_%H-%M")
        self.result_folder_name = self.name + "_" + dateinfo
        os.makedirs(self.result_folder_name, exist_ok=True)

    def init_perf_report(self):
        """Reset ``perf_report_df`` to an empty table with the report columns."""
        import pandas as pd

        columns = [
            "Model",
            "#cards",
            "InputLen",
            "OutputLen",
            "BS",
            "ref_perf_number",
            "perf_number",
            "perf_ratio",
            "max_mem_allocated",
        ]
        # Use this instance's level rather than a module-level global so the
        # class also works when the file is imported instead of run directly.
        if self.report_level >= 3:
            columns.append("cmd")

        self.perf_report_df = pd.DataFrame([], columns=columns)

    def dump_log_to_file(self, output, filename):
        """Write ``output`` to ``filename`` in the result folder (overwrites)."""
        filepath = self.result_folder_name + os.sep + filename
        with open(filepath, "w") as fd:
            fd.write(output)

    def parse_run_log(self, log):
        """Extract the reported metrics from a run log.

        Each line is matched (case-sensitively) against the markers
        ``Throughput``, ``Memory``, ``Max`` and ``Graph``, in that order; the
        first marker found decides which metric the line belongs to. The value
        is the first whitespace-separated token after the line's first ``=``.
        Lines without a value are ignored, and when a metric appears more than
        once the last occurrence wins.

        Args:
            log: Full text of the run output.

        Returns:
            A tuple ``(throughput, mem_allocated, max_mem_allocated,
            graph_compile)`` of strings; a metric that was not found is ``""``.
        """
        markers = ("Throughput", "Memory", "Max", "Graph")
        values = dict.fromkeys(markers, "")
        for line in log.splitlines():
            marker = next((m for m in markers if m in line), None)
            if marker is None:
                continue
            pieces = line.split("=")
            if len(pieces) < 2:
                continue  # marker present but no "= value" part
            fields = pieces[1].split()
            if fields:
                values[marker] = fields[0]
        return tuple(values[m] for m in markers)

    def generate_perf_report(self):
        """Print the collected tables, write the HTML report and zip the folder.

        The HTML file is ``<result folder>/<name>.html``. The performance
        table is written in full; the system and Gaudi tables are limited to
        their first rows via ``DataFrame.head()``. A section whose table was
        never collected is written as a short placeholder.
        """
        import shutil

        print(" Example Name:" + self.name)
        print(" ### System Info###")
        print(self.system_info_df)
        print(" ### Gaudi Info###")
        print(self.gaudi_info_df)
        self.docker_ps_df = None
        print(" ### Performance Number###")
        print(self.perf_report_df)

        report_name = self.name + ".html"
        report_path = self.result_folder_name + os.sep + report_name

        # (title, table, limit to head()) for each report section.
        sections = [
            (" 1. Perf Numbers", self.perf_report_df, False),
            (" 2. System Info", self.system_info_df, True),
            (" 3. Gaudi Info", self.gaudi_info_df, True),
        ]
        parts = []
        for title, frame, head_only in sections:
            parts.append("\n\n<h2>" + title + "</h2>\n\n")
            if frame is None:
                parts.append("<p>Not available</p>")
            elif head_only:
                parts.append(frame.head().to_html())
            else:
                parts.append(frame.to_html())

        with open(report_path, "w") as hfile:
            hfile.write("".join(parts))

        print("\nReport File is : " + report_path)

        shutil.make_archive(self.result_folder_name, "zip", self.result_folder_name)
+
+
class OH_Benchmark(unittest.TestCase):
    """Benchmark suite: collect system information, then run each model.

    Every model test can be disabled by setting the matching ``skip_*``
    environment variable to ``1``. The suite reads the module-level names
    ``perf_report`` and ``DataJsonFileName``, which the script entry point
    defines before the tests are loaded.
    """

    skip_llama2_70b = int(os.environ.get("skip_llama2_70b", 0))
    skip_llama31_8b = int(os.environ.get("skip_llama31_8b", 0))
    skip_llama31_70b = int(os.environ.get("skip_llama31_70b", 0))
    skip_llama33_70b = int(os.environ.get("skip_llama33_70b", 0))
    skip_llama31_405b = int(os.environ.get("skip_llama31_405b", 0))

    def setUp(self):
        """Bind the shared report and unpack ``HQT.zip`` when ``./HQT`` is absent."""
        self.perf_report = perf_report
        self.ip = "http://0.0.0.0"
        self.datafile = DataJsonFileName
        self.classname = DataJsonFileName.split(".")[0]
        self.utils = PerfUtility(self.datafile)
        self.hostname = ""
        if not os.path.exists("./HQT") and os.path.exists("./HQT.zip"):
            import zipfile

            with zipfile.ZipFile("HQT.zip") as archive:
                archive.extractall("./")

    def tearDown(self):
        return

    def _run_model_benchmarks(self, model_name):
        """Run every configuration of ``model_name``; each must return 0.

        Each configuration runs in its own sub-test, so a failing or raising
        configuration is reported without preventing the remaining ones from
        running.
        """
        configs = self.utils.load_input_data(model_name)
        self.assertIsNotNone(configs)

        for config in configs:
            with self.subTest(
                model=model_name,
                num_cards=config.get("num_cards"),
                input_len=config.get("input_len"),
                output_len=config.get("output_len"),
            ):
                status = self.utils.model_test(config, self.perf_report)
                self.assertEqual(status, 0)

    def test_0_system(self):
        """Record host information and create the empty performance table."""
        import platform
        import socket

        import pandas as pd

        self.hostname = socket.gethostname()
        try:
            ip_addr = socket.gethostbyname(self.hostname)
        except OSError:
            # The host name may not resolve; the report is still useful.
            ip_addr = "unknown"

        system_info = platform.uname()
        rows = [
            ["hostname", self.hostname],
            ["ip", ip_addr],
            ["system", system_info.system],
            ["node", system_info.node],
            ["release", system_info.release],
            ["version", system_info.version],
            ["machine", system_info.machine],
            ["processor", system_info.processor],
        ]
        self.perf_report.system_info_df = pd.DataFrame(rows, columns=["info", "value"])

        self.perf_report.init_perf_report()

    def test_1_perfspect(self):
        """Download PerfSpect if needed, run its Gaudi report and load the result."""
        import socket

        import pandas as pd

        if not os.path.exists("./perfspect"):
            # Fixed command string; the shell is needed for the pipe.
            download = subprocess.run(
                "wget -qO- https://github.com/intel/PerfSpect/releases/latest/download/perfspect.tgz | tar xvz",
                stdout=subprocess.PIPE,
                shell=True,
            )
            if download.returncode != 0:
                print("PerfSpect download failed with status", download.returncode)

        cmd = "./perfspect/perfspect report --gaudi --output " + self.perf_report.result_folder_name
        RunCmd().run(cmd)

        hostname = socket.gethostname()
        xlsx_file = self.perf_report.result_folder_name + os.sep + hostname + ".xlsx"
        if os.path.exists(xlsx_file):
            print(xlsx_file)
            df = pd.read_excel(xlsx_file)
            print(df)
            self.perf_report.gaudi_info_df = df

    @unittest.skipIf(skip_llama2_70b == 1, "Skip over this routine")
    def test_2_llama2_70b(self):
        self._run_model_benchmarks("Llama2_70b")

    @unittest.skipIf(skip_llama31_8b == 1, "Skip over this routine")
    def test_3_llama3_1_8b(self):
        self._run_model_benchmarks("Llama3.1_8b")

    @unittest.skipIf(skip_llama31_70b == 1, "Skip over this routine")
    def test_4_llama3_1_70b(self):
        self._run_model_benchmarks("Llama3.1_70b")

    @unittest.skipIf(skip_llama33_70b == 1, "Skip over this routine")
    def test_5_llama3_3_70b(self):
        self._run_model_benchmarks("Llama3.3_70b")

    @unittest.skipIf(skip_llama31_405b == 1, "Skip over this routine")
    def test_6_llama3_1_405b(self):
        self._run_model_benchmarks("Llama3.1_405b")
+
if __name__ == "__main__":
    import sys

    # NOTE: report_level, DataJsonFileName and perf_report are deliberately
    # module-level names; the test case reads them as globals.
    report_level = 2  # low, medium, high; 3 or more adds the "cmd" column
    DataJsonFileName = "Gaudi.json"
    if not os.path.isfile(DataJsonFileName):
        print("Missing Gaudi.json file")
        # A missing configuration is a failure, so exit with a non-zero status.
        sys.exit(1)

    perf_report = PerfReport(DataJsonFileName, report_level)
    test_loader = unittest.TestLoader()
    suite = test_loader.loadTestsFromTestCase(OH_Benchmark)
    unittest.TextTestRunner(verbosity=3).run(suite)
    perf_report.generate_perf_report()
diff --git a/PyTorch/Deploying_OH/Dockerfile-1.22.0-ub24-oh-v1.18.1 b/PyTorch/Deploying_OH/Dockerfile-1.22.0-ub24-oh-v1.18.1
new file mode 100644
index 0000000..541e908
--- /dev/null
+++ b/PyTorch/Deploying_OH/Dockerfile-1.22.0-ub24-oh-v1.18.1
@@ -0,0 +1,40 @@
# Parameterize base image components
ARG DOCKER_URL=vault.habana.ai/gaudi-docker
ARG VERSION=1.22.0
ARG BASE_NAME=ubuntu24.04
ARG PT_VERSION=2.7.1
ARG REVISION=latest
ARG REPO_TYPE=habanalabs

FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-installer-${PT_VERSION}:${REVISION}

# Git refs to build from. OH_FORK_COMMIT is checked out in the optimum-habana
# clone below; DEEPSEEK_COMMIT selects the HabanaAI/DeepSpeed ref installed by
# pip (the argument name is kept as-is for existing --build-arg callers).
ARG OH_FORK_COMMIT=v1.18.1
ARG DEEPSEEK_COMMIT=1.22.0

ENV OMPI_MCA_btl_vader_single_copy_mechanism=none

# Remove the apt package lists in the same layer to keep the image smaller.
RUN apt-get update && \
    apt-get install -y gettext moreutils jq npm && \
    npm install n -g && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*

# The extras spec is quoted so the shell cannot treat the brackets as a glob.
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir --upgrade-strategy eager "optimum[habana]" && \
    python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/DeepSpeed.git@${DEEPSEEK_COMMIT}

WORKDIR /root

RUN git clone https://github.com/huggingface/optimum-habana && \
    cd optimum-habana && \
    git checkout ${OH_FORK_COMMIT}

WORKDIR /root/optimum-habana/examples/text-generation
RUN python3 -m pip install --no-cache-dir -r requirements.txt && \
    python3 -m pip install --no-cache-dir -r requirements_lm_eval.txt

COPY . .
# NOTE(review): this image targets 1.22.0 but installs the *_1-20 benchmark
# configuration and measurement archive -- confirm these are the intended files.
COPY Gaudi_1-20.json Gaudi.json
COPY HQT_1-20.zip HQT.zip

RUN python3 -m pip install --no-cache-dir -r requirements_bm.txt
diff --git a/PyTorch/Deploying_OH/Gaudi_1-19.json b/PyTorch/Deploying_OH/Gaudi_1-19.json
new file mode 100644
index 0000000..0a264de
--- /dev/null
+++ b/PyTorch/Deploying_OH/Gaudi_1-19.json
@@ -0,0 +1,510 @@
+{
+ "Gaudi3": [
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1750",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "4562"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "512",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6590"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "242",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 242 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "486"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "241",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 241 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2736"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1536",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1536 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "24364"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "768",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "18063"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "256",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2590"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "371",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 371 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "8335"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "2048",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "4562"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "450",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 450 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6278"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "223",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 223 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "449"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "175",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 175 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2796"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "4000",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "15377"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "768",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 600 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "16891"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "512",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1594"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "600",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 600 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "9467"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "2996",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2996 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "3306"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "460",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 460 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "4793"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "195",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 195 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "371"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "180",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 180 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2143"
+ }
+ ],
+ "Gaudi2": [
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1750",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2943"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "327",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 327 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "3312"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "95",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 95 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "316"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "159",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 159 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1755"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "2816",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2816 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "19875"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "512",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "14784"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "179",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 179 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2011"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "256",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6083"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1792",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1792 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2895"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "256",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "3816"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "142",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 142 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "316"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "139",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 139 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1648"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "4000",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "10012"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "600",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 600 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "12538"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "383",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 383 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1083"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "476",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 476 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6623"
+ }
+ ]
+}
diff --git a/PyTorch/Deploying_OH/Gaudi_1-20.json b/PyTorch/Deploying_OH/Gaudi_1-20.json
new file mode 100644
index 0000000..fc0f9a7
--- /dev/null
+++ b/PyTorch/Deploying_OH/Gaudi_1-20.json
@@ -0,0 +1,706 @@
+{
+ "Gaudi3": [
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-2-70b-hf --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1750",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "4853"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "512",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6835"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "242",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 242 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "506"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "241",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 241 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2859"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1536",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1536 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "25097"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "768",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "20425"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "256",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2765"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "256",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "9013"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "2048",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "5466"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "450",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 450 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6535"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "223",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 223 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "663"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "175",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 175 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2891"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "4000",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "18290"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "768",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "21138"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "512",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2273"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "600",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 600 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "10600"
+ },
+ {
+ "model": "Llama3.3_70b",
+ "num_cards": "8",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.3_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "3986",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 3986 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "16622"
+ },
+ {
+ "model": "Llama3.3_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "2048",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "24705"
+ },
+ {
+ "model": "Llama3.3_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "774",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 774 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1890"
+ },
+ {
+ "model": "Llama3.3_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "719",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 719 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "11043"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "2996",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2996 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "3488"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "460",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 460 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "4998"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "195",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 195 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "394"
+ },
+ {
+ "model": "Llama3.1_405b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "180",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 180 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2238"
+ }
+ ],
+ "Gaudi2": [
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-2-70b-hf --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1750",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2935"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "256",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "3963"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "95",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 95 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "318"
+ },
+ {
+ "model": "Llama2_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "159",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 159 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1767"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "2816",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2816 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "19907"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "512",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "14866"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "179",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 179 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "2099"
+ },
+ {
+ "model": "Llama3.1_8b",
+ "num_cards": "1",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "256",
+ "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6060"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "1792",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1792 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "3421"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "256",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "3827"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "142",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 142 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "462"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "2",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "139",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 139 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1665"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "",
+ "output_len": "",
+ "bs": "",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": ""
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "128",
+ "bs": "4000",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "10404"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "128",
+ "output_len": "2048",
+ "bs": "768",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "13639"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "128",
+ "bs": "383",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 383 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "1553"
+ },
+ {
+ "model": "Llama3.1_70b",
+ "num_cards": "8",
+ "input_len": "2048",
+ "output_len": "2048",
+ "bs": "476",
+ "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 476 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+ "env_vars": {
+ "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+ "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+ "TQDM_DISABLE": "1"
+ },
+ "ref_perf": "6715"
+ }
+ ]
+}
diff --git a/PyTorch/Deploying_OH/HQT_1-19.zip b/PyTorch/Deploying_OH/HQT_1-19.zip
new file mode 100644
index 0000000..cc81165
Binary files /dev/null and b/PyTorch/Deploying_OH/HQT_1-19.zip differ
diff --git a/PyTorch/Deploying_OH/HQT_1-20.zip b/PyTorch/Deploying_OH/HQT_1-20.zip
new file mode 100644
index 0000000..b7bd0d8
Binary files /dev/null and b/PyTorch/Deploying_OH/HQT_1-20.zip differ
diff --git a/PyTorch/Deploying_OH/README.md b/PyTorch/Deploying_OH/README.md
new file mode 100644
index 0000000..3960f17
--- /dev/null
+++ b/PyTorch/Deploying_OH/README.md
@@ -0,0 +1,68 @@
+# TODO - NEEDS REWRITE
+
+
+Rough Notes:
+BUILD_ARGS="--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy"
+IMAGE_TAG=oh-textgen-1.22.0-ub24-oh-v1.18.1-deepseek-1.22.0
+docker build $BUILD_ARGS -t $IMAGE_TAG -f Dockerfile-1.22.0-ub24-oh-v1.18.1 .
+
+# OLD CONTENT BELOW
+# Benchmarking Hugging Face Pipelines with fp8 on Intel® Gaudi® AI Processor
+This section contains an example of how to quantize Hugging Face models from fp32 to fp8 with Intel Gaudi and the Optimum for Intel Gaudi (aka Optimum Habana) library. An easy-to-use benchmarking Python script with a related Dockerfile is also provided. Hugging Face pipelines take advantage of the Hugging Face Tasks in transformer models, such as text generation, translation, question answering and more. You can read more about Hugging Face pipelines on their main page [here](https://huggingface.co/docs/transformers/main_classes/pipelines).
+
+A Jupyter notebook with fp8 instructions and a Benchmark.py script for easy benchmarking are provided.
+For learning purposes, the Jupyter notebook also has instructions for getting started on bare metal.
+For Gaudi benchmarking purposes, the Benchmark.py script will run Llama2 70b, Llama3.1 8b, Llama3.1 70b, and Llama3.1 405b inside Docker and generate a report with a performance comparison against the published numbers in [Gaudi Model Performance](https://www.intel.com/content/www/us/en/developer/platform/gaudi/model-performance.html).
+
+## Requirements
+Please make sure to follow [Driver Installation](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html) to install the Gaudi driver on the system.
+### Jupyter Notebook
+Please follow the [README](https://github.com/intel-ai-tce/Gaudi-tutorials/blob/OH_benchmark/PyTorch/Hugging_Face_pipelines/README.md) to set up the environment for the Jupyter notebook.
+### Benchmark python scripts
+
+To use the Dockerfile provided for the sample, please follow [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html) to set up the Habana runtime for Docker images.
+#### Docker Build
+To build the optimum-habana-text-gen image from the Dockerfile, use the command below.
+```bash
+docker build --no-cache -t optimum-habana-text-gen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+#### Docker Run
+After the Docker build, use the command below to start a Docker instance; you will be placed in the text-generation folder inside the container.
+```bash
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=ALL --privileged=true --net=host --ipc=host optimum-habana-text-gen:latest
+```
+> [!NOTE]
+> The Hugging Face model files might be large, so we recommend using an external disk as the Hugging Face hub folder. \
+> Please set the HF_HOME environment variable to a path on your external disk, and then mount that disk into the Docker instance. \
+> ex: "-e HF_HOME=/mnt/huggingface -v /mnt:/mnt"
+
+## How to run Benchmark scripts
+The benchmark script runs all the models with different input lengths, output lengths and batch sizes, and generates a report comparing the results against the published numbers in [Gaudi Model Performance](https://www.intel.com/content/www/us/en/developer/platform/gaudi/model-performance.html).
+### Gaudi3
+Different JSON files are provided for different Gaudi software versions, such as 1.19 and 1.20, on Gaudi3.
+To run the benchmark on a machine with 8 Gaudi3 cards, run the command below inside the Docker instance.
+```bash
+python3 Benchmark.py
+```
+### Gaudi2
+To run the benchmark on a machine with 8 Gaudi2 cards, run the command below inside the Docker instance instead.
+```bash
+GAUDI_VER=2 python3 Benchmark.py
+```
+
+### Skip Tests
+To skip the tests for a given model, set the related environment variable to 1.
+For example, skip the Llama3.1 405B model test with the following command.
+```bash
+skip_llama31_405b=1 python3 Benchmark.py
+```
+Here are all the supported environment variables for skipping tests:
+skip_llama2_70b, skip_llama31_8b, skip_llama31_70b, skip_llama33_70b, skip_llama31_405b
+
+### HTML Report
+An HTML report will be generated under a folder with a timestamp, and the HTML report will look like the diagram below.
+> NOTE: There is also a [PerfSpect](https://github.com/intel/PerfSpect) Report for detailed system and Gaudi information.
+
+
+
+
diff --git a/PyTorch/Deploying_OH/requirements_bm.txt b/PyTorch/Deploying_OH/requirements_bm.txt
new file mode 100644
index 0000000..5e806c0
--- /dev/null
+++ b/PyTorch/Deploying_OH/requirements_bm.txt
@@ -0,0 +1,6 @@
+openpyxl
+datasets
+pandas
+sentencepiece
+transformers
+matplotlib
diff --git a/PyTorch/Deploying_OH/run_text_generation_with_fp8.ipynb b/PyTorch/Deploying_OH/run_text_generation_with_fp8.ipynb
new file mode 100644
index 0000000..8ce438a
--- /dev/null
+++ b/PyTorch/Deploying_OH/run_text_generation_with_fp8.ipynb
@@ -0,0 +1,208 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "12fb7581-a09a-404f-a5ac-5a77996eafea",
+ "metadata": {},
+ "source": [
+ "Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.\n",
+ "SPDX-License-Identifier: Apache-2.0\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0157485d-f705-405f-9997-bdcadbc85218",
+ "metadata": {},
+ "source": [
+ "### Running Hugging Face with FP8 on Intel® Gaudi® - Text Generation\n",
+ "\n",
+ "This example shows how to quantize Hugging Face models from fp32 to fp8 with Intel Gaudi and the Optimum for Intel Gaudi (aka Optimum Habana) library.\n",
+ "\n",
+ "Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B, phi-2 and Llama3-405B in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. From synapse 1.17 / optimum-habana 1.13 release, INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`.\n",
+ "\n",
+ "More information on enabling fp8 in SynapseAI is available here:\n",
+ "https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f35f62c6-a93f-4d11-bff5-12d38b0f4b8b",
+ "metadata": {},
+ "source": [
+ "#### Install the Hugging Face Optimum Habana Library"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "43d1f096-17ee-451e-a356-1e5b2162a83d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#%cd ~/Gaudi-tutorials/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8\n",
+ "%pip install optimum-habana==1.16.0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a257d27b-c742-4290-a477-a0d48f5797bb",
+ "metadata": {},
+ "source": [
+ "#### Download the Hugging Face Optimum Habana"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f8f0405-70ee-4225-acf4-91340678c03c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!git clone -b v1.16.0 https://github.com/huggingface/optimum-habana.git;cd optimum-habana/examples/text-generation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "60e181d2-fa9e-4193-90ca-1ca5d6668101",
+ "metadata": {},
+ "source": [
+ "#### Install Required packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "27f25b4d-47a0-42d7-af30-4216f16d2ab2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -r requirements.txt;pip install -r requirements_lm_eval.txt;pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6d5c7a14-3d38-4258-a8de-339bde866b7e",
+ "metadata": {},
+ "source": [
+ "#### Measure the tensor quantization statistics \n",
+ "Here is an example to measure the tensor quantization statistics on Llama3.1-8B with 1 card:  \n",
+ "By changing model_name_or_path, a different llama model could be applied. \n",
+ "By changing world_size, multiple gaudi cards could be used for measurement. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "afb90147-891d-4ad0-b288-40e1c6673946",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py:252: UserWarning: Device capability of hccl unspecified, assuming `cpu` and `cuda`. Please specify it via the `devices` argument of `register_backend`.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "!HF_DATASETS_TRUST_REMOTE_CODE=true QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py \\\n",
+ "--use_deepspeed --world_size 1 run_lm_eval.py \\\n",
+ "-o acc_llama3_8b_bs1_quant.txt \\\n",
+ "--model_name_or_path meta-llama/Llama-3.1-8B-Instruct \\\n",
+ "--warmup 0 \\\n",
+ "--use_hpu_graphs \\\n",
+ "--use_kv_cache \\\n",
+ "--trim_logits \\\n",
+ "--batch_size 1 \\\n",
+ "--bucket_size=128 \\\n",
+ "--bucket_internal \\\n",
+ "--trust_remote_code \\\n",
+ "--tasks hellaswag lambada_openai piqa winogrande \\\n",
+ "--bf16 \\\n",
+ "--attn_softmax_bf16 \\\n",
+ "--use_flash_attention \\\n",
+ "--flash_attention_recompute \\\n",
+ "--flash_attention_causal_mask"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b55e1ae-c373-4bed-a1a8-79ffe6bb0339",
+ "metadata": {},
+ "source": [
+ "#### Quantize and run the fp8 model\n",
+ "Here is an example to quantize the model based on previous measurements for the Llama3.1 8B model:  \n",
+ "By changing model_name_or_path, a different llama model could be applied.  \n",
+ "By changing world_size, multiple gaudi cards could be used to run the quantized model.  "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "7edc56bc-73a9-4ccf-bfce-19a962f64efb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!HF_DATASETS_TRUST_REMOTE_CODE=true QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \\\n",
+ "--use_deepspeed --world_size 1 run_generation.py \\\n",
+ "--model_name_or_path meta-llama/Llama-3.1-8B-Instruct \\\n",
+ "--attn_softmax_bf16 \\\n",
+ "--use_hpu_graphs \\\n",
+ "--use_kv_cache \\\n",
+ "--limit_hpu_graphs \\\n",
+ "--max_input_tokens 128 \\\n",
+ "--max_new_tokens 128 \\\n",
+ "--batch_size 1536 \\\n",
+ "--bucket_size=128 \\\n",
+ "--bucket_internal \\\n",
+ "--attn_batch_split 2 \\\n",
+ "--bf16 \\\n",
+ "--reuse_cache \\\n",
+ "--trim_logits \\\n",
+ "--use_flash_attention \\\n",
+ "--flash_attention_recompute \\\n",
+ "--flash_attention_causal_mask"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "088a24bd-2365-46c3-99bc-2259e2a1ffd6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exit()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}