Changes from all commits (105 commits)
a8be14c  Changes for SGLang support (avnermay, Mar 18, 2026)
1b2af07  Small test script (avnermay, Mar 18, 2026)
b9aceb5  Changes (avnermay, Mar 18, 2026)
fb9546a  Runner helpers (avnermay, Mar 18, 2026)
e8f7292  Updates to small test, assert in loader.py (avnermay, Mar 18, 2026)
af8c8ac  Changes (avnermay, Mar 18, 2026)
ff11967  Refactor of runner_helpers for all send/receive commands to use same … (avnermay, Mar 19, 2026)
9f3cb9e  Remove uv.lock (avnermay, Mar 19, 2026)
fc68b48  fix cudagraph_helpers to work with higher version of flashinfer (avnermay, Mar 19, 2026)
6795127  Switch some torch.empty calls back to torch.zeros for correctness (avnermay, Mar 19, 2026)
04439b1  Add PrefillRequest and SpeculationRequest objects in runner_helpers.py (avnermay, Mar 19, 2026)
a3d6cf0  NIT bug fix (avnermay, Mar 20, 2026)
0b8a6e5  Further refactor of PrefillRequest, SpeculationRequest, SpeculationRe… (avnermay, Mar 20, 2026)
6a36a14  Improvements to logging (avnermay, Mar 21, 2026)
b8c1fd7  Support for Phoenix V1 (avnermay, Mar 23, 2026)
4c127df  dist_utils needed for cross-node support (avnermay, Mar 23, 2026)
7a968e8  Merge branch 'avner/sglang' into avner/sglang-phnx (avnermay, Mar 23, 2026)
82ca79c  Fix bugs in how recovery_activations and eagle_activations are set an… (avnermay, Mar 23, 2026)
e632702  Merge branch 'avner/sglang' into avner/sglang-phnx (avnermay, Mar 24, 2026)
7053b80  FA4 initial implementation by CC (avnermay, Mar 28, 2026)
66b8b7b  FA4 support (avnermay, Mar 28, 2026)
65301a3  Add tests and tree_mask.py so that FA4 works (avnermay, Mar 28, 2026)
5256853  Merge branch 'avner/sglang-fa4' into avner/sglang-phnx-fa4 (avnermay, Mar 28, 2026)
fc1130d  Remove debug loading of Eagle activations (avnermay, Mar 28, 2026)
aa50214  Merge branch 'avner/sglang' into avner/sglang-fa4 (avnermay, Mar 28, 2026)
42cea6b  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 28, 2026)
d1c9215  Update pyproject.toml to reflect flash-attn 4 dependency, and no more… (Mar 28, 2026)
eb13cd3  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (Mar 28, 2026)
2463748  Fix FA4 import (avnermay, Mar 28, 2026)
7184e54  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 28, 2026)
d86d0fb  Add logging statement once draft process is waiting for target proces… (avnermay, Mar 28, 2026)
ab487ac  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 28, 2026)
1425f32  Trust remote code fix (avnermay, Mar 28, 2026)
743fb40  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 28, 2026)
cb51158  Add logging for draft model warmup (avnermay, Mar 28, 2026)
bfa56fd  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 28, 2026)
e701bfe  More logging (avnermay, Mar 29, 2026)
bfcb931  Switch all attention calls to use FA4 (avnermay, Mar 29, 2026)
80f2f76  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 29, 2026)
cce45eb  Add tests for attention fa4 (avnermay, Mar 29, 2026)
332b1f3  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 29, 2026)
080c4a3  Upgrade transformers, pin FA4 (avnermay, Mar 29, 2026)
37954a6  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 29, 2026)
eb5e612  DUMP_TENSORS=false fix (avnermay, Mar 30, 2026)
08248b2  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 30, 2026)
ff59fdf  Switch from ssh to https git dependency in pyproject.toml (avnermay, Mar 31, 2026)
dbdaa7b  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Mar 31, 2026)
107602a  Higher timeouts, clearer target <-> draft waiting messages, remove re… (avnermay, Apr 1, 2026)
0105932  Merge branch 'avner/sglang' into avner/sglang-fa4 (avnermay, Apr 1, 2026)
ddaff75  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 1, 2026)
f8af8e7  Acceptance rate log and force-jit-speculate (avnermay, Apr 10, 2026)
4c6997f  Improvements to benchmarking (avnermay, Apr 10, 2026)
b417d75  NIT: print cache_hits as ints (avnermay, Apr 10, 2026)
c6b6556  Set communicate logits to False in bench.py (avnermay, Apr 14, 2026)
4902095  Include eagle payload in the same fused tensor as the non-Eagle payload (avnermay, Apr 14, 2026)
f2ab9a0  Optimization + better profiling support (avnermay, Apr 14, 2026)
60dfb25  Add phoenix support to bench.py (avnermay, Apr 15, 2026)
cd88d1b  Add profiling and acceptance rate logging (avnermay, Apr 15, 2026)
8acc8c2  Merge branch 'avner/main2' into avner/sglang (avnermay, Apr 15, 2026)
2d25971  Merge branch 'avner/sglang' into avner/sglang-fa4 (avnermay, Apr 15, 2026)
22188cc  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 15, 2026)
440539c  Revert adding 19th argument to flashinfer plan, to make branch compat… (avnermay, Apr 15, 2026)
526b719  Merge branch 'avner/main' into avner/main2 (avnermay, Apr 15, 2026)
804b713  Merge branch 'avner/main2' into avner/sglang (avnermay, Apr 15, 2026)
af4ff69  Merge branch 'avner/sglang' into avner/sglang-fa4 (avnermay, Apr 15, 2026)
000bca2  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 15, 2026)
f3182b5  DUMP_TENSORS bug (avnermay, Apr 15, 2026)
386ca05  Merge branch 'avner/sglang' into avner/sglang-fa4 (avnermay, Apr 15, 2026)
2ea22f2  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 15, 2026)
e8269c5  Bug fix for change in apply_chat_template API in newer transformers v… (avnermay, Apr 15, 2026)
9b96ef6  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 15, 2026)
dc1b104  CC optimization for case where all extends are the same length (same … (avnermay, Apr 15, 2026)
2569541  Add llama-8b support to run_sglang_bench.py (avnermay, Apr 16, 2026)
8862f07  Upgrade sglang-kernel to remain synchronized with latest TGL main branch (avnermay, Apr 16, 2026)
0307ddb  Remove all phoenix-related code from avner/sglang-fa4-new (avnermay, Apr 16, 2026)
b200560  Revert "Remove all phoenix-related code from avner/sglang-fa4-new" (avnermay, Apr 16, 2026)
b1a21d3  SGLang benchmarking update (avnermay, Apr 16, 2026)
cc053d5  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 16, 2026)
584e795  Support for chat template and Llama 3.1 70B in run_sglang_bench.py (avnermay, Apr 17, 2026)
9227931  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 17, 2026)
3df2aae  CC bug fixes during testing (avnermay, Apr 17, 2026)
1c98a9b  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 17, 2026)
7b19eb2  V1 of CC tier 0 and 1 tests (avnermay, Apr 17, 2026)
10ff3a1  Refactor of JIT logic to be much clearer (avnermay, Apr 20, 2026)
c2a32c8  Fuse eagle and non-eagle payload in SpeculationRequest send/receive (avnermay, Apr 21, 2026)
34efc9b  Merge branch 'avner/sglang' into avner/sglang-fa4 (avnermay, Apr 21, 2026)
5046d56  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 21, 2026)
12ade23  dump tensors refactor in runner_helpers.py (avnermay, Apr 21, 2026)
d5d803d  Merge branch 'avner/sglang' into avner/sglang-fa4 (avnermay, Apr 21, 2026)
f30f646  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 21, 2026)
5290188  Clean up tensor dumping logic in runner_helpers (avnermay, Apr 27, 2026)
1ec3b89  Clean-up engine tensors on shutdown (avnermay, Apr 27, 2026)
71bcac9  NIT (avnermay, Apr 27, 2026)
b598154  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 27, 2026)
9191f97  Dump tensors logic (avnermay, Apr 27, 2026)
8ef073c  Process cleanup on failure + force-jit-speculate support (avnermay, Apr 27, 2026)
c8078d6  Make pytest import strategy importlib (avnermay, Apr 27, 2026)
885efea  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 27, 2026)
49618d0  HF reference tests (avnermay, Apr 30, 2026)
e45acfb  Merge branch 'avner/main2' into avner/sglang-fa4 (avnermay, Apr 30, 2026)
904ac7a  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 30, 2026)
b5f857e  Undo duplicate dumping of tensors after last merge (avnermay, Apr 30, 2026)
b6ec991  Merge branch 'avner/sglang-fa4' into avner/sglang-fa4-phnx (avnermay, Apr 30, 2026)
2ac8180  Refactor of SSD simulation, now allowing for JIT/fast backups (avnermay, May 1, 2026)
571f48f  Add verbose flag, fix bugs, in tests/hf/test_ssd_vs_hf_reference.py (avnermay, May 5, 2026)
bench/bench.py (24 changes: 17 additions & 7 deletions)

@@ -31,13 +31,14 @@ def parse_arguments():
     # Speculative decoding configuration
     parser.add_argument("--spec", action="store_true", help="Enable speculative decoding")
     parser.add_argument("--eagle", action="store_true", help="Enable eagle speculative decoding (implies --spec, uses default eagle draft for model)")
+    parser.add_argument("--phoenix", action="store_true", help="Enable phoenix speculative decoding (implies --spec, uses default phoenix draft for model)")
     parser.add_argument("--k", type=int, default=6, help="Speculative decoding k value")
     parser.add_argument("--async", action="store_true", help="Enable async speculative decoding")
     parser.add_argument("--f", type=int, default=3, help="Async fan out value")
     parser.add_argument("--fl", type=int, nargs='+', default=None, help="Fan out list (e.g., --fl 1 3 4 becomes [1, 3, 4])")
     parser.add_argument("--flh", type=int, nargs='+', default=None, help="Fan out list (e.g., --flh 1 3 4 becomes [1, 3, 4])")
     parser.add_argument("--flm", type=int, nargs='+', default=None, help="Fan out list miss (e.g., --flm 1 3 4 becomes [1, 3, 4])")
-    parser.add_argument("--backup", type=str, choices=["jit", "fast"], default="jit", help="Backup strategy (jit or fast)")
+    parser.add_argument("--backup", type=str, choices=["jit", "force-jit", "fast"], default="jit", help="Backup strategy (jit, force-jit, or fast)")
 
     # Memory and batching configuration
     parser.add_argument("--block_sz", type=int, default=256, help="KV cache block size (see config.py: kvcache_block_size)")
@@ -80,11 +81,13 @@ def parse_arguments():
     assert not (args.qwen and '--llama' in sys.argv), "--llama and --qwen are mutually exclusive"
     if args.qwen:
         args.llama = False
-    if args.eagle:
+    if args.eagle or args.phoenix:
         args.spec = True
-        assert args.llama, "Eagle currently only supports llama models"
-        assert args.temp == 0.0 and args.dtemp is None, "Eagle currently only supports greedy decoding (temp=0)"
-        assert getattr(args, 'async', False), "Eagle currently only supports async speculative decoding"
+        assert args.llama, "Eagle and Phoenix currently only support llama models"
+        assert args.temp == 0.0 and args.dtemp is None, "Eagle and Phoenix currently only support greedy decoding (temp=0)"
+        assert getattr(args, 'async', False), "Eagle and Phoenix currently only support async speculative decoding"
+    if getattr(args, 'async', False):
+        args.spec = True
     return args
 
 
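A quick illustration (ours, not part of the diff) of how the reworked flag implications compose: --eagle or --phoenix switches --spec on and, per the asserts, requires a llama target, greedy decoding, and async speculation; --async on its own now also implies --spec. A minimal runnable sketch:

import argparse

parser = argparse.ArgumentParser()
for flag in ("--spec", "--eagle", "--phoenix", "--async", "--llama"):
    parser.add_argument(flag, action="store_true")
args = parser.parse_args(["--phoenix", "--async", "--llama"])

if args.eagle or args.phoenix:
    args.spec = True  # Eagle/Phoenix imply speculative decoding
    # "async" is a Python keyword, so args.async is a syntax error; getattr is required
    assert args.llama and getattr(args, "async"), "llama target + async speculation required"
if getattr(args, "async", False):
    args.spec = True  # async alone also implies --spec

print(args.spec)  # True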
@@ -129,7 +132,7 @@ def initialize_wandb(args, run_name):
         "gpus": args.gpus,
         "speculative_decoding": args.spec,
         "async_speculative": getattr(args, 'async', False),
-        "jit_speculative": args.backup == "jit",
+        "backup_strategy": args.backup,
         "k": args.k if args.spec else None,
         "f": args.f,
         "fan_out_list": args.flh,
@@ -143,6 +146,8 @@ def initialize_wandb(args, run_name):
         "b": args.b,
         "block_size": args.block_sz,
         "eager": args.eager,
+        "eagle": args.eagle,
+        "phoenix": args.phoenix,
         "example_mode": args.example,
         "humaneval_mode": args.humaneval,
         "alpaca_mode": args.alpaca,
@@ -172,8 +177,11 @@ def create_llm_kwargs(args, draft_path):
         max_num_seqs=args.b,
         max_model_len=args.max_model_len,
         sampler_x=args.x,
-        jit_speculate=(args.backup == "jit"),
+        jit_speculate=(args.backup == "jit" or args.backup == "force-jit"),
+        force_jit_speculate=(args.backup == "force-jit"),
         max_steps=args.max_steps,
+        communicate_cache_hits=True,
+        communicate_logits=False,
     )
 
     if args.flh is not None:
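As a quick reference, the three --backup choices map onto the two engine kwargs as follows (hypothetical helper, condensed from the diff above):

def backup_kwargs(backup: str) -> dict:
    # "jit" and "force-jit" both enable JIT speculation; only "force-jit"
    # additionally forces it. "fast" enables neither.
    assert backup in ("jit", "force-jit", "fast")
    return {
        "jit_speculate": backup in ("jit", "force-jit"),
        "force_jit_speculate": backup == "force-jit",
    }

print(backup_kwargs("jit"))        # {'jit_speculate': True, 'force_jit_speculate': False}
print(backup_kwargs("force-jit"))  # {'jit_speculate': True, 'force_jit_speculate': True}
print(backup_kwargs("fast"))       # {'jit_speculate': False, 'force_jit_speculate': False}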
@@ -296,6 +304,8 @@ def main():
     llm_kwargs = create_llm_kwargs(args, draft_path)
     if args.eagle:
         llm_kwargs['use_eagle'] = True
+    if args.phoenix:
+        llm_kwargs['use_phoenix'] = True
     if args.debug:
         llm_kwargs['debug_mode'] = True
 
bench/bench_helpers.py (17 changes: 14 additions & 3 deletions)

@@ -6,9 +6,9 @@
 from typing import List, Optional, Tuple
 from transformers import AutoTokenizer
 try:
-    from ssd.paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B
+    from ssd.paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B, PHOENIX_70B
 except ImportError:
-    from bench_paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B
+    from bench_paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B, PHOENIX_70B
 
 
 def _get_snapshot_path(base_path: str) -> str:
@@ -62,6 +62,15 @@ def _get_draft_model_path(args, cache_dir: str) -> str:
         else:
             raise ValueError(f"EAGLE draft not available for Qwen size {args.size}")
 
+    if getattr(args, "phoenix", False):
+        if args.llama:
+            if args.size == "70":
+                return PHOENIX_70B
+            else:
+                raise ValueError(f"Phoenix draft not available for Llama size {args.size}")
+        else:
+            raise ValueError("Phoenix draft not available for Qwen models")
+
     if args.llama:
         draft_size_to_model = {
             "1": "Llama-3.2-1B-Instruct",
@@ -157,6 +166,7 @@ def load_dataset_token_ids(
         return None
 
     dataset_file_path = DATASET_PATHS[dataset_name]
+    print(f"Loading dataset '{dataset_name}' from: {dataset_file_path}")
     if not os.path.exists(dataset_file_path):
         print(
             f"Warning: Dataset file not found at {dataset_file_path}, falling back to random tokens")
@@ -172,10 +182,11 @@
             data = json.loads(line.strip())
             text: str = data["text"]
             if use_chat_template and hasattr(tokenizer, 'apply_chat_template'):
-                tokens = tokenizer.apply_chat_template(
+                result = tokenizer.apply_chat_template(
                     [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": text}],
                     add_generation_prompt=True,
                 )
+                tokens = result.input_ids if hasattr(result, 'input_ids') else result
             else:
                 tokens = tokenizer.encode(text, add_special_tokens=False)
 
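Context for the change above (commit e8269c5): depending on the transformers version and kwargs, apply_chat_template may return either a plain list of token ids or a BatchEncoding-like object carrying an input_ids field, so the code now normalizes both shapes. A minimal sketch of the same defensive pattern, using an example model id of our choosing:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")  # assumed example model
result = tokenizer.apply_chat_template(
    [{"role": "user", "content": "hello"}],
    add_generation_prompt=True,
)
# Older releases return list[int]; newer ones may return an object with .input_ids.
tokens = result.input_ids if hasattr(result, "input_ids") else result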
bench/bench_paths.py (24 changes: 23 additions & 1 deletion)

@@ -43,15 +43,29 @@ def _required_env(var_name: str, note: str) -> str:
     f"{HF_CACHE_DIR}/models--RedHatAI--Qwen3-32B-speculator.eagle3",
 )
 
+PHOENIX_70B = f"{HF_CACHE_DIR}/models--togethercomputer--phoenix-Llama-3p2-1B-Instruct-tgt-Llama-3p3-70b-instruct-UNTRAINED"
+
 MODELS = {
     "llama_70b": os.environ.get(
         "BENCH_LLAMA_70B",
         f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.3-70B-Instruct",
     ),
+    "llama_70b_3p1": os.environ.get(
+        "BENCH_LLAMA_70B_3P1",
+        f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.1-70B-Instruct",
+    ),
+    "llama_8b": os.environ.get(
+        "BENCH_LLAMA_8B",
+        f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.1-8B-Instruct",
+    ),
+    "llama_1b": os.environ.get(
+        "BENCH_LLAMA_1B",
+        f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.2-1B-Instruct",
+    ),
     "qwen_8b": os.environ.get(
         "BENCH_QWEN_8B",
         f"{HF_CACHE_DIR}/models--Qwen--Qwen3-8B",
     ),
     "qwen_32b": os.environ.get(
         "BENCH_QWEN_32B",
         f"{HF_CACHE_DIR}/models--Qwen--Qwen3-32B",
@@ -62,12 +76,20 @@ def _required_env(var_name: str, note: str) -> str:
     ),
     "eagle3_llama_70b": os.environ.get(
         "BENCH_EAGLE3_LLAMA_70B",
-        "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-SpecForge",
+        f"{HF_CACHE_DIR}/models--lmsys--SGLang-EAGLE3-Llama-3.3-70B-Instruct-SpecForge",
     ),
+    "eagle3_llama_8b": os.environ.get(
+        "BENCH_EAGLE3_LLAMA_8B",
+        f"{HF_CACHE_DIR}/models--yuhuili--EAGLE3-LLaMA3.1-Instruct-8B",
+    ),
     "eagle3_qwen_32b": os.environ.get(
         "BENCH_EAGLE3_QWEN_32B",
         "Zhihu-ai/Zhi-Create-Qwen3-32B-Eagle3",
     ),
+    "phoenix2_qwen_8b": os.environ.get(
+        "BENCH_PHOENIX2_QWEN_8B",
+        "togethercomputer/phnx2-llama-decagon-4layer-v1.0",
+    ),
 }
 
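Usage note: every MODELS entry can be redirected without code changes by exporting its BENCH_* variable before the paths module is imported, e.g. (hypothetical path, run from the bench/ directory):

import os
os.environ["BENCH_LLAMA_8B"] = "/data/checkpoints/Llama-3.1-8B-Instruct"  # hypothetical local path
import bench_paths  # the os.environ.get fallbacks above pick up the override
print(bench_paths.MODELS["llama_8b"])  # -> /data/checkpoints/Llama-3.1-8B-Instruct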

Expand Down
Loading