diff --git a/bench/bench.py b/bench/bench.py index b80f21955..09c1c883f 100644 --- a/bench/bench.py +++ b/bench/bench.py @@ -31,13 +31,14 @@ def parse_arguments(): # Speculative decoding configuration parser.add_argument("--spec", action="store_true", help="Enable speculative decoding") parser.add_argument("--eagle", action="store_true", help="Enable eagle speculative decoding (implies --spec, uses default eagle draft for model)") + parser.add_argument("--phoenix", action="store_true", help="Enable eagle speculative decoding (implies --spec, uses default eagle draft for model)") parser.add_argument("--k", type=int, default=6, help="Speculative decoding k value") parser.add_argument("--async", action="store_true", help="Enable async speculative decoding") parser.add_argument("--f", type=int, default=3, help="Async fan out value") parser.add_argument("--fl", type=int, nargs='+', default=None, help="Fan out list (e.g., --fl 1 3 4 becomes [1, 3, 4])") parser.add_argument("--flh", type=int, nargs='+', default=None, help="Fan out list (e.g., --flh 1 3 4 becomes [1, 3, 4])") parser.add_argument("--flm", type=int, nargs='+', default=None, help="Fan out list miss (e.g., --flm 1 3 4 becomes [1, 3, 4])") - parser.add_argument("--backup", type=str, choices=["jit", "fast"], default="jit", help="Backup strategy (jit or fast)") + parser.add_argument("--backup", type=str, choices=["jit", "force-jit", "fast"], default="jit", help="Backup strategy (jit or fast)") # Memory and batching configuration parser.add_argument("--block_sz", type=int, default=256, help="KV cache block size (see config.py: kvcache_block_size)") @@ -80,11 +81,13 @@ def parse_arguments(): assert not (args.qwen and '--llama' in sys.argv), "--llama and --qwen are mutually exclusive" if args.qwen: args.llama = False - if args.eagle: + if args.eagle or args.phoenix: + args.spec = True + assert args.llama, "Eagle and Phoenix currently only support llama models" + assert args.temp == 0.0 and args.dtemp is None, "Eagle and Phoenix currently only support greedy decoding (temp=0)" + assert getattr(args, 'async', False), "Eagle and Phoenix currently only support async speculative decoding" + if getattr(args, 'async', False): args.spec = True - assert args.llama, "Eagle currently only supports llama models" - assert args.temp == 0.0 and args.dtemp is None, "Eagle currently only supports greedy decoding (temp=0)" - assert getattr(args, 'async', False), "Eagle currently only supports async speculative decoding" return args @@ -129,7 +132,7 @@ def initialize_wandb(args, run_name): "gpus": args.gpus, "speculative_decoding": args.spec, "async_speculative": getattr(args, 'async', False), - "jit_speculative": args.backup == "jit", + "backup_strategy": args.backup, "k": args.k if args.spec else None, "f": args.f, "fan_out_list": args.flh, @@ -143,6 +146,8 @@ def initialize_wandb(args, run_name): "b": args.b, "block_size": args.block_sz, "eager": args.eager, + "eagle": args.eagle, + "phoenix": args.phoenix, "example_mode": args.example, "humaneval_mode": args.humaneval, "alpaca_mode": args.alpaca, @@ -172,8 +177,11 @@ def create_llm_kwargs(args, draft_path): max_num_seqs=args.b, max_model_len=args.max_model_len, sampler_x=args.x, - jit_speculate=(args.backup == "jit"), + jit_speculate=(args.backup == "jit" or args.backup == "force-jit"), + force_jit_speculate=(args.backup == "force-jit"), max_steps=args.max_steps, + communicate_cache_hits=True, + communicate_logits=False, ) if args.flh is not None: @@ -296,6 +304,8 @@ def main(): llm_kwargs = create_llm_kwargs(args, draft_path) if args.eagle: llm_kwargs['use_eagle'] = True + if args.phoenix: + llm_kwargs['use_phoenix'] = True if args.debug: llm_kwargs['debug_mode'] = True diff --git a/bench/bench_helpers.py b/bench/bench_helpers.py index 4079cf3a6..048dd5281 100644 --- a/bench/bench_helpers.py +++ b/bench/bench_helpers.py @@ -6,9 +6,9 @@ from typing import List, Optional, Tuple from transformers import AutoTokenizer try: - from ssd.paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B + from ssd.paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B, PHOENIX_70B except ImportError: - from bench_paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B + from bench_paths import DATASET_PATHS, HF_CACHE_DIR, EAGLE3_SPECFORGE_70B, EAGLE3_YUHUILI_8B, EAGLE3_QWEN_32B, PHOENIX_70B def _get_snapshot_path(base_path: str) -> str: @@ -62,6 +62,15 @@ def _get_draft_model_path(args, cache_dir: str) -> str: else: raise ValueError(f"EAGLE draft not available for Qwen size {args.size}") + if getattr(args, "phoenix", False): + if args.llama: + if args.size == "70": + return PHOENIX_70B + else: + raise ValueError(f"Phoenix draft not available for Llama size {args.size}") + else: + raise ValueError(f"Phoenix draft not available for Qwen models") + if args.llama: draft_size_to_model = { "1": "Llama-3.2-1B-Instruct", @@ -157,6 +166,7 @@ def load_dataset_token_ids( return None dataset_file_path = DATASET_PATHS[dataset_name] + print(f"Loading dataset '{dataset_name}' from: {dataset_file_path}") if not os.path.exists(dataset_file_path): print( f"Warning: Dataset file not found at {dataset_file_path}, falling back to random tokens") @@ -172,10 +182,11 @@ def load_dataset_token_ids( data = json.loads(line.strip()) text: str = data["text"] if use_chat_template and hasattr(tokenizer, 'apply_chat_template'): - tokens = tokenizer.apply_chat_template( + result = tokenizer.apply_chat_template( [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": text}], add_generation_prompt=True, ) + tokens = result.input_ids if hasattr(result, 'input_ids') else result else: tokens = tokenizer.encode(text, add_special_tokens=False) diff --git a/bench/bench_paths.py b/bench/bench_paths.py index 5e2e5ec6a..99c38f266 100644 --- a/bench/bench_paths.py +++ b/bench/bench_paths.py @@ -43,15 +43,29 @@ def _required_env(var_name: str, note: str) -> str: f"{HF_CACHE_DIR}/models--RedHatAI--Qwen3-32B-speculator.eagle3", ) +PHOENIX_70B = f"{HF_CACHE_DIR}/models--togethercomputer--phoenix-Llama-3p2-1B-Instruct-tgt-Llama-3p3-70b-instruct-UNTRAINED" + MODELS = { "llama_70b": os.environ.get( "BENCH_LLAMA_70B", f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.3-70B-Instruct", ), + "llama_70b_3p1": os.environ.get( + "BENCH_LLAMA_70B_3P1", + f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.1-70B-Instruct", + ), + "llama_8b": os.environ.get( + "BENCH_LLAMA_8B", + f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.1-8B-Instruct", + ), "llama_1b": os.environ.get( "BENCH_LLAMA_1B", f"{HF_CACHE_DIR}/models--meta-llama--Llama-3.2-1B-Instruct", ), + "qwen_8b": os.environ.get( + "BENCH_QWEN_8B", + f"{HF_CACHE_DIR}/models--Qwen--Qwen3-8B", + ), "qwen_32b": os.environ.get( "BENCH_QWEN_32B", f"{HF_CACHE_DIR}/models--Qwen--Qwen3-32B", @@ -62,12 +76,20 @@ def _required_env(var_name: str, note: str) -> str: ), "eagle3_llama_70b": os.environ.get( "BENCH_EAGLE3_LLAMA_70B", - "lmsys/SGLang-EAGLE3-Llama-3.3-70B-Instruct-SpecForge", + f"{HF_CACHE_DIR}/models--lmsys--SGLang-EAGLE3-Llama-3.3-70B-Instruct-SpecForge", + ), + "eagle3_llama_8b": os.environ.get( + "BENCH_EAGLE3_LLAMA_8B", + f"{HF_CACHE_DIR}/models--yuhuili--EAGLE3-LLaMA3.1-Instruct-8B", ), "eagle3_qwen_32b": os.environ.get( "BENCH_EAGLE3_QWEN_32B", "Zhihu-ai/Zhi-Create-Qwen3-32B-Eagle3", ), + "phoenix2_qwen_8b": os.environ.get( + "BENCH_PHOENIX2_QWEN_8B", + "togethercomputer/phnx2-llama-decagon-4layer-v1.0", + ), } diff --git a/bench/run_sglang_bench.py b/bench/run_sglang_bench.py index 2949f8be7..53bbeeb8d 100644 --- a/bench/run_sglang_bench.py +++ b/bench/run_sglang_bench.py @@ -4,10 +4,11 @@ The benchmark client (sglang_eval_client.py) sends requests and logs metrics. Usage: - python run_sglang_bench.py --llama # SD, Llama 70B - python run_sglang_bench.py --qwen # SD, Qwen 32B - python run_sglang_bench.py --llama --mode ar # autoregressive baseline - python run_sglang_bench.py --llama --wandb --name myrun # log to wandb + python -O /work/avner/git/ssd/bench/run_sglang_bench.py --llama # SD, Llama 70B + python -O /work/avner/git/ssd/bench/run_sglang_bench.py --qwen # SD, Qwen 32B + python -O /work/avner/git/ssd/bench/run_sglang_bench.py --llama --mode AR # autoregressive baseline + python -O /work/avner/git/ssd/bench/run_sglang_bench.py --llama --wandb --name myrun # log to wandb + python -O /work/avner/git/ssd/bench/run_sglang_bench.py --llama --mode EAGLE3 --size 8 --dataset humaneval --numseqs 1 --profile --tp 1 Set model paths via env vars (BENCH_LLAMA_70B, etc.) or edit bench_paths.py. """ @@ -23,81 +24,47 @@ from bench_paths import MODELS, resolve_snapshot -def get_server_cmd(args): - if args.llama: - target = resolve_snapshot(MODELS["llama_70b"]) - draft = resolve_snapshot(MODELS["llama_1b"]) - else: - target = resolve_snapshot(MODELS["qwen_32b"]) - draft = resolve_snapshot(MODELS["qwen_0.6b"]) - - cmd = [ - sys.executable, "-m", "sglang.launch_server", - "--model-path", target, - "--tp", str(args.tp), - "--mem-fraction-static", str(args.mem_frac), - "--max-running-requests", "1", - "--disable-radix-cache", - "--log-level", "warning", - "--port", str(args.port), - ] - - if args.mode == "sd": - # Speculative decoding with standalone draft model. - # Default: k=5 (num_steps=4, num_draft_tokens=5). - cmd += [ - "--speculative-algorithm", "STANDALONE", - "--speculative-draft-model-path", draft, - "--speculative-num-steps", str(args.num_steps), - "--speculative-eagle-topk", "1", - "--speculative-num-draft-tokens", str(args.num_draft_tokens), - ] - # mode == "ar": no speculative flags, just serve the target model. - - return cmd, target - - -def wait_for_server(port, timeout=900, interval=5): - url = f"http://localhost:{port}/health" - deadline = time.time() + timeout - while time.time() < deadline: - try: - if requests.get(url, timeout=2).status_code == 200: - return True - except requests.ConnectionError: - pass - time.sleep(interval) - return False - - -def kill_server(proc): - if proc.poll() is None: - os.killpg(os.getpgid(proc.pid), signal.SIGKILL) - proc.wait() - - def main(): parser = argparse.ArgumentParser(description="Launch SGLang server and benchmark it") parser.add_argument("--llama", action="store_true", default=True) parser.add_argument("--qwen", action="store_true") - parser.add_argument("--mode", choices=["ar", "sd"], default="sd", + parser.add_argument("--size", type=int, default=0) + parser.add_argument("--mode", choices=["AR", "STANDALONE", "ASYNC_STANDALONE", "EAGLE3", "ASYNC_EAGLE3", "PHOENIX", "ASYNC_PHOENIX"], default="STANDALONE", help="ar = autoregressive, sd = speculative decoding (default)") + parser.add_argument("--backup", choices=["fast", "jit", "force-jit"], default="jit", + help="Backup strategy (fast, jit, force-jit)") parser.add_argument("--tp", type=int, default=4) parser.add_argument("--port", type=int, default=40010) - parser.add_argument("--mem_frac", type=float, default=0.70) - parser.add_argument("--num_steps", type=int, default=4, help="draft chain depth (k = num_steps + 1)") - parser.add_argument("--num_draft_tokens", type=int, default=5) + parser.add_argument("--mem-frac", type=float, default=0.70) + parser.add_argument("--num-steps", type=int, default=4, help="draft chain depth (k = num_steps + 1)") + parser.add_argument("--context-length", type=int, default=4096) # Pass-through to eval client parser.add_argument("--numseqs", type=int, default=128) - parser.add_argument("--output_len", type=int, default=512) + parser.add_argument("--output-len", type=int, default=512) parser.add_argument("--temp", type=float, default=0.0) + parser.add_argument("--dataset", type=str, choices=["all", "humaneval", "alpaca", "c4", "ultrafeedback", "random", "example"], default="all") parser.add_argument("--wandb", action="store_true") - parser.add_argument("--group", type=str, default=None) + parser.add_argument("--group", type=str, default="ssd") parser.add_argument("--name", type=str, default=None) + parser.add_argument("--chat-template", action="store_true") + + parser.add_argument("--f", type=int, default=4, help="Async fan out value") + parser.add_argument("--fl", type=int, nargs='+', default=None, help="Fan out list (e.g., --fl 1 3 4 becomes [1, 3, 4])") + parser.add_argument("--flh", type=int, nargs='+', default=None, help="Fan out list (e.g., --flh 1 3 4 becomes [1, 3, 4])") + parser.add_argument("--flm", type=int, nargs='+', default=None, help="Fan out list miss (e.g., --flm 1 3 4 becomes [1, 3, 4])") + parser.add_argument("--communicate-cache-hits", action="store_true") + parser.add_argument("--verbose", action="store_true") + parser.add_argument("--acceptance-rate-log", type=str, default=None, + help="Path to log acceptance rates (sets ACCEPTANCE_RATE_LOG env var for the server)") + parser.add_argument("--profile", action="store_true") + args = parser.parse_args() if args.qwen: args.llama = False + if args.size == 0: + args.size = 70 if args.llama else 32 + server_cmd, target = get_server_cmd(args) print(f"Mode: {args.mode}, Target: {target}") print(f"Server cmd: {' '.join(server_cmd)}") @@ -107,7 +74,17 @@ def main(): capture_output=True) time.sleep(2) - proc = subprocess.Popen(server_cmd, preexec_fn=os.setsid) + env = os.environ.copy() + if args.acceptance_rate_log: + env["ACCEPTANCE_RATE_LOG"] = args.acceptance_rate_log + print(f"ACCEPTANCE_RATE_LOG={args.acceptance_rate_log}") + if args.profile: + # env["SSD_PROFILE"] = "1" + # print("SSD_PROFILE=1") + env["SSD_PROFILE_EVENTS"] = "1" + print("SSD_PROFILE_EVENTS=1") + + proc = subprocess.Popen(server_cmd, preexec_fn=os.setsid, env=env) try: print("Waiting for server...") if not wait_for_server(args.port): @@ -118,19 +95,22 @@ def main(): bench_dir = os.path.dirname(__file__) eval_cmd = [ sys.executable, os.path.join(bench_dir, "sglang_eval_client.py"), - "--size", "70" if args.llama else "32", + "--size", str(args.size), "--numseqs", str(args.numseqs), "--output_len", str(args.output_len), "--temp", str(args.temp), - "--all", "--b", "1", + f"--{args.dataset}", + "--b", "1", "--port", str(args.port), ] + if args.chat_template: + eval_cmd.append("--chat-template") if args.llama: eval_cmd.append("--llama") else: eval_cmd.append("--qwen") - if args.mode == "sd": - eval_cmd += ["--draft", "1" if args.llama else "0.6"] + if is_eagle3(args.mode): + eval_cmd.append("--eagle") if args.wandb: eval_cmd += ["--wandb"] if args.group: @@ -145,5 +125,131 @@ def main(): print("Server stopped") +def is_spec(mode): + return mode in ["STANDALONE", "ASYNC_STANDALONE", "EAGLE3", "ASYNC_EAGLE3", "PHOENIX2", "ASYNC_PHOENIX2"] + + +def is_async(mode): + return mode in ["ASYNC_STANDALONE", "ASYNC_EAGLE3", "ASYNC_PHOENIX"] + + +def is_standalone(mode): + return mode in ["STANDALONE", "ASYNC_STANDALONE"] + +def is_eagle3(mode): + return mode in ["EAGLE3", "ASYNC_EAGLE3"] + + +def is_phoenix(mode): + return mode in ["PHOENIX2", "ASYNC_PHOENIX2"] + + +def get_server_cmd(args): + if args.llama: + draft_name = "llama_1b" + if args.size == 70: + if is_eagle3(args.mode): + target = resolve_snapshot(MODELS["llama_70b_3p1"]) + else: + target = resolve_snapshot(MODELS["llama_70b"]) + draft_name = "llama_1b" if is_standalone(args.mode) else "eagle3_llama_70b" + elif args.size == 8: + target = resolve_snapshot(MODELS["llama_8b"]) + draft_name = "llama_1b" if is_standalone(args.mode) else "eagle3_llama_8b" + else: + raise ValueError(f"Unsupported size for llama: {args.size}") + + draft = resolve_snapshot(MODELS[draft_name]) + else: + target = resolve_snapshot(MODELS["qwen_32b"]) + if is_standalone(args.mode): + draft = resolve_snapshot(MODELS["qwen_0.6b"]) + elif is_eagle3(args.mode): + draft = resolve_snapshot(MODELS["eagle3_qwen_32b"]) + elif is_phoenix(args.mode): + target = resolve_snapshot(MODELS["qwen_8b"]) + draft = resolve_snapshot(MODELS["phoenix2_qwen_8b"]) + else: + raise ValueError(f"Unsupported mode for qwen: {args.mode}") + + cmd = [ + "sglang", "serve", + "--model-path", target, + "--tp", str(args.tp), + "--mem-fraction-static", str(args.mem_frac), + "--max-running-requests", "1", + # "--disable-radix-cache", + "--log-level", "warning", + "--port", str(args.port), + "--context-length", str(args.context_length), + "--dtype", "bfloat16", + ] + + if is_spec(args.mode): + # Speculative decoding with standalone draft model. + # Default: k=5 (num_steps=4, num_draft_tokens=5). + cmd += [ + "--speculative-algorithm", args.mode, + "--speculative-draft-model-path", draft, + "--speculative-num-steps", str(args.num_steps), + "--speculative-eagle-topk", "1", + "--speculative-num-draft-tokens", str(args.num_steps + 1), + ] + if is_async(args.mode): + cmd += [ + "--speculative-async-fan-out", str(args.f), + ] + if args.fl: + cmd += [ + "--speculative-async-fan-out-list", ",".join(map(str, args.fl)), + ] + if args.flh: + cmd += [ + "--speculative-async-fan-out-list-hit", ",".join(map(str, args.flh)), + ] + if args.flm: + cmd += [ + "--speculative-async-fan-out-list-miss", ",".join(map(str, args.flm)), + ] + if args.backup in ["jit", "force-jit"]: + cmd += [ + "--speculative-async-jit-speculate", + ] + if args.backup == "force-jit": + cmd += [ + "--speculative-async-force-jit-speculate", + ] + if args.communicate_cache_hits: + cmd += [ + "--speculative-async-communicate-cache-hits", + ] + if args.verbose: + cmd += [ + "--speculative-async-verbose", + ] + + # mode == "ar": no speculative flags, just serve the target model. + return cmd, target + + +def wait_for_server(port, timeout=900, interval=5): + url = f"http://localhost:{port}/health" + deadline = time.time() + timeout + while time.time() < deadline: + try: + if requests.get(url, timeout=2).status_code == 200: + return True + except requests.ConnectionError: + pass + time.sleep(interval) + return False + + +def kill_server(proc): + if proc.poll() is None: + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) + proc.wait() + + if __name__ == "__main__": main() diff --git a/bench/small_test.py b/bench/small_test.py new file mode 100644 index 000000000..4efb136ee --- /dev/null +++ b/bench/small_test.py @@ -0,0 +1,82 @@ +import argparse +import os + +from transformers import AutoTokenizer +from ssd import LLM, SamplingParams + +if __name__ == '__main__': + + llama_1b_path = '/scratch/avner/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6' + llama_70b_path = '/scratch/avner/huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b' + eagle_path = '/scratch/avner/huggingface/hub/models--lmsys--SGLang-EAGLE3-Llama-3.3-70B-Instruct-SpecForge/snapshots/63ebaa6585f96b89685adad8fdfa0da53be6a8fd' + phoenix_path = '/scratch/avner/huggingface/hub/models--togethercomputer--phoenix-Llama-3p2-1B-Instruct-tgt-Llama-3p3-70b-instruct-UNTRAINED/snapshots/3af59d71514388e14d8685f2b684f74e3e311717' + # eagle_path = '/scratch/avner/huggingface/hub/models--yuhuili--EAGLE3-LLaMA3.3-Instruct-70B' + assert os.path.isdir(llama_1b_path) + assert os.path.isdir(llama_70b_path) + assert os.path.isdir(eagle_path) + + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=llama_1b_path) + parser.add_argument("--draft", type=str, default=llama_1b_path) + parser.add_argument("--eagle", action="store_true") + parser.add_argument("--phoenix", action="store_true") + parser.add_argument("--k", type=int, default=7) + parser.add_argument("--jit-speculate", action="store_true") + parser.add_argument("--num-gpus", type=int, default=2) + parser.add_argument("--ignore-eos", action="store_true") + parser.add_argument("--chat-template", action="store_true") + parser.add_argument("--communicate-logits", action="store_true") + parser.add_argument("--communicate-cache-hits", action="store_true") + parser.add_argument("--mary", action="store_true") + parser.add_argument("--verbose", action="store_true") + args = parser.parse_args() + + if args.eagle: + args.draft = eagle_path + args.model = llama_70b_path + args.num_gpus = 5 + args.jit_speculate = True + args.chat_template = True + + if args.phoenix: + args.draft = phoenix_path + args.model = llama_70b_path + args.num_gpus = 5 + args.jit_speculate = True + args.chat_template = True + + llm = LLM( + model=args.model, + draft=args.draft, + use_eagle=args.eagle, + use_phoenix=args.phoenix, + speculate_k=args.k, + speculate=True, + draft_async=True, + num_gpus=args.num_gpus, + jit_speculate=args.jit_speculate, + verbose=args.verbose, + communicate_logits=args.communicate_logits, + communicate_cache_hits=args.communicate_cache_hits, + ) + sampling_params = [SamplingParams(temperature=0.0, max_new_tokens=64, ignore_eos=args.ignore_eos)] + + if args.mary: + text = "Can you please tell me the lyrics to Mary had a little lamb, and can you repeat it 10 times?" + else: + text = "What is the capital city of France?" + if args.chat_template: + tokenizer = AutoTokenizer.from_pretrained(args.model) + tokens = tokenizer.apply_chat_template( + [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": text}], + add_generation_prompt=True, + ) + token_str = tokenizer.decode(tokens) + print(f"Generating response to prompt: '{token_str}'") + print(f"=============================================================") + outputs, _ = llm.generate([tokens], sampling_params) + + else: + outputs, _ = llm.generate([text], sampling_params) + + print(outputs[0]["text"]) diff --git a/pyproject.toml b/pyproject.toml index 41451ce37..19d77fd65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,25 +12,20 @@ readme = "README.md" description = "Async tree-based speculative decoding research engine" requires-python = ">=3.11,<3.13" dependencies = [ - "torch==2.8.0", - "triton==3.4.0", - "transformers==4.57.1", - "xxhash==3.5.0", - "numpy==2.3.3", - "safetensors==0.6.2", - "tqdm==4.67.1", - "flashinfer-python==0.5.2", - "sgl-kernel==0.3.17.post1", - "nvidia-cutlass-dsl==4.2.1", + "torch==2.9.1", + "triton", + "transformers>=5.3.0", + "xxhash", + "numpy", + "safetensors", + "tqdm", + "sglang-kernel==0.4.1", # Make sure this version is synchronized with TGL + "nvidia-cutlass-dsl>=4.3.4", "wandb==0.22.0", "hf_transfer", "tiktoken", -] - -[project.optional-dependencies] -scripts = [ - "datasets", - "huggingface_hub", + # Install from source for now, for latest support on Hopper + "flash-attn-4 @ git+https://github.com/Dao-AILab/flash-attention.git@5301a359f59ef8fa10f211618d9f7a69716a8898#subdirectory=flash_attn/cute", ] [project.urls] diff --git a/ssd/__init__.py b/ssd/__init__.py index a748fcbb6..e378d5bcf 100644 --- a/ssd/__init__.py +++ b/ssd/__init__.py @@ -20,5 +20,7 @@ prepare_decode_tensors_from_seqs, prepare_block_tables_from_seqs, prepare_prefill_tensors_from_seqs, - prepare_prefill_payload, + PrefillRequest, + SpeculationRequest, + SpeculationResponse, ) diff --git a/ssd/config.py b/ssd/config.py index 7c61564a0..558802943 100644 --- a/ssd/config.py +++ b/ssd/config.py @@ -4,18 +4,19 @@ import torch from ssd.paths import DEFAULT_TARGET, DEFAULT_DRAFT + @dataclass class Config: model: str = DEFAULT_TARGET max_num_batched_tokens: int = 16384 - max_num_seqs: int = 1 - max_model_len: int = 4096 + max_num_seqs: int = 1 + max_model_len: int = 4096 gpu_memory_utilization: float = 0.7 num_gpus: int = 1 enforce_eager: bool = False hf_config: AutoConfig | None = None eos: int = -1 - kvcache_block_size: int = 256 + kvcache_block_size: int = 1 num_kvcache_blocks: int = -1 device: torch.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") @@ -25,16 +26,22 @@ class Config: draft: str = DEFAULT_DRAFT speculate_k: int = 1 draft_async: bool = False - + # async spec only async_fan_out: int = 3 fan_out_list: list[int] | None = None fan_out_list_miss: list[int] | None = None sampler_x: float | None = None - jit_speculate: bool = False + jit_speculate: bool = False + force_jit_speculate: bool = False + async_nccl_port: int | None = None + async_nccl_host: str = "127.0.0.1" + communicate_logits: bool = False + communicate_cache_hits: bool = False - # eagle3 + # eagle3 / phoenix use_eagle: bool = False + use_phoenix: bool = False eagle_layers: list[int] | None = None d_model_target: int | None = None tokenizer_path: str | None = None @@ -48,29 +55,47 @@ class Config: def max_blocks(self): return (self.max_model_len + self.kvcache_block_size - 1) // self.kvcache_block_size + @property + def use_eagle_or_phoenix(self): + return self.use_eagle or self.use_phoenix + def __post_init__(self): - model = self.model + model = self.model assert os.path.isdir(model) assert 1 <= self.num_gpus <= 8 # this codebase only works on one node self.hf_config = AutoConfig.from_pretrained(model) - self.max_model_len = min( - self.max_model_len, self.hf_config.max_position_embeddings) - if self.speculate: + + if not self.speculate: + if self.max_model_len: + self.max_model_len = min( + self.max_model_len, self.hf_config.max_position_embeddings) + else: + self.max_model_len = self.hf_config.max_position_embeddings + else: draft = self.draft self.draft_hf_config = AutoConfig.from_pretrained(draft) - self.max_model_len = min( - self.max_model_len, self.draft_hf_config.max_position_embeddings) + if self.max_model_len: + self.max_model_len = min( + self.max_model_len, self.draft_hf_config.max_position_embeddings) + else: + self.max_model_len = self.draft_hf_config.max_position_embeddings + if self.draft_async: if self.fan_out_list is None: self.fan_out_list = [self.async_fan_out] * (self.speculate_k + 1) self.MQ_LEN = sum(self.fan_out_list) - if self.fan_out_list_miss is None: - self.fan_out_list_miss = self.fan_out_list + if not self.jit_speculate: + print(f'[Config] Setting fan_out_list_miss to [sum(fan_out_list)] + [0] * speculate_k because jit_speculate is False', flush=True) + self.fan_out_list_miss = [sum(self.fan_out_list)] + [0] * self.speculate_k + elif self.fan_out_list_miss is None: + # If you are jit speculating, always use the same fan_out_list for misses as for hits. + self.fan_out_list_miss = self.fan_out_list + assert sum(self.fan_out_list_miss) == sum(self.fan_out_list), "ERROR in Config: fan_out_list_miss must be the same as fan_out_list" - - if self.use_eagle: - if self.eagle_layers is None: + + if self.use_eagle_or_phoenix: + if self.use_eagle and self.eagle_layers is None: L = self.hf_config.num_hidden_layers # self.eagle_layers = [3, L//2, L-3] self.eagle_layers = [2, L//2, L-3] # [2, 16, 29] outputs, ie. [3, L//2+1, L-2] inputs @@ -90,5 +115,13 @@ def __post_init__(self): if target_max_pos != draft_max_pos: print(f'[Config] Overriding eagle draft max_position_embeddings: {draft_max_pos} -> {target_max_pos}', flush=True) self.draft_hf_config.max_position_embeddings = target_max_pos - - assert self.max_num_batched_tokens >= self.max_model_len + + if self.sampler_x is not None and not self.communicate_cache_hits: + self.communicate_cache_hits = True + print(f'[Config] Setting communicate_cache_hits to True because sampler_x is not None', flush=True) + + # assert self.max_num_batched_tokens >= self.max_model_len + if self.max_num_batched_tokens < self.max_model_len: + print(f'[Config] Warning: max_num_batched_tokens ({self.max_num_batched_tokens}) is less than max_model_len ({self.max_model_len})', flush=True) + print(f'[Config] Setting max_num_batched_tokens to max_model_len', flush=True) + self.max_num_batched_tokens = self.max_model_len diff --git a/ssd/engine/block_manager.py b/ssd/engine/block_manager.py index 1b28ca8a1..0f68028ab 100644 --- a/ssd/engine/block_manager.py +++ b/ssd/engine/block_manager.py @@ -90,6 +90,11 @@ def _deallocate_n_blocks(self, block_ids: list[int]): # we need to separate wher def _deallocate_block(self, block_id: int) -> Block: assert self.blocks[block_id].ref_count == 0 + + if self.blocks[block_id].hash != -1: # if block was finalized, remove from hash_to_block_id checkme + if self.hash_to_block_id.get(self.blocks[block_id].hash) == block_id: + del self.hash_to_block_id[self.blocks[block_id].hash] + self.used_block_ids.remove(block_id) self.free_block_ids.append(block_id) diff --git a/ssd/engine/draft_runner.py b/ssd/engine/draft_runner.py index bf1c6c977..d29cd25d3 100644 --- a/ssd/engine/draft_runner.py +++ b/ssd/engine/draft_runner.py @@ -1,5 +1,6 @@ import os import time +from datetime import datetime import torch import torch.distributed as dist import dataclasses @@ -7,15 +8,23 @@ from ssd.engine.model_runner import ModelRunner from ssd.config import Config from ssd.utils.context import set_context, reset_context +from ssd.utils.misc import compress_neg_ones_and_zeros from ssd.utils.async_helpers.async_spec_helpers import get_forked_recovery_tokens_from_logits, make_glue_decode_input_ids -from ssd.utils.async_helpers.nccl_pack import recv_int64 from ssd.engine.helpers.cudagraph_helpers import flush_draft_profile +from ssd.engine.helpers.runner_helpers import PrefillRequest, SpeculationRequest, SpeculationResponse, COMMAND PROFILE_DRAFT = os.environ.get("SSD_PROFILE_DRAFT", "0") == "1" +PROFILE_EVENTS = os.environ.get("SSD_PROFILE_EVENTS", "0") == "1" # CUDA event timing (no sync overhead) +NCCL_LOG = os.environ.get("SSD_NCCL_LOG", "0") == "1" +BRIEF_LOG = os.environ.get("SSD_BRIEF_LOG", "0") == "1" + +def _ts(): + return f'{datetime.now().strftime("%H:%M:%S.%f")[:-3]}' ttl = 0 ttl_hit = 0 + class DraftRunner(ModelRunner): @classmethod @@ -25,59 +34,72 @@ def create_draft_config(cls, cfg: Config) -> Config: cfg, model=cfg.draft, gpu_memory_utilization = (0.75 if not cfg.draft_async else 0.8), # REMAINING SPACE if not draft_async - tokenizer_path=cfg.model if cfg.use_eagle else None, - d_model_target=cfg.hf_config.hidden_size if cfg.use_eagle and cfg.hf_config else None, - enforce_eager=cfg.enforce_eager, + tokenizer_path=cfg.model if cfg.use_eagle_or_phoenix else None, + d_model_target=cfg.hf_config.hidden_size if cfg.use_eagle_or_phoenix and cfg.hf_config else None, ) return draft_cfg - def __init__(self, cfg: Config, rank: int = 0, init_q = None): - self.draft_cfg = self.create_draft_config(cfg) + def __init__(self, draft_cfg: Config, rank: int = 0, init_q = None): + print(f'[DraftRunner.__init__] draft_cfg={draft_cfg}', flush=True) + self.draft_cfg = draft_cfg self.is_draft = True # this is is_draft, use self.config.draft for the draft model path self.prev_num_tokens = None super().__init__(self.draft_cfg, rank=rank, event=None, is_draft=True, num_tp_gpus=1, init_q=init_q) - - if self.config.use_eagle: - assert self.config.jit_speculate, \ - "EAGLE requires jit_speculate=True (cache misses need draft activations)" + self._prefill_metadata = torch.empty(5, dtype=torch.int64, device=self.device) + self._decode_metadata = torch.empty(4, dtype=torch.int64, device=self.device) + self.target_rank = 0 + self.communicate_logits = self.config.communicate_logits + self.communicate_cache_hits = self.config.communicate_cache_hits if self.is_draft and self.draft_async: self._reset_tree_cache_tensors() self._init_prealloc_buffers() self._draft_step_times = [] - print(f'DraftRunner set up, starting draft_loop', flush=True) + self._acceptance_lengths = [] + self._cache_hits = [] + self._acceptance_rate_log_path = os.environ.get("ACCEPTANCE_RATE_LOG", None) + if self._acceptance_rate_log_path: + print(f'[{_ts()}] DraftRunner will log acceptance rate to: {self._acceptance_rate_log_path}', flush=True) + print(f'[{_ts()}] DraftRunner set up, starting draft_loop', flush=True) self.draft_loop() def draft_async_prefill(self): assert self.draft_async and self.is_draft - # 1) Receive metadata then individual tensors - # First recv metadata to learn sizes - metadata = torch.zeros(5, dtype=torch.int64, device=self.device) - dist.recv(metadata, src=0, group=self.async_pg) - total_new_tokens, batch_size, max_blocks, use_eagle, eagle_act_dim = metadata.tolist() - if use_eagle: - assert eagle_act_dim == 3 * self.config.d_model_target, ( - f"EAGLE activation dimension {eagle_act_dim} does not match expected dimension 3 * {self.config.d_model_target}" - ) + if self.config.verbose: + print(f'[{_ts()}] [draft_async_prefill] DRAFT ASYNC PREFILL STARTING', flush=True) + + prefill_request = PrefillRequest.receive(self.async_pg, self.target_rank, self.device, metadata_buffer=self._prefill_metadata) + total_new_tokens, batch_size, max_blocks, use_eagle_or_phoenix, eagle_phoenix_act_dim = prefill_request.metadata.tolist() + input_ids = prefill_request.input_ids + num_tokens = prefill_request.num_tokens + draft_block_table = prefill_request.draft_block_table + eagle_acts = prefill_request.eagle_acts + + if NCCL_LOG: + sep = '=' * 80 + print(f"[{_ts()}] \n{sep}", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_PREFILL] input_ids shape={input_ids.shape}, values={input_ids.tolist()}", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_PREFILL] input_ids decoded='{self.tokenizer.decode(input_ids.cpu().tolist())}'", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_PREFILL] num_tokens={num_tokens.tolist()}", flush=True) + draft_block_table_values_str = compress_neg_ones_and_zeros(f"{draft_block_table.tolist()}") + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_PREFILL] draft_block_table shape={draft_block_table.shape}, values={draft_block_table_values_str}", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_PREFILL] eagle_acts={'None' if eagle_acts is None else f'shape={eagle_acts.shape}'}", flush=True) + print(f"[{_ts()}] {sep}\n", flush=True) + + prefill_ctxt = self.prepare_prefill_ctxt(num_tokens, draft_block_table) - # 2) receive fused int64 payload (input_ids + num_tokens + draft_block_table) - fused_total = total_new_tokens + batch_size + batch_size * max_blocks - fused = recv_int64(self.async_pg, src=0, total_length=fused_total, device=self.device) - off = 0 - input_ids = fused[off:off + total_new_tokens]; off += total_new_tokens - num_tokens = fused[off:off + batch_size]; off += batch_size - draft_block_table = fused[off:off + batch_size * max_blocks].view(batch_size, max_blocks).to(torch.int32); off += batch_size * max_blocks - assert off == fused_total - - eagle_acts = None - if use_eagle: - eagle_acts = torch.zeros( - total_new_tokens, eagle_act_dim, dtype=self.hf_config.torch_dtype, device=self.device, + if self.config.use_eagle: + assert eagle_phoenix_act_dim == 3 * self.config.d_model_target, ( + f"EAGLE activation dimension {eagle_phoenix_act_dim} does not match expected dimension 3 * {self.config.d_model_target}" + ) + elif self.config.use_phoenix: + assert eagle_phoenix_act_dim == self.config.d_model_target, ( + f"PHOENIX activation dimension {eagle_phoenix_act_dim} does not match expected dimension {self.config.d_model_target}" ) - dist.recv(eagle_acts, src=0, group=self.async_pg) + if self.config.verbose: + print(f'[{_ts()}] [draft_async_prefill] METADATA: total_new_tokens={total_new_tokens}, batch_size={batch_size}, max_blocks={max_blocks}, use_eagle_or_phoenix={use_eagle_or_phoenix}, eagle_phoenix_act_dim={eagle_phoenix_act_dim}', flush=True) - prefill_ctxt = self.prepare_prefill_ctxt(num_tokens, draft_block_table) # 5) set up context exactly like prepare_prefill() does: set_context( @@ -92,10 +114,16 @@ def draft_async_prefill(self): # 6) run the draft model in prefill mode positions = prefill_ctxt["positions"] - if self.config.use_eagle: - self.run_model(input_ids, positions, is_prefill=True, last_only=True, hidden_states=eagle_acts) - else: - self.run_model(input_ids, positions, is_prefill=True, last_only=True, hidden_states=eagle_acts) + self.run_model(input_ids, positions, is_prefill=True, last_only=True, hidden_states=eagle_acts) + + if self.config.verbose: + print(f'[{_ts()}] [draft_async_prefill] DRAFT ASYNC PREFILL DONE', flush=True) + # --- KV cache diagnostic --- + kv = self.kv_cache # [2, layers, blocks, block_size, heads, dim] + prefill_slots = prefill_ctxt["slot_map"].long() + k_norm = kv[0, 0, prefill_slots, 0, :, :].norm().item() + v_norm = kv[1, 0, prefill_slots, 0, :, :].norm().item() + print(f'[{_ts()}] [KV_CACHE] After prefill: K norm at slots {prefill_slots.tolist()} = {k_norm:.4f}, V norm = {v_norm:.4f}', flush=True) # 7) clean up reset_context() @@ -103,8 +131,7 @@ def draft_async_prefill(self): def _reset_tree_cache_tensors(self): """Reset tensor-backed tree cache to empty.""" # initialize as empty keys on correct device; tokens/logits set to None until first populate - self.tree_cache_keys = torch.zeros( - (0, 3), dtype=torch.int64, device=self.device) + self.tree_cache_keys = torch.empty(0, 3, dtype=torch.int64, device=self.device) self.tree_cache_tokens = None self.tree_cache_logits = None self.tree_cache_activations = None @@ -121,19 +148,19 @@ def _init_prealloc_buffers(self): self._arange_kp1 = torch.arange(K + 1, device=d, dtype=torch.int64) self._arange_2kp1 = torch.arange(2 * K + 1, device=d, dtype=torch.int64) - def jit_speculate(self, - request_keys: torch.Tensor, - num_tokens: torch.Tensor, - out_logits: torch.Tensor, - out_tokens: torch.Tensor, - temperatures: torch.Tensor, - draft_block_tables: torch.Tensor, - target_recovery_activations: torch.Tensor = None): - + def jit_speculate( + self, + request_keys: torch.Tensor, + num_tokens: torch.Tensor, + out_logits: torch.Tensor, + out_tokens: torch.Tensor, + temperatures: torch.Tensor, + draft_block_tables: torch.Tensor, + target_recovery_activations: torch.Tensor = None, + ): input_ids = request_keys[:, -1] - pos_offset = -1 if self.config.use_eagle else 0 - positions = num_tokens - 1 + pos_offset # want to write rec token at post N-1 since [0, ..., N-2] filled by prefill - context_lens = num_tokens + pos_offset # N+1 + positions = num_tokens - 1 + context_lens = num_tokens # Calculate slot mapping vectorized block_idx = positions // self.block_size pos_in_block = positions % self.block_size @@ -142,13 +169,16 @@ def jit_speculate(self, hidden_states = None spec_activations = None - - if self.config.use_eagle: + + if self.config.use_eagle_or_phoenix: assert target_recovery_activations is not None - hidden_states = self.model.fc(target_recovery_activations.to(self.model.fc.weight.dtype)) + if self.config.use_eagle: + hidden_states = self.model.fc(target_recovery_activations.to(self.model.fc.weight.dtype)) + else: + hidden_states = target_recovery_activations spec_activations = torch.empty( input_ids.shape[0], self.config.speculate_k, - self.hf_config.hidden_size, + self.hidden_states_dim, dtype=self.hf_config.torch_dtype, device=self.device) for i in range(self.config.speculate_k): # we're going to glue after this anyways, and by sending the spec request target has verified we have K more slots left in our last page @@ -160,14 +190,18 @@ def jit_speculate(self, is_jit=True, ) - if self.config.use_eagle: + if self.config.use_eagle_or_phoenix: logits, prenorm = self.run_model(input_ids, positions, is_prefill=False, last_only=True, hidden_states=hidden_states) - spec_activations[:, i] = prenorm - hidden_states = prenorm + if self.config.use_eagle: + spec_activations[:, i] = prenorm + hidden_states = prenorm + else: + spec_activations[:, i] = hidden_states else: logits = self.run_model(input_ids, positions, is_prefill=False, last_only=True) - - out_logits[:, i, :] = logits + + if self.config.communicate_logits: + out_logits[:, i, :] = logits reset_context() next_tokens = self.sampler(logits, temperatures, is_tree=True) out_tokens[:, i] = next_tokens @@ -183,198 +217,237 @@ def jit_speculate(self, return spec_activations - def hit_cache_and_respond(self, request_keys, B, K, num_tokens, temperatures, draft_block_tables, target_recovery_activations=None): + def hit_cache(self, request_keys, B, K, num_tokens, temperatures, draft_block_tables, target_recovery_activations=None): """Hits the cache (tensor-backed) and returns tensors to respond to the spec request.""" - global ttl, ttl_hit + global ttl # Draft model now returns full target vocab size logits (after d2t expansion) V = self.hf_config.vocab_size - # Init miss slots with valid random logits so token IDs are in-vocab (fixes B>1 crash) - out_logits = torch.empty((B, K, V), dtype=self.hf_config.torch_dtype, device=self.device).uniform_() - out_tokens = out_logits.argmax(dim=-1) - cache_hits = torch.zeros(B, dtype=torch.int64, device=self.device) + if self.config.communicate_logits: + out_logits = torch.full((B, K, V), float('-inf'), dtype=self.hf_config.torch_dtype, device=self.device) + out_logits[:, :, 0] = 0.0 + else: + out_logits = None + + out_tokens = torch.zeros(B, K, dtype=torch.int64, device=self.device) + cache_hits = torch.zeros(B, dtype=torch.bool, device=self.device) - assert request_keys.shape == (B, 3), f"ERROR in hit_cache_and_respond: request_keys should be (B, 3), got {request_keys.shape}" - - hidden_size = self.hf_config.hidden_size - out_activations = torch.zeros( - B, K, hidden_size, + assert request_keys.shape == (B, 3), f"ERROR in hit_cache: request_keys should be (B, 3), got {request_keys.shape}" + + out_activations = torch.empty( + B, K, self.hidden_states_dim, dtype=self.hf_config.torch_dtype, device=self.device - ) if self.config.use_eagle else None - + ) if self.config.use_eagle_or_phoenix else None + # Statistics ttl += int(B) - + if self.config.verbose: - print(f"[hit_cache_and_respond] Request keys: {request_keys}", flush=True) + print(f"[{_ts()}] [hit_cache] Request keys: {request_keys}", flush=True) for i in range(B): rec_token = request_keys[i, 2].item() rec_text = self.tokenizer.decode([rec_token]) - print(f" Req {i}: token={rec_token} ('{rec_text}')", flush=True) - + print(f"[{_ts()}] Req {i}: token={rec_token} ('{rec_text}')", flush=True) + if self.tree_cache_keys.numel() > 0: - # Vectorized membership against tensor cache + # Vectorized membership: broadcast eq on [B,T,3], fuse hit+idx via max() eq = (request_keys.unsqueeze(1) == self.tree_cache_keys.unsqueeze(0)) # [B,T,3] match = torch.all(eq, dim=2) # [B,T] - cache_hits = match.any(dim=1) # [B] - ttl_hit += int(cache_hits.sum().item()) - + cache_hits, idx = match.max(dim=1) # cache_hits: [B] bool, idx: [B] first-match index. + + there_was_a_cache_miss = not cache_hits.all() + if self.config.force_jit_speculate or (self.config.jit_speculate and there_was_a_cache_miss): if self.config.verbose: - print(f"[hit_cache_and_respond] Cache hits: {cache_hits.sum().item()}/{B}", flush=True) - print(f"[hit_cache_and_respond] Cache: {self.tree_cache_keys.shape[0]} entries", flush=True) - + if self.config.force_jit_speculate: + msg = "Force JIT speculate, running JIT speculate for all" + elif self.tree_cache_keys.numel() == 0: + msg = "Cache empty, running JIT speculate for all" + else: + assert there_was_a_cache_miss + msg = "There was a cache miss, running JIT speculate for all" + print(f"[{_ts()}] [hit_cache] {msg}", flush=True) + jit_acts = self.jit_speculate( + request_keys, + num_tokens, + out_logits, + out_tokens, + temperatures, + draft_block_tables, + target_recovery_activations + ) # write into out_logits, out_tokens + if self.config.use_eagle_or_phoenix: + out_activations = jit_acts + elif self.tree_cache_keys.numel() > 0: + if self.config.verbose: + print(f"[{_ts()}] [hit_cache] Cache hits: {cache_hits.sum().item()}/{B}", flush=True) + print(f"[{_ts()}] [hit_cache] Cache: {self.tree_cache_keys.shape[0]} entries", flush=True) + # Build set of hit cache indices for marking hit_indices = set() - if cache_hits.any(): - idx = match.float().argmax(dim=1).to(torch.int64) - for i in range(B): - if cache_hits[i]: - hit_indices.add(idx[i].item()) - + for i in range(B): + if cache_hits[i]: + hit_indices.add(idx[i].item()) + # Print cache entries with hit markers for i, key in enumerate(self.tree_cache_keys): seq_id, k_idx, rec_token = key.tolist() rec_text = self.tokenizer.decode([rec_token]) hit_marker = "[HIT]" if i in hit_indices else "" - print(f" [{i}]: key=({seq_id}, {k_idx}, {rec_token}) -> value=('{rec_text}') {hit_marker}", flush=True) - - # Fill hits - if (cache_hits.any() and not self.config.jit_speculate) or (cache_hits.all() and self.config.jit_speculate): - # print(f'[hit_cache_and_respond] got all cache hits, using cached logits and tokens', flush=True) - # [B], arbitrary if no match but masked out - idx = match.float().argmax(dim=1).to(torch.int64) - sel = cache_hits - # tokens [T,K] - out_tokens[sel] = self.tree_cache_tokens[idx[sel]] - # logits [T,K+1,V] - out_logits[sel] = self.tree_cache_logits[idx[sel]] - if self.config.use_eagle: - out_activations[sel] = self.tree_cache_activations[idx[sel]] - elif self.config.jit_speculate: - # print(f'[hit_cache_and_respond] found a cache miss, running jit speculate', flush=True) - if self.config.verbose: - print(f"[hit_cache_and_respond] Running JIT speculate for cache misses", flush=True) - jit_acts = self.jit_speculate( - request_keys, - num_tokens, - out_logits, - out_tokens, - temperatures, - draft_block_tables, - target_recovery_activations - ) # write into out_logits, out_tokens - if self.config.use_eagle: - out_activations = jit_acts - elif self.config.jit_speculate: - # Cache is empty (first iteration), must JIT all - if self.config.verbose: - print(f"[hit_cache_and_respond] Cache empty, running JIT speculate for all", flush=True) - jit_acts = self.jit_speculate( - request_keys, - num_tokens, - out_logits, - out_tokens, - temperatures, - draft_block_tables, - target_recovery_activations - ) - if self.config.use_eagle: - out_activations = jit_acts - + print(f"[{_ts()}] [{i}]: key=({seq_id}, {k_idx}, {rec_token}) -> value=('{rec_text}') {hit_marker}", flush=True) + + # Fill via direct indexing (miss slots get stale cache data, but that's ok since we can + # return any tokens/logits for cache misses, as long as they are consistent with one another). + out_tokens = self.tree_cache_tokens[idx] + if self.config.communicate_logits: + out_logits = self.tree_cache_logits[idx] + if self.config.use_eagle_or_phoenix: + out_activations = self.tree_cache_activations[idx] + rec_toks = request_keys[:, 2] - + + if self.config.verbose: + print(f"[{_ts()}] [CACHE RESPONSE]", flush=True) + for i in range(B): + hit_status = "HIT" if cache_hits[i].item() == 1 else "MISS" + print(f"[{_ts()}] Seq {request_keys[i, 0].item()}: {hit_status}", flush=True) + if cache_hits[i].item() == 1 or self.config.jit_speculate: + tokens_list = out_tokens[i, :K].tolist() + tokens_text = [self.tokenizer.decode([t]) for t in tokens_list] + print(f"[{_ts()}] Tokens: {tokens_list}", flush=True) + print(f"[{_ts()}] Detokenized: {tokens_text}", flush=True) + print(f"[{_ts()}] ", flush=True) + return out_tokens, out_logits, make_glue_decode_input_ids(out_tokens, rec_toks), cache_hits, out_activations def _service_spec_request(self): """Receives a speculation request, serves it from cache, and sends results back in a single response.""" - meta = self.recv_tensor((3,), torch.int64) - B, K, F = meta.tolist() - - # Receive all request payload in one fused int64 burst (includes temperatures encoded as int64) - max_blocks = self.config.max_blocks - fused_total = (3 * B) + B + (B * max_blocks) + B # +B for temps_as_int64 - fused_req = recv_int64(self.async_pg, src=0, - total_length=fused_total, device=self.device) - off = 0 - cache_keys = fused_req[off:off + (3 * B)].view(B, 3) - off += 3 * B - seq_ids = cache_keys[:, 0] - num_tokens = fused_req[off:off + B].to(torch.int64) - off += B - draft_block_tables = fused_req[off:off + B * - max_blocks].view(B, max_blocks).to(torch.int32) - off += B * max_blocks - temps_as_int64 = fused_req[off:off + B] - off += B - assert off == fused_total - temperatures = temps_as_int64.to(torch.int32).view(torch.float32) - - target_recovery_activations = torch.zeros( - B, 3 * self.config.d_model_target, dtype=self.hf_config.torch_dtype, device=self.device - ) if self.config.use_eagle else None - - extend_counts = None - extend_eagle_acts = None - extend_token_ids = None + _prof = os.environ.get("SSD_PROFILE", "0") == "1" + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d0 = time.perf_counter() + if PROFILE_EVENTS: + _ev = [torch.cuda.Event(enable_timing=True) for _ in range(4)] + _ev[0].record() + + speculation_request = SpeculationRequest.receive( + async_pg=self.async_pg, + target_rank=self.target_rank, + device=self.device, + draft_dtype=self.hf_config.torch_dtype, + tokenizer=self.tokenizer, + verbose=self.config.verbose, + ) - if self.config.use_eagle: - dist.recv(target_recovery_activations, src=0, group=self.async_pg) + B, K, _, _, _ = speculation_request.metadata.tolist() + cache_keys, num_tokens, draft_block_tables, temperatures, target_recovery_activations = ( + speculation_request.cache_keys, + speculation_request.num_tokens, + speculation_request.block_tables, + speculation_request.temps, + speculation_request.recovery_activations, + ) - # Receive extend data for fused glue decode - act_dim = 3 * self.config.d_model_target - extend_counts = torch.zeros(B, dtype=torch.int64, device=self.device) - extend_eagle_acts = torch.zeros(B, K, act_dim, dtype=self.hf_config.torch_dtype, device=self.device) - extend_token_ids = torch.zeros(B, K, dtype=torch.int64, device=self.device) - dist.recv(extend_counts, src=0, group=self.async_pg) - dist.recv(extend_eagle_acts, src=0, group=self.async_pg) - dist.recv(extend_token_ids, src=0, group=self.async_pg) + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d1 = time.perf_counter() + if PROFILE_EVENTS: + _ev[1].record() - if self.config.verbose: - recovery_tokens_target = cache_keys[:, 2].clone() - print(f"\n{'='*80}", flush=True) - print(f"[CACHE REQUEST] Batch size: {B}, Spec depth: {K}", flush=True) - for i in range(B): - seq_id = cache_keys[i, 0].item() - keep_idx = cache_keys[i, 1].item() - rec_token_target = recovery_tokens_target[i].item() - rec_token_text = self.tokenizer.decode([rec_token_target]) - n_ext = extend_counts[i].item() - print(f" Seq {seq_id}: keep_idx={keep_idx}, recovery_token={rec_token_target} ('{rec_token_text}'), n_ext={n_ext}", flush=True) - print(f"{'='*80}\n", flush=True) - - out_tokens, out_logits, glue_decode_input_ids, cache_hits, out_activations = self.hit_cache_and_respond( + out_tokens, out_logits, glue_decode_input_ids, cache_hits, out_activations = self.hit_cache( cache_keys, B, K, num_tokens, temperatures, draft_block_tables, target_recovery_activations) - if self.config.verbose: - print(f"[CACHE RESPONSE]", flush=True) + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d2 = time.perf_counter() + if PROFILE_EVENTS: + _ev[2].record() + + if self._acceptance_rate_log_path: + # Collect per-step metrics for logging. + # cache_keys[:, 1] is last_spec_step_accepted_len - 1 from the target; + # first request has -1 (forced miss). + global ttl_hit + ttl_hit += int(cache_hits.sum().item()) for i in range(B): - hit_status = "HIT" if cache_hits[i].item() == 1 else "MISS" - print(f" Seq {cache_keys[i, 0].item()}: {hit_status}", flush=True) - if cache_hits[i].item() == 1 or self.config.jit_speculate: - tokens_list = out_tokens[i, :K].tolist() - tokens_text = [self.tokenizer.decode([t]) for t in tokens_list] - print(f" Tokens: {tokens_list}", flush=True) - print(f" Detokenized: {tokens_text}", flush=True) - print(f"", flush=True) + accept_len = cache_keys[i, 1].item() + 1 + self._acceptance_lengths.append(accept_len) + self._cache_hits.append(int(cache_hits[i].item())) + + speculation_response = SpeculationResponse( + speculations=out_tokens.reshape(-1).to(torch.int64), + cache_hits=cache_hits.reshape(-1).to(torch.int64) if self.communicate_cache_hits else None, + logits_q=out_logits[:, :K, :].contiguous() if self.communicate_logits else None, + ) + if BRIEF_LOG: + for i in range(B): + cache_hit = cache_hits[i].item() + # We pretend we are actually sending it, for clarify in debugging. + cache_hit_text = "HIT" if cache_hit == 1 else "MISS" + print(f"[{_ts()}] [SpeculationResponse.send] req[{i}]: CACHE {cache_hit_text}", flush=True) + + speculation_response.send(self.async_pg, self.target_rank, tokenizer=self.tokenizer) + + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d3 = time.perf_counter() + print(f"[PROFILE draft._service_spec_request] receive={(_d1-_d0)*1000:.2f}ms, " + f"hit_cache={(_d2-_d1)*1000:.2f}ms, " + f"send={(_d3-_d2)*1000:.2f}ms, " + f"total={(_d3-_d0)*1000:.2f}ms", + flush=True, + ) + if PROFILE_EVENTS: + _ev[3].record() + _ev[3].synchronize() + print(f"[PROFILE_EVENTS draft._service_spec_request] receive={_ev[0].elapsed_time(_ev[1]):.2f}ms, " + f"hit_cache={_ev[1].elapsed_time(_ev[2]):.2f}ms, " + f"send={_ev[2].elapsed_time(_ev[3]):.2f}ms, " + f"total={_ev[0].elapsed_time(_ev[3]):.2f}ms", + flush=True, + ) - fused_response = torch.cat([cache_hits.reshape(-1), out_tokens.reshape(-1).to(torch.int64)]) - dist.send(fused_response, dst=0, group=self.async_pg) - dist.send(out_logits[:, :K, :].contiguous(), dst=0, group=self.async_pg) + if NCCL_LOG: + sep = '=' * 80 + print(f"[{_ts()}] \n{sep}", flush=True) + for i in range(B): + spec_ids = out_tokens[i, :K].tolist() + spec_text = [self.tokenizer.decode([t]) for t in spec_ids] + print(f"[{_ts()}] req[{i}]: speculations={spec_ids}", flush=True) + print(f"[{_ts()}] decoded={spec_text}", flush=True) + print(f"[{_ts()}] {sep}\n", flush=True) + + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d3 = time.perf_counter() + print(f"[PROFILE draft._service_spec_request] receive={(_d1-_d0)*1000:.2f}ms, " + f"hit_cache={(_d2-_d1)*1000:.2f}ms, " + f"send={(_d3-_d2)*1000:.2f}ms, " + f"total={(_d3-_d0)*1000:.2f}ms", + flush=True, + ) + if PROFILE_EVENTS: + _ev[3].record() + _ev[3].synchronize() + print(f"[PROFILE_EVENTS draft._service_spec_request] receive={_ev[0].elapsed_time(_ev[1]):.2f}ms, " + f"hit_cache={_ev[1].elapsed_time(_ev[2]):.2f}ms, " + f"send={_ev[2].elapsed_time(_ev[3]):.2f}ms, " + f"total={_ev[0].elapsed_time(_ev[3]):.2f}ms", + flush=True, + ) partial_tree_decode_args = { "num_tokens": num_tokens, - "seq_ids": seq_ids, + "seq_ids": speculation_request.cache_keys[:, 0], "temperatures": temperatures, "dbt": draft_block_tables, "cache_hits": cache_hits, "returned_tokens": out_tokens, "target_recovery_activations": target_recovery_activations, "previous_activations": out_activations, - "extend_counts": extend_counts, - "extend_eagle_acts": extend_eagle_acts, - "extend_token_ids": extend_token_ids, + "extend_counts": speculation_request.extend_counts, + "extend_eagle_acts": speculation_request.extend_activations, + "extend_token_ids": speculation_request.extend_token_ids, } - return glue_decode_input_ids, partial_tree_decode_args def prepare_prefill_ctxt( @@ -415,8 +488,7 @@ def prepare_prefill_ctxt( def prepare_glue_decode_ctxt(self, num_tokens, input_ids, dbt, B): K = self.config.speculate_k - pos_offset = -1 if self.config.use_eagle else 0 - positions_start = (num_tokens - 1 + pos_offset).unsqueeze(-1) + positions_start = (num_tokens - 1).unsqueeze(-1) positions_grid = positions_start + self._arange_kp1 # Calculate block indices and offsets for ALL positions @@ -434,7 +506,7 @@ def prepare_glue_decode_ctxt(self, num_tokens, input_ids, dbt, B): positions_flat = positions_grid.reshape(-1).to(torch.int64) slot_map_flat = slot_map_grid.reshape(-1).to(torch.int32) - context_lens = (num_tokens + pos_offset + K).to(torch.int32) + context_lens = (num_tokens + K).to(torch.int32) seqlen_q = torch.full((B,), K + 1, dtype=torch.int32, device=self.device) cu_seqlens_q = torch.zeros(B + 1, dtype=torch.int32, device=self.device) cu_seqlens_q[1:] = torch.cumsum(seqlen_q, dim=0) @@ -507,9 +579,8 @@ def _construct_tree_decode_args(self, partial_tree_decode_args, rec_flat, dbt): seq_ids = partial_tree_decode_args["seq_ids"] seq_ids_expanded = seq_ids[b_flat] - pos_offset = -1 if self.config.use_eagle else 0 - positions = (partial_tree_decode_args["num_tokens"][b_flat] - 1 + pos_offset) + (K + 1) + fkp1_flat - rope_positions = (partial_tree_decode_args["num_tokens"][b_flat] - 1 + pos_offset) + j_idx_flat + 1 + positions = (partial_tree_decode_args["num_tokens"][b_flat] - 1) + (K + 1) + fkp1_flat + rope_positions = (partial_tree_decode_args["num_tokens"][b_flat] - 1) + j_idx_flat + 1 temperatures = partial_tree_decode_args["temperatures"][b_flat] tree_decode_args = { @@ -529,14 +600,21 @@ def _construct_tree_decode_args(self, partial_tree_decode_args, rec_flat, dbt): def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): if self.config.verbose: - print(f'about to build tree batch') + print(f'[{_ts()}] about to build tree batch') K = self.config.speculate_k dbt = partial_tree_decode_args["dbt"] cache_hits = partial_tree_decode_args["cache_hits"] cache_hits_list = cache_hits.tolist() - pos_offset = -1 if self.config.use_eagle else 0 - if self.config.use_eagle: + _prof = os.environ.get("SSD_PROFILE", "0") == "1" + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d0 = time.perf_counter() + if PROFILE_EVENTS: + _bev = [torch.cuda.Event(enable_timing=True) for _ in range(7)] + _bev[0].record() + + if self.config.use_eagle_or_phoenix: B = partial_tree_decode_args["num_tokens"].shape[0] extend_counts = partial_tree_decode_args.get("extend_counts") if extend_counts is None: @@ -545,65 +623,129 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): extend_token_ids_batch = partial_tree_decode_args.get("extend_token_ids") target_acts = partial_tree_decode_args["target_recovery_activations"] prev_acts = partial_tree_decode_args["previous_activations"] - hidden_size = self.hf_config.hidden_size - fc_dtype = self.model.fc.weight.dtype + hidden_size = self.hidden_states_dim + fc_dtype = self.model.fc.weight.dtype if self.config.use_eagle else self.hf_config.torch_dtype gd_view = glue_decode_input_ids.view(B, K + 1) rec_tok_ids = gd_view[:, 0] spec_tok_ids = gd_view[:, 1:] - # Variable per-seq lengths: n_ext[b] + K + 1 - seqlens_q = (extend_counts + K + 1).to(torch.int32) - cu_seqlens_q = torch.zeros(B + 1, dtype=torch.int32, device=self.device) - cu_seqlens_q[1:] = torch.cumsum(seqlens_q, 0) - total_real = int(cu_seqlens_q[-1].item()) - - # Build packed fused_ids and fused_hs (no padding, no for loops) - fused_ids = torch.zeros(total_real, dtype=torch.int64, device=self.device) - fused_hs = torch.zeros(total_real, hidden_size, dtype=self.hf_config.torch_dtype, device=self.device) - - # Per-token batch index and local offset - batch_idx = torch.repeat_interleave(torch.arange(B, device=self.device), seqlens_q) - local_off = torch.arange(total_real, device=self.device) - cu_seqlens_q[:-1].long().repeat_interleave(seqlens_q) - n_ext = extend_counts.long() # [B] - n_ext_per_tok = n_ext[batch_idx] # [total_real] - - # Classify each token: extend (local < n_ext), rec (local == n_ext), spec (local > n_ext) - is_extend = local_off < n_ext_per_tok - is_rec = local_off == n_ext_per_tok - is_spec = local_off > n_ext_per_tok - - # Extend + rec tokens: batch fc into single call - is_target_conditioned = is_extend | is_rec - tc_b = batch_idx[is_target_conditioned] - tc_local = local_off[is_target_conditioned] - tc_n_ext = n_ext_per_tok[is_target_conditioned] - - # Gather target acts: extend uses extend_eagle_acts_batch[b,j], rec uses target_acts[b] - tc_is_ext = tc_local < tc_n_ext - tc_acts = torch.empty(tc_b.size(0), target_acts.size(1), dtype=fc_dtype, device=self.device) - if tc_is_ext.any() and extend_eagle_acts_batch is not None: - ext_b = tc_b[tc_is_ext] - ext_j = tc_local[tc_is_ext] - tc_acts[tc_is_ext] = extend_eagle_acts_batch[ext_b, ext_j].to(fc_dtype) - fused_ids[is_extend] = extend_token_ids_batch[ext_b, ext_j] - tc_acts[~tc_is_ext] = target_acts[tc_b[~tc_is_ext]].to(fc_dtype) - fused_ids[is_rec] = rec_tok_ids[batch_idx[is_rec]] - - # Single batched fc call - fused_hs[is_target_conditioned] = self.model.fc(tc_acts) - - # Spec tokens: ids from spec_tok_ids, hs from prev_acts (self-conditioned, no fc) - spec_j = local_off[is_spec] - n_ext_per_tok[is_spec] - 1 # 0..K-1 - fused_ids[is_spec] = spec_tok_ids[batch_idx[is_spec], spec_j] - fused_hs[is_spec] = prev_acts[batch_idx[is_spec], spec_j] - - glue_decode_ctxt = self.prepare_glue_decode_ctxt_eagle( - num_tokens=partial_tree_decode_args["num_tokens"], - fused_ids=fused_ids, fused_hs=fused_hs, - extend_counts=extend_counts, seqlens_q=seqlens_q, - cu_seqlens_q=cu_seqlens_q, dbt=dbt, B=B, - ) + # Check if all extend counts are the same (common case) for vectorized fast path + n_ext_0 = int(extend_counts[0].item()) + uniform_extends = (B == 1) or (extend_counts == n_ext_0).all().item() + + if uniform_extends: + # ── Fast path: regular layout (all seqs have same length) ── + # Layout per seq: [ext_0, ..., ext_{n-1}, rec, spec_0, ..., spec_{K-1}] + sl = n_ext_0 + K + 1 # uniform sequence length + total_real = B * sl + fused_ids = torch.empty(total_real, dtype=torch.int64, device=self.device) + fused_hs = torch.empty(total_real, hidden_size, dtype=self.hf_config.torch_dtype, device=self.device) + fid_v = fused_ids.view(B, sl) + fhs_v = fused_hs.view(B, sl, hidden_size) + + # Extend tokens: positions 0..n_ext-1 (need fc / target acts) + if n_ext_0 > 0 and extend_eagle_acts_batch is not None: + fid_v[:, :n_ext_0] = extend_token_ids_batch[:, :n_ext_0] + ext_fc_in = extend_eagle_acts_batch[:, :n_ext_0].reshape(B * n_ext_0, -1).to(fc_dtype) + else: + ext_fc_in = None + + # Recovery token: position n_ext_0 + fid_v[:, n_ext_0] = rec_tok_ids + rec_fc_in = target_acts.to(fc_dtype) + + # Single batched fc call for all extend + rec tokens + fc_in = torch.cat([ext_fc_in, rec_fc_in], dim=0) if ext_fc_in is not None else rec_fc_in + if self.config.use_eagle: + fc_out = self.model.fc(fc_in) + else: + fc_out = fc_in # Phoenix: no fc, use activations directly + if n_ext_0 > 0: + fhs_v[:, :n_ext_0, :] = fc_out[:B * n_ext_0].view(B, n_ext_0, hidden_size) + fhs_v[:, n_ext_0, :] = fc_out[B * n_ext_0:] + else: + fhs_v[:, 0, :] = fc_out + + # Spec tokens: positions n_ext_0+1..sl-1 (no fc needed) + fid_v[:, n_ext_0 + 1:] = spec_tok_ids + fhs_v[:, n_ext_0 + 1:, :] = prev_acts + + # cu_seqlens_q: regular spacing + cu_seqlens_q = (torch.arange(B + 1, device=self.device, dtype=torch.int32) * sl) + seqlens_q = torch.full((B,), sl, device=self.device, dtype=torch.int32) + + # Positions and slot mapping via arange arithmetic (no repeat_interleave) + tok_idx = torch.arange(total_real, device=self.device, dtype=torch.int64) + batch_idx_fast = tok_idx // sl + local_off_fast = tok_idx % sl + base_pos = (partial_tree_decode_args["num_tokens"] - 2 - n_ext_0).long() + positions = base_pos[batch_idx_fast] + local_off_fast + context_lens = (partial_tree_decode_args["num_tokens"] - 1 + K).to(torch.int32) + block_idx = (positions // self.block_size).clamp(0, dbt.shape[1] - 1).to(torch.int64) + block_off = (positions % self.block_size).to(torch.int32) + blk_ids = dbt[batch_idx_fast, block_idx] + slot_map = (blk_ids * self.block_size + block_off).to(torch.int32) + + glue_decode_ctxt = { + "input_ids": fused_ids, + "positions": positions, + "slot_map": slot_map, + "hidden_states": fused_hs, + "cu_seqlens_q": cu_seqlens_q, + "max_seqlen_q": sl, + "context_lens": context_lens, + "block_tables": dbt, + } + else: + # ── Fallback: variable-length layout (repeat_interleave + boolean masks) ── + seqlens_q = (extend_counts + K + 1).to(torch.int32) + cu_seqlens_q = torch.zeros(B + 1, dtype=torch.int32, device=self.device) + cu_seqlens_q[1:] = torch.cumsum(seqlens_q, 0) + total_real = int(cu_seqlens_q[-1].item()) + + fused_ids = torch.empty(total_real, dtype=torch.int64, device=self.device) + fused_hs = torch.empty(total_real, hidden_size, dtype=self.hf_config.torch_dtype, device=self.device) + + batch_idx = torch.repeat_interleave(torch.arange(B, device=self.device), seqlens_q) + local_off = torch.arange(total_real, device=self.device) - cu_seqlens_q[:-1].long().repeat_interleave(seqlens_q) + n_ext = extend_counts.long() + n_ext_per_tok = n_ext[batch_idx] + + is_extend = local_off < n_ext_per_tok + is_rec = local_off == n_ext_per_tok + is_spec = local_off > n_ext_per_tok + + is_target_conditioned = is_extend | is_rec + tc_b = batch_idx[is_target_conditioned] + tc_local = local_off[is_target_conditioned] + tc_n_ext = n_ext_per_tok[is_target_conditioned] + + tc_is_ext = tc_local < tc_n_ext + tc_acts = torch.empty(tc_b.size(0), target_acts.size(1), dtype=fc_dtype, device=self.device) + if tc_is_ext.any() and extend_eagle_acts_batch is not None: + ext_b = tc_b[tc_is_ext] + ext_j = tc_local[tc_is_ext] + tc_acts[tc_is_ext] = extend_eagle_acts_batch[ext_b, ext_j].to(fc_dtype) + fused_ids[is_extend] = extend_token_ids_batch[ext_b, ext_j] + tc_acts[~tc_is_ext] = target_acts[tc_b[~tc_is_ext]].to(fc_dtype) + fused_ids[is_rec] = rec_tok_ids[batch_idx[is_rec]] + + if self.config.use_eagle: + fused_hs[is_target_conditioned] = self.model.fc(tc_acts) + elif self.config.use_phoenix: + fused_hs[is_target_conditioned] = tc_acts + + spec_j = local_off[is_spec] - n_ext_per_tok[is_spec] - 1 + fused_ids[is_spec] = spec_tok_ids[batch_idx[is_spec], spec_j] + fused_hs[is_spec] = prev_acts[batch_idx[is_spec], spec_j] + + glue_decode_ctxt = self.prepare_glue_decode_ctxt_eagle( + num_tokens=partial_tree_decode_args["num_tokens"], + fused_ids=fused_ids, fused_hs=fused_hs, + extend_counts=extend_counts, seqlens_q=seqlens_q, + cu_seqlens_q=cu_seqlens_q, dbt=dbt, B=B, + ) else: # Non-EAGLE: K+1 per seq, uses verify CG path B = glue_decode_input_ids.shape[0] // (K + 1) @@ -614,6 +756,12 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): dbt=dbt, B=B, ) + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d1 = time.perf_counter() + if PROFILE_EVENTS: + _bev[1].record() + # Pre-compute tree decode args (overlap CPU with GPU) _pre_b_flat = torch.arange(B, device=self.device, dtype=torch.int64)[:, None].expand(B, self.config.MQ_LEN).flatten() _pre_fkp1_flat = self._arange_mq.repeat(B) @@ -621,8 +769,8 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): N_pre = _pre_b_flat.shape[0] _pre_metadata_ints = (B, K, self.config.async_fan_out, N_pre) _pre_seq_ids_expanded = partial_tree_decode_args["seq_ids"][_pre_b_flat] - _pre_positions = (partial_tree_decode_args["num_tokens"][_pre_b_flat] - 1 + pos_offset) + (K + 1) + _pre_fkp1_flat - _pre_rope_positions = (partial_tree_decode_args["num_tokens"][_pre_b_flat] - 1 + pos_offset) + _pre_j_idx_flat + 1 + _pre_positions = (partial_tree_decode_args["num_tokens"][_pre_b_flat] - 1) + (K + 1) + _pre_fkp1_flat + _pre_rope_positions = (partial_tree_decode_args["num_tokens"][_pre_b_flat] - 1) + _pre_j_idx_flat + 1 _pre_temperatures = partial_tree_decode_args["temperatures"][_pre_b_flat] # --- Run glue decode forward --- @@ -635,8 +783,14 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): block_tables=glue_decode_ctxt["block_tables"], ) + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d2 = time.perf_counter() + if PROFILE_EVENTS: + _bev[2].record() + glue_prenorm = None - if self.config.use_eagle: + if self.config.use_eagle_or_phoenix: fused_hs_flat = glue_decode_ctxt["hidden_states"] glue_decode_logits_flat, glue_prenorm = self.run_model( glue_decode_ctxt["input_ids"], glue_decode_ctxt["positions"], @@ -646,10 +800,28 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): glue_decode_ctxt["input_ids"], glue_decode_ctxt["positions"], is_prefill=False, last_only=False) + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d3 = time.perf_counter() + if PROFILE_EVENTS: + _bev[3].record() + + if self.config.verbose: + print(f"[{_ts()}] [GLUE DECODE] logits shape={glue_decode_logits_flat.shape}, " + f"max={glue_decode_logits_flat.max().item():.4f}, " + f"min={glue_decode_logits_flat.min().item():.4f}, " + f"mean={glue_decode_logits_flat.mean().item():.6f}", flush=True) + reset_context() + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d4 = time.perf_counter() + if PROFILE_EVENTS: + _bev[4].record() + # --- Extract K+1 logits/prenorms at rec+spec positions --- - if self.config.use_eagle: + if self.config.use_eagle_or_phoenix: # Packed layout: rec at cu_seqlens_q[b] + n_ext[b], spec follows cu_q = glue_decode_ctxt["cu_seqlens_q"] rec_offsets = cu_q[:-1].long() + extend_counts.long() # [B] @@ -666,6 +838,7 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): # --- Build tree hidden states from K+1 prenorms --- tree_hidden_states = None if glue_prenorm is not None: + assert self.config.use_eagle_or_phoenix, "ERROR in _build_tree_batch: use_eagle_or_phoenix must be True when glue_prenorm is not None." # Vectorized: for each (b, depth), repeat prenorm by fan_out[depth] # fan_out_t[depth] for hits, fan_out_t_miss[depth] for misses fan_hit = self.config.fan_out_t # [K+1] @@ -677,16 +850,30 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): fan_miss.unsqueeze(0).expand(B, K + 1), ) # [B, K+1] reps_flat = per_batch_fan.reshape(-1) # [B*(K+1)] - prenorms_flat = glue_prenorm_kp1.reshape(B * (K + 1), -1) # [B*(K+1), d] - tree_hidden_states = torch.repeat_interleave(prenorms_flat, reps_flat, dim=0) + + if self.config.use_eagle: + prenorms_flat = glue_prenorm_kp1.reshape(B * (K + 1), -1) # [B*(K+1), d] + tree_hidden_states = torch.repeat_interleave(prenorms_flat, reps_flat, dim=0) + else: + assert self.config.use_phoenix + # Phoenix conditions on target activations, not prenorms + target_acts_expanded = target_acts.unsqueeze(1).expand(B, K + 1, -1) # [B, K+1, target_dim] + acts_flat = target_acts_expanded.reshape(B * (K + 1), -1) # [B*(K+1), target_dim] + tree_hidden_states = torch.repeat_interleave(acts_flat, reps_flat, dim=0) # --- Fork tokens from K+1 logits --- # Need [B, K+1] input_ids for forking (rec + spec tokens) - if self.config.use_eagle: + if self.config.use_eagle_or_phoenix: gd_for_fork = gd_view # [B, K+1] already computed above else: gd_for_fork = glue_decode_input_ids.reshape(B, K + 1) + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d5 = time.perf_counter() + if PROFILE_EVENTS: + _bev[5].record() + forked_rec_tokens = get_forked_recovery_tokens_from_logits( self.config, glue_decode_logits, @@ -695,6 +882,28 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): tokenizer=self.tokenizer, ).view(-1) + if _prof or PROFILE_DRAFT: + torch.cuda.synchronize() + _d6 = time.perf_counter() + print(f"[PROFILE draft._build_tree_batch] prepare_glue_decode_ctxt={(_d1-_d0)*1000:.2f}ms " + f"set_context={(_d2-_d1)*1000:.2f}ms " + f"run_model={(_d3-_d2)*1000:.2f}ms " + f"reset_context={(_d4-_d3)*1000:.2f}ms " + f"prepare_get_forked_recovery_tokens={(_d5-_d4)*1000:.2f}ms " + f"get_forked_recovery_tokens={(_d6-_d5)*1000:.2f}ms, total={(_d6-_d0)*1000:.2f}ms", + flush=True, + ) + if PROFILE_EVENTS: + _bev[6].record() + _bev[6].synchronize() + print(f"[PROFILE_EVENTS draft._build_tree_batch] prepare_glue_decode_ctxt={_bev[0].elapsed_time(_bev[1]):.2f}ms " + f"set_context={_bev[1].elapsed_time(_bev[2]):.2f}ms " + f"run_model={_bev[2].elapsed_time(_bev[3]):.2f}ms " + f"reset_context={_bev[3].elapsed_time(_bev[4]):.2f}ms " + f"prepare_get_forked_recovery_tokens={_bev[4].elapsed_time(_bev[5]):.2f}ms " + f"get_forked_recovery_tokens={_bev[5].elapsed_time(_bev[6]):.2f}ms, total={_bev[0].elapsed_time(_bev[6]):.2f}ms", + flush=True, + ) tree_decode_args = { "metadata_ints": _pre_metadata_ints, "input_ids": forked_rec_tokens, @@ -706,6 +915,7 @@ def _build_tree_batch(self, partial_tree_decode_args, glue_decode_input_ids): "seq_ids_expanded": _pre_seq_ids_expanded, "cache_hits": cache_hits, "cache_hits_list": cache_hits_list, + "target_recovery_activations": partial_tree_decode_args["target_recovery_activations"], } tree_decode_args["hidden_states"] = tree_hidden_states return tree_decode_args @@ -730,7 +940,7 @@ def _compute_step_positions_and_slot_maps(self, initial_positions, initial_rope_ return step_positions, step_rope_positions, step_context_lens, step_slot_maps - def _decode_tree_step(self, depth, current_input_ids, step_rope_positions, step_slot_maps, step_context_lens, dbt, payload, spec_tokens, spec_logits, spec_activations): + def _decode_tree_step(self, depth, current_input_ids, step_rope_positions, step_slot_maps, step_context_lens, dbt, payload, spec_tokens, spec_logits, spec_activations, target_recovery_activations): """Execute a single tree decode step.""" # Use precomputed values for this step set_context( @@ -741,11 +951,15 @@ def _decode_tree_step(self, depth, current_input_ids, step_rope_positions, step_ ) hidden_states = payload.get("hidden_states") - if self.config.use_eagle: + if self.config.use_eagle_or_phoenix: logits, prenorm = self.run_model(current_input_ids, step_rope_positions[depth], is_prefill=False, last_only=False, tree_decode_step=depth, cache_hits=payload["cache_hits"], hidden_states=hidden_states) assert spec_activations is not None - spec_activations[:, depth] = prenorm - payload["hidden_states"] = prenorm + if self.config.use_eagle: + spec_activations[:, depth] = prenorm + payload["hidden_states"] = prenorm + else: + spec_activations[:, depth] = target_recovery_activations + payload["hidden_states"] = target_recovery_activations else: logits = self.run_model(current_input_ids, step_rope_positions[depth], is_prefill=False, last_only=False, tree_decode_step=depth, cache_hits=payload["cache_hits"]) @@ -767,14 +981,14 @@ def _decode_tree(self, payload): B, K, F, N = payload["metadata_ints"] V = self.hf_config.vocab_size # Draft returns full target vocab size after d2t expansion - spec_tokens = torch.zeros( - (N, K), dtype=torch.int64, device=self.device) - spec_logits = torch.zeros( - (N, K, V), dtype=self.hf_config.torch_dtype, device=self.device) - spec_activations = torch.zeros( - (N, K, self.hf_config.hidden_size), + spec_tokens = torch.empty( + N, K, dtype=torch.int64, device=self.device) + spec_logits = torch.empty( + N, K, V, dtype=self.hf_config.torch_dtype, device=self.device) + spec_activations = torch.empty( + N, K, self.hidden_states_dim, dtype=self.hf_config.torch_dtype, device=self.device - ) if self.config.use_eagle else None + ) if self.config.use_eagle_or_phoenix else None # Precompute all positions, context_lens, and slot_maps for all K steps # PERFORMANCE: no .clone() needed — these are not modified in-place @@ -782,6 +996,7 @@ def _decode_tree(self, payload): initial_rope_positions = payload["rope_positions"] # [N] current_input_ids = payload["input_ids"] # [N], the forked tokens dbt = payload["block_tables"] # [B, M] - constant across steps + target_recovery_activations = payload["target_recovery_activations"] # Use compiled function for batch-size independent computations _, step_rope_positions, step_context_lens, step_slot_maps = self._compute_step_positions_and_slot_maps( @@ -791,23 +1006,33 @@ def _decode_tree(self, payload): _prof = os.environ.get("SSD_PROFILE", "0") == "1" payload["_all_greedy"] = bool((payload["temps"] == 0).all()) _step_times = [] + if PROFILE_EVENTS: + _tev = [torch.cuda.Event(enable_timing=True) for _ in range(K + 1)] + _tev[0].record() for depth in range(K): if _prof or PROFILE_DRAFT: torch.cuda.synchronize() _st = time.perf_counter() current_input_ids = self._decode_tree_step( depth, current_input_ids, step_rope_positions, step_slot_maps, - step_context_lens, dbt, payload, spec_tokens, spec_logits, spec_activations + step_context_lens, dbt, payload, spec_tokens, spec_logits, spec_activations, target_recovery_activations, ) if _prof or PROFILE_DRAFT: torch.cuda.synchronize() _et = time.perf_counter() _step_times.append((_et - _st) * 1000) if _prof: - print(f"[PROFILE draft] tree_step[{depth}]={_step_times[-1]:.2f}ms", flush=True) + print(f"[{_ts()}] [PROFILE draft] tree_step[{depth}]={_step_times[-1]:.2f}ms", flush=True) + if PROFILE_EVENTS: + _tev[depth + 1].record() if PROFILE_DRAFT and _step_times: avg = sum(_step_times) / len(_step_times) - print(f"[PROFILE draft] tree_decode: K={K} steps={' '.join(f'{t:.2f}' for t in _step_times)} avg={avg:.2f}ms total={sum(_step_times):.2f}ms", flush=True) + print(f"[{_ts()}] [PROFILE draft] tree_decode: K={K} steps={' '.join(f'{t:.2f}' for t in _step_times)} avg={avg:.2f}ms total={sum(_step_times):.2f}ms", flush=True) + if PROFILE_EVENTS and K > 0: + _tev[K].synchronize() + _esteps = [f'{_tev[i].elapsed_time(_tev[i+1]):.2f}' for i in range(K)] + _etotal = _tev[0].elapsed_time(_tev[K]) + print(f"[PROFILE_EVENTS draft] tree_decode: K={K} steps={' '.join(_esteps)} total={_etotal:.2f}ms", flush=True) return spec_tokens, spec_logits, spec_activations @@ -832,8 +1057,8 @@ def _populate_tree_cache(self, payload, tokens, logits, cache_hits, activations= # Print cache population details if self.config.verbose: N = keys.shape[0] - print(f"\n{'='*80}", flush=True) - print(f"[CACHE POPULATED] {N} entries", flush=True) + print(f"[{_ts()}] \n{'='*80}", flush=True) + print(f"[{_ts()}] [CACHE POPULATED] {N} entries", flush=True) # Show sample entries per sequence for seq_id in keys[:, 0].unique()[:1]: # Just show first sequence @@ -841,7 +1066,7 @@ def _populate_tree_cache(self, payload, tokens, logits, cache_hits, activations= seq_entries = keys[seq_mask] seq_tokens = tokens[seq_mask] - print(f" Seq {seq_id.item()}: {seq_mask.sum().item()} entries", flush=True) + print(f"[{_ts()}] Seq {seq_id.item()}: {seq_mask.sum().item()} entries", flush=True) # Show first 2 unique recovery tokens for rec_token in seq_entries[:, 2].unique()[:2]: @@ -853,39 +1078,67 @@ def _populate_tree_cache(self, payload, tokens, logits, cache_hits, activations= rec_text = self.tokenizer.decode([rec_token.item()]) spec_tokens = seq_tokens[idx].tolist() spec_text = [self.tokenizer.decode([t]) for t in spec_tokens] - print(f" k={k_idx}, rec={rec_token.item()} ('{rec_text}') -> {spec_text}", flush=True) - print(f"{'='*80}\n", flush=True) + print(f"[{_ts()}] k={k_idx}, rec={rec_token.item()} ('{rec_text}') -> {spec_text}", flush=True) + print(f"[{_ts()}] {'='*80}\n", flush=True) + + def _start_interrupt_listener(self): + """Initiates a non-blocking receive for the next command to allow interruption.""" + cmd_tensor = torch.empty(1, dtype=torch.int64, device=self.device) + work_handle = dist.irecv(cmd_tensor, src=0, group=self.async_pg) + # return both the handle and its tensor buffer + return work_handle, cmd_tensor + # new one, with true asynchrony def draft_loop(self): """ Runs the asynchronous draft model loop. Handles three commands: - 1 = prefill, 0 = spec request, 2 = exit. + 1 = prefill, 0 = spec request, 2 = exit, 3 = branch prefetch (only after a spec request). """ assert self.draft_async, "draft_loop only runs in async-draft mode" + try: + self._draft_loop_inner() + except (torch.distributed.DistBackendError, RuntimeError) as e: + err = str(e) + if "closed" in err or "Connection" in err or "NCCL" in err: + print(f"[{_ts()}] [draft] Target disconnected, shutting down gracefully.", flush=True) + self.exit() + return + print(f"[{_ts()}] [draft] Error in draft_loop: {e}", flush=True) + raise e + except Exception as e: + print(f"[{_ts()}] [draft] Error in draft_loop: {e}", flush=True) + raise e + + def _draft_loop_inner(self): while True: # 1) Wait for the next command (may be PREFILL, SPEC_REQUEST, or EXIT) - cmd = self.recv_cmd() + cmd, _ = self._wait_for_cmd() # PREFILL: run the draft prefill and then loop back - if cmd == 1: + if cmd == COMMAND.PREFILL: self.draft_async_prefill() continue # SPECULATE request: serve out-of-cache or random speculations - elif cmd == 0: + elif cmd == COMMAND.SPECULATION: _ds0 = time.perf_counter() _prof = os.environ.get("SSD_PROFILE", "0") == "1" if _prof or PROFILE_DRAFT: torch.cuda.synchronize() _d0 = time.perf_counter() + if PROFILE_EVENTS: + _lev = [torch.cuda.Event(enable_timing=True) for _ in range(5)] + _lev[0].record() glue_decode_input_ids, partial_tree_decode_args = self._service_spec_request() if _prof or PROFILE_DRAFT: torch.cuda.synchronize() _d1 = time.perf_counter() + if PROFILE_EVENTS: + _lev[1].record() self._reset_tree_cache_tensors() @@ -894,6 +1147,8 @@ def draft_loop(self): if _prof or PROFILE_DRAFT: torch.cuda.synchronize() _d2 = time.perf_counter() + if PROFILE_EVENTS: + _lev[2].record() # Decode the branch tree tokens, logits, activations = self._decode_tree(tree_decode_args) @@ -901,6 +1156,8 @@ def draft_loop(self): if _prof or PROFILE_DRAFT: torch.cuda.synchronize() _d3 = time.perf_counter() + if PROFILE_EVENTS: + _lev[3].record() # Populate the local cache so future spec-requests can hit self._populate_tree_cache(tree_decode_args, tokens, logits, tree_decode_args["cache_hits"], activations) @@ -909,7 +1166,11 @@ def draft_loop(self): if _prof or PROFILE_DRAFT: torch.cuda.synchronize() _d4 = time.perf_counter() - print(f"[PROFILE draft] service={(_d1-_d0)*1000:.2f}ms build_tree={(_d2-_d1)*1000:.2f}ms decode_tree={(_d3-_d2)*1000:.2f}ms populate={(_d4-_d3)*1000:.2f}ms total={(_d4-_d0)*1000:.2f}ms", flush=True) + print(f"[{_ts()}] [PROFILE draft] service={(_d1-_d0)*1000:.2f}ms build_tree={(_d2-_d1)*1000:.2f}ms decode_tree={(_d3-_d2)*1000:.2f}ms populate={(_d4-_d3)*1000:.2f}ms total={(_d4-_d0)*1000:.2f}ms", flush=True) + if PROFILE_EVENTS: + _lev[4].record() + _lev[4].synchronize() + print(f"[PROFILE_EVENTS draft] service={_lev[0].elapsed_time(_lev[1]):.2f}ms build_tree={_lev[1].elapsed_time(_lev[2]):.2f}ms decode_tree={_lev[2].elapsed_time(_lev[3]):.2f}ms populate={_lev[3].elapsed_time(_lev[4]):.2f}ms total={_lev[0].elapsed_time(_lev[4]):.2f}ms", flush=True) if PROFILE_DRAFT: flush_draft_profile() @@ -917,10 +1178,24 @@ def draft_loop(self): continue # EXIT: clean up and break out of the loop - elif cmd == 2: + elif cmd == COMMAND.DRAFT_EXIT: if self._draft_step_times: avg_ms = sum(self._draft_step_times) * 1000 / len(self._draft_step_times) - print(f"[metrics] Avg draft step time (ms): {avg_ms:.2f}", flush=True) + print(f"[{_ts()}] [metrics] Avg draft step time (ms): {avg_ms:.2f}", flush=True) + if self._acceptance_rate_log_path and self._acceptance_lengths: + import json + avg_acc = sum(self._acceptance_lengths) / len(self._acceptance_lengths) + hit_rate = sum(self._cache_hits) / len(self._cache_hits) if self._cache_hits else 0 + print(f"[{_ts()}] [metrics] Avg acceptance length: {avg_acc:.2f} ({len(self._acceptance_lengths)} steps)", flush=True) + print(f"[{_ts()}] [metrics] Cache hit rate: {hit_rate:.2%} ({sum(self._cache_hits)}/{len(self._cache_hits)})", flush=True) + print(f"[{_ts()}] [metrics] All acceptance lengths: {self._acceptance_lengths}", flush=True) + print(f"[{_ts()}] [metrics] All cache hits: {self._cache_hits}", flush=True) + print(f"[{_ts()}] [metrics] Logging acceptance lengths and cache hits to: {self._acceptance_rate_log_path}", flush=True) + with open(self._acceptance_rate_log_path, "w") as f: + json.dump({ + "acceptance_lengths": self._acceptance_lengths, + "cache_hits": self._cache_hits, + }, f) self.exit() break diff --git a/ssd/engine/helpers/cudagraph_helpers.py b/ssd/engine/helpers/cudagraph_helpers.py index e347b3926..60d322491 100644 --- a/ssd/engine/helpers/cudagraph_helpers.py +++ b/ssd/engine/helpers/cudagraph_helpers.py @@ -1,9 +1,8 @@ import os +import math import torch -import numpy as np -from typing import List + from ssd.utils.context import set_context, get_context, reset_context -from ssd.engine.helpers.mask_helpers import get_custom_mask from time import perf_counter @@ -78,7 +77,7 @@ def run_verify_cudagraph(model_runner, input_ids, positions, last_only, graph_va torch.cuda.synchronize() _t2 = perf_counter() has_eagle = "eagle_acts" in graph_vars - print(f"[PROFILE verify_cg] replay={(_t1-_t0)*1000:.2f}ms logits={(_t2-_t1)*1000:.2f}ms eagle={has_eagle} bs={orig_bs} rank={model_runner.rank}", flush=True) + print(f"[cuda_graph_helpers.run_verify_cudagraph][PROFILE verify_cg] replay={(_t1-_t0)*1000:.2f}ms logits={(_t2-_t1)*1000:.2f}ms eagle={has_eagle} bs={orig_bs} rank={model_runner.rank}", flush=True) # For eagle target, also return eagle_acts if "eagle_acts" in graph_vars: @@ -122,9 +121,6 @@ def run_decode_cudagraph(model_runner, input_ids, positions, last_only, graph_va return logits -cache = {} - -_plan_event = None # Lazy-init CUDA event for plan() sync PROFILE = os.environ.get("SSD_PROFILE", "0") == "1" PROFILE_DRAFT = os.environ.get("SSD_PROFILE_DRAFT", "0") == "1" _draft_events = [] # [(step, label, start_event, end_event), ...] @@ -144,35 +140,28 @@ def flush_draft_profile(): detail = " ".join(f"{l}={t:.2f}" for l, t in by_step[step]) parts.append(f"s{step}={step_total:.2f}({detail})") total += step_total - print(f"[PROFILE draft_detail] K={len(by_step)} total={total:.2f}ms avg_step={total/len(by_step):.2f}ms | {' '.join(parts)}", flush=True) + print(f"[cuda_graph_helpers.flush_draft_profile][PROFILE draft_detail] K={len(by_step)} total={total:.2f}ms avg_step={total/len(by_step):.2f}ms | {' '.join(parts)}", flush=True) _draft_events.clear() @torch.inference_mode() def run_fi_tree_decode_cudagraph(model_runner, input_ids, positions, last_only, graph_vars, step, cache_hits, hidden_states=None): - # bs != len(input_ids, positions) now in multi-query seting, also need step-dependent mask context = get_context() - assert context.cu_seqlens_q is None, "ERROR in run_fi_tree_decode_cudagraph: cu_seqlens_q should be set to None so we don't take FA path" - K, F = model_runner.config.speculate_k, model_runner.config.async_fan_out - # MQ_LEN = F * (K+1) MQ_LEN = sum(model_runner.config.fan_out_list) orig_flat = input_ids.size(0) assert orig_flat % MQ_LEN == 0, f"ERROR in run_fi_tree_decode_cudagraph: flat_batch_size should be divisible by MQ_LEN, got {orig_flat} and {MQ_LEN}" orig_B = orig_flat // MQ_LEN - # Pick CUDA graph and wrapper bucket + # Pick CUDA graph bucket wrapper_bs = next( x for x in model_runner.graph_bs_list["fi_tree_decode"] if x >= orig_B) graph = model_runner.graphs["fi_tree_decode"][wrapper_bs] - wrapper = model_runner.prefill_wrappers[wrapper_bs] # Prepare padded inputs/context if needed if wrapper_bs > orig_B: - # print(f'PADDING--') pad_B = wrapper_bs - orig_B pad_flat = pad_B * MQ_LEN - # Pad queries (ids/rope positions) pad_ids = torch.zeros( pad_flat, dtype=input_ids.dtype, device=input_ids.device) pad_pos = torch.zeros( @@ -180,13 +169,11 @@ def run_fi_tree_decode_cudagraph(model_runner, input_ids, positions, last_only, input_ids = torch.cat([input_ids, pad_ids], dim=0) positions = torch.cat([positions, pad_pos], dim=0) - # Pad slot_mapping with -1 to skip KV writes for padded queries slot_map = torch.cat( [context.slot_mapping, torch.full((pad_flat,), -1, dtype=context.slot_mapping.dtype, device=context.slot_mapping.device)] ) - # Pad block_tables/context_lens by repeating the last real row bt = context.block_tables cl = context.context_lens pad_bt = bt[orig_B - 1:orig_B].expand(pad_B, -1).contiguous() @@ -194,205 +181,54 @@ def run_fi_tree_decode_cudagraph(model_runner, input_ids, positions, last_only, bt = torch.cat([bt, pad_bt], dim=0) cl = torch.cat([cl, pad_cl], dim=0) - # Set padded context for this replay set_context(is_prefill=False, slot_mapping=slot_map, - context_lens=cl, block_tables=bt) + context_lens=cl, block_tables=bt, + tree_cu_seqlens_q=graph_vars["tree_cu_seqlens_q"][wrapper_bs], + tree_mask_bias=graph_vars["tree_mask_bias"]) block_tables = bt context_lens = cl - flat_batch_size = input_ids.size(0) # == wrapper_bs * MQ_LEN + flat_batch_size = input_ids.size(0) B = wrapper_bs else: block_tables = context.block_tables context_lens = context.context_lens flat_batch_size = orig_flat B = orig_B - - if PROFILE: - torch.cuda.synchronize() - start_time = torch.cuda.Event(enable_timing=True) - end_time = torch.cuda.Event(enable_timing=True) - start_time.record() + # Set tree decode metadata on context for FA4 + context.tree_cu_seqlens_q = graph_vars["tree_cu_seqlens_q"][wrapper_bs] + context.tree_mask_bias = graph_vars["tree_mask_bias"] # in the case where we pad, we'll need cache_hits.shape[0] to match the padded batch size if cache_hits.shape[0] < B: cache_hits = torch.cat([cache_hits, torch.zeros(B - cache_hits.shape[0], device=cache_hits.device)]) - # PERFORMANCE: Step 0 -- precompute KV page metadata on CPU for all K steps. - # CPU tensors let plan() skip its internal .to("cpu") GPU->CPU syncs. - # For B<=8, CPU slicing also avoids GPU boolean indexing. - if step == 0: - cache["cu_seqlens_q_cpu"] = torch.arange(B + 1, dtype=torch.int32) * MQ_LEN - context_lens_list = context_lens.tolist() - cache["block_tables"] = block_tables - block_size = model_runner.block_size - cache["precomputed_kv"] = [] - cache["plan_cpu_args"] = [] - - if B <= 8: - # PERFORMANCE: CPU-only kv_indices via slicing (no GPU boolean indexing) - for s in range(K): - step_cls = [int(cl) + s * MQ_LEN for cl in context_lens_list] - step_counts = [(cl + block_size - 1) // block_size for cl in step_cls] - if B == 1: - kv_indices_s = block_tables[0, :step_counts[0]] - else: - kv_indices_s = torch.cat([block_tables[b, :step_counts[b]] for b in range(B)]) - cache["precomputed_kv"].append(kv_indices_s) - kv_indptr_cpu = torch.zeros(B + 1, dtype=torch.int32) - kv_indptr_cpu[1:] = torch.tensor(step_counts, dtype=torch.int32).cumsum(0) - kv_lpl_cpu = torch.tensor( - [cl % block_size if cl % block_size != 0 else block_size for cl in step_cls], - dtype=torch.int32) - cache["plan_cpu_args"].append((kv_indptr_cpu, kv_lpl_cpu)) - else: - # Large batch: GPU boolean indexing for kv_indices, CPU tensors for plan args - bt_upcast = torch.arange(block_tables.size(1), device=block_tables.device)[None, :] - step_offsets = torch.arange(K + 2, device=context_lens.device) * MQ_LEN - all_step_cls = context_lens.unsqueeze(1) + step_offsets.unsqueeze(0) - all_counts = (all_step_cls + block_size - 1) // block_size - all_masks = bt_upcast.unsqueeze(1) < all_counts.unsqueeze(2) - for s in range(K): - cache["precomputed_kv"].append(block_tables[all_masks[:, s, :]]) - step_cls = [int(cl) + s * MQ_LEN for cl in context_lens_list] - step_counts = [(cl + block_size - 1) // block_size for cl in step_cls] - kv_indptr_cpu = torch.zeros(B + 1, dtype=torch.int32) - kv_indptr_cpu[1:] = torch.tensor(step_counts, dtype=torch.int32).cumsum(0) - kv_lpl_cpu = torch.tensor( - [cl % block_size if cl % block_size != 0 else block_size for cl in step_cls], - dtype=torch.int32) - cache["plan_cpu_args"].append((kv_indptr_cpu, kv_lpl_cpu)) - - # CPU mask precompute: build all K packed masks using numpy at step 0. - # Eliminates per-step get_custom_mask (GPU) + segment_packbits + GPU->CPU syncs. - cache_hits_list = cache_hits[:B].tolist() - - if "glue_hit_np" not in cache: - _fol = model_runner.config.fan_out_list - _fol_miss = model_runner.config.fan_out_list_miss - _tril = np.tril(np.ones((K + 1, K + 1), dtype=np.uint8)) - cache["glue_hit_np"] = np.repeat(_tril, _fol, axis=0) - cache["glue_miss_np"] = np.repeat(_tril, _fol_miss, axis=0) - - _glue_hit = cache["glue_hit_np"] - _glue_miss = cache["glue_miss_np"] - _rows_np = np.arange(MQ_LEN) - - cache["cpu_packed_masks"] = [] - cache["cpu_packed_indptrs"] = [] - - for s in range(K): - ttl_added_s = (s + 1) * MQ_LEN + (K + 1) - packed_segs = [] - seg_packed_sizes = [] - - for b in range(B): - cols_b = int(context_lens_list[b]) + s * MQ_LEN - prefix_len_b = cols_b - ttl_added_s - - mask_b = np.zeros((MQ_LEN, cols_b), dtype=np.uint8) - mask_b[:, :prefix_len_b] = 1 - glue = _glue_hit if int(cache_hits_list[b]) == 1 else _glue_miss - mask_b[:, prefix_len_b:prefix_len_b + K + 1] = glue - diag_start = prefix_len_b + K + 1 - for blk in range(s + 1): - mask_b[_rows_np, diag_start + blk * MQ_LEN + _rows_np] = 1 - - packed = np.packbits(mask_b.ravel(), bitorder='little') - packed_segs.append(packed) - seg_packed_sizes.append(len(packed)) - - full_packed = np.concatenate(packed_segs) if B > 1 else packed_segs[0] - indptr = np.zeros(B + 1, dtype=np.int32) - indptr[1:] = np.cumsum(seg_packed_sizes) - - cache["cpu_packed_masks"].append( - torch.from_numpy(full_packed.copy()).to(model_runner.device, non_blocking=True)) - cache["cpu_packed_indptrs"].append( - torch.from_numpy(indptr.copy()).to(model_runner.device, non_blocking=True)) - - # Pre-transfer KV metadata to GPU (eliminates per-step pageable H2D transfers) - cache["qo_indptr_gpu"] = cache["cu_seqlens_q_cpu"].to(model_runner.device, non_blocking=True) - cache["kv_indptr_gpu"] = [] - cache["kv_lpl_gpu"] = [] - cache["kv_lens_gpu"] = [] - for s in range(K): - ki, kl = cache["plan_cpu_args"][s] - cache["kv_indptr_gpu"].append(ki.to(model_runner.device, non_blocking=True)) - cache["kv_lpl_gpu"].append(kl.to(model_runner.device, non_blocking=True)) - kv_lens = ((ki[1:] - ki[:-1] - 1) * model_runner.block_size + kl).to(torch.int32) - cache["kv_lens_gpu"].append(kv_lens.to(model_runner.device, non_blocking=True)) - if PROFILE: - end_time.record() torch.cuda.synchronize() - precompute_time = start_time.elapsed_time(end_time) + start_time = torch.cuda.Event(enable_timing=True) + end_time = torch.cuda.Event(enable_timing=True) start_time.record() - # Use precomputed CPU-packed masks (built at step 0) - if PROFILE_DRAFT: - _ev_mask0 = torch.cuda.Event(enable_timing=True); _ev_mask0.record() - - kv_indices = cache["precomputed_kv"][step] - kv_indptr_cpu, kv_lpl_cpu = cache["plan_cpu_args"][step] - qo_indptr_cpu = cache["cu_seqlens_q_cpu"] - - packed_mask = cache["cpu_packed_masks"][step] - packed_indptr = cache["cpu_packed_indptrs"][step] - wrapper._custom_mask_buf[:len(packed_mask)].copy_(packed_mask, non_blocking=True) - wrapper._mask_indptr_buf.copy_(packed_indptr, non_blocking=True) - - # GPU-to-GPU copies from pre-transferred tensors (no pageable H2D) - wrapper._qo_indptr_buf.copy_(cache["qo_indptr_gpu"], non_blocking=True) - wrapper._paged_kv_indptr_buf.copy_(cache["kv_indptr_gpu"][step], non_blocking=True) - wrapper._paged_kv_last_page_len_buf.copy_(cache["kv_lpl_gpu"][step], non_blocking=True) - wrapper._paged_kv_indices_buf[:len(kv_indices)].copy_(kv_indices, non_blocking=True) - - total_num_rows = int(qo_indptr_cpu[-1].item()) - wrapper._kv_lens_buffer[:len(kv_indptr_cpu) - 1].copy_(cache["kv_lens_gpu"][step], non_blocking=True) - - # Event-based sync: only wait for this stream's copies, not all CUDA streams. - global _plan_event - if _plan_event is None: - _plan_event = torch.cuda.Event() - _plan_event.record() - _plan_event.synchronize() - - if PROFILE_DRAFT: - _ev_plan0 = torch.cuda.Event(enable_timing=True); _ev_plan0.record() - - plan_args = [ - wrapper._float_workspace_buffer, wrapper._int_workspace_buffer, - wrapper._pin_memory_int_workspace_buffer, - qo_indptr_cpu, kv_indptr_cpu, cache["kv_lens_gpu"][step], - wrapper._max_total_num_rows or total_num_rows, - B, model_runner.hf_config.num_attention_heads, - model_runner.hf_config.num_key_value_heads, - model_runner.block_size, wrapper.is_cuda_graph_enabled, - model_runner.hf_config.head_dim, model_runner.hf_config.head_dim, - False, -1, - ] - if wrapper._backend == "fa2": - plan_args.extend([-1, False]) - wrapper._plan_info = wrapper._cached_module.plan(*plan_args) - - if PROFILE_DRAFT: - _ev_plan1 = torch.cuda.Event(enable_timing=True); _ev_plan1.record() - - if PROFILE: - end_time.record() - torch.cuda.synchronize() - plan_time = start_time.elapsed_time(end_time) - start_time.record() + # Build tree mask bias for this step and copy into pre-allocated buffer + from ssd.layers.tree_mask import build_tree_mask_bias + K = model_runner.config.speculate_k + mask_bias = build_tree_mask_bias( + context_lens, step=step, K=K, MQ_LEN=MQ_LEN, + fan_out_list=model_runner.config.fan_out_list, + fan_out_list_miss=model_runner.config.fan_out_list_miss, + cache_hits=cache_hits, + max_kv_stride=model_runner.config.max_model_len, + device=model_runner.device, + ) + graph_vars["tree_mask_bias"][:len(mask_bias)] = mask_bias - # Copy inputs/context into graph buffers for padded size + # Copy inputs/context into graph buffers graph_vars["input_ids"][:flat_batch_size] = input_ids graph_vars["positions"][:flat_batch_size] = positions graph_vars["slot_mapping"][:flat_batch_size] = get_context().slot_mapping graph_vars["context_lens"][:B] = context_lens if hidden_states is not None and "hidden_states" in graph_vars: if hidden_states.shape[0] < flat_batch_size: - # Pad hidden_states to match padded batch pad_n = flat_batch_size - hidden_states.shape[0] hidden_states = torch.cat([hidden_states, torch.zeros(pad_n, hidden_states.shape[1], dtype=hidden_states.dtype, device=hidden_states.device)]) graph_vars["hidden_states"][:flat_batch_size] = hidden_states @@ -412,8 +248,6 @@ def run_fi_tree_decode_cudagraph(model_runner, input_ids, positions, last_only, if PROFILE_DRAFT: _ev_replay1 = torch.cuda.Event(enable_timing=True); _ev_replay1.record() - _draft_events.append((step, "mask+buf", _ev_mask0, _ev_plan0)) - _draft_events.append((step, "plan", _ev_plan0, _ev_plan1)) _draft_events.append((step, "replay", _ev_replay0, _ev_replay1)) if PROFILE: @@ -421,14 +255,12 @@ def run_fi_tree_decode_cudagraph(model_runner, input_ids, positions, last_only, torch.cuda.synchronize() replay_time = start_time.elapsed_time(end_time) - # Extract logits from graph_vars instead of computing them separately logits_all = graph_vars["logits"][:flat_batch_size] if PROFILE: - print(f"[run_fi_tree_decode_cudagraph] step {step}: precompute={precompute_time:.3f}ms, plan={plan_time:.3f}ms, buffer={buffer_prep_time:.3f}ms, replay={replay_time:.3f}ms", flush=True) + print(f"[cuda_graph_helpers.run_fi_tree_decode_cudagraph] step {step}: buffer={buffer_prep_time:.3f}ms, replay={replay_time:.3f}ms", flush=True) logits_out = logits_all[:orig_flat] - # EAGLE draft: also return prenorm (outputs) for self-conditioning if "hidden_states" in graph_vars: prenorm = graph_vars["outputs"][:orig_flat] return logits_out, prenorm @@ -482,23 +314,29 @@ def capture_cudagraph(model_runner): is_jit = (model_runner.config.speculate and model_runner.config.draft_async and model_runner.is_draft) # Eagle models need special handling during CUDA graph capture - is_eagle_draft = config.use_eagle and model_runner.is_draft - is_eagle_target = config.use_eagle and not model_runner.is_draft + is_eagle_or_phoenix_draft = config.use_eagle_or_phoenix and model_runner.is_draft + is_eagle_or_phoenix_target = config.use_eagle_or_phoenix and not model_runner.is_draft hidden_states = None - if is_eagle_draft: - # Use hidden_size (d_model_draft) so CG captures the pass-through branch in Eagle3DraftForCausalLM.forward() - # All callers project target acts via fc() BEFORE passing to CG - hidden_states = torch.zeros(max_bs, hf_config.hidden_size, - dtype=hf_config.torch_dtype, device=input_ids.device) + if is_eagle_or_phoenix_draft: + # Note: For Eagle3, all callers project target acts via fc() BEFORE passing to CG + hidden_states = torch.zeros( + max_bs, + model_runner.hidden_states_dim, + dtype=hf_config.torch_dtype, + device=input_ids.device, + ) - for bs in reversed(graph_bs_list): + total_graphs = len(graph_bs_list) + print(f'[capture_cudagraph] Starting capture of {total_graphs} graphs, bs list: {graph_bs_list[:5]}...{graph_bs_list[-3:]} max_bs={max_bs}', flush=True) + for idx, bs in enumerate(reversed(graph_bs_list)): + print(f'[capture_cudagraph] Capturing graph {idx+1}/{total_graphs}, bs={bs}', flush=True) graph = torch.cuda.CUDAGraph() set_context( False, slot_mapping=slot_mapping[:bs], context_lens=context_lens[:bs], block_tables=block_tables[:bs], is_jit=is_jit) - if is_eagle_draft: + if is_eagle_or_phoenix_draft: outputs[:bs] = model_runner.model( input_ids[:bs], positions[:bs], hidden_states[:bs]) # warmup - elif is_eagle_target: + elif is_eagle_or_phoenix_target: out, _ = model_runner.model( input_ids[:bs], positions[:bs]) # warmup outputs[:bs] = out @@ -506,10 +344,10 @@ def capture_cudagraph(model_runner): outputs[:bs] = model_runner.model( input_ids[:bs], positions[:bs]) # warmup with torch.cuda.graph(graph, graph_pool): - if is_eagle_draft: + if is_eagle_or_phoenix_draft: outputs[:bs] = model_runner.model( input_ids[:bs], positions[:bs], hidden_states[:bs]) # capture - elif is_eagle_target: + elif is_eagle_or_phoenix_target: out, _ = model_runner.model( input_ids[:bs], positions[:bs]) # capture outputs[:bs] = out @@ -544,7 +382,7 @@ def capture_verify_cudagraph(model_runner): max_bs = min(model_runner.config.max_num_seqs, 512) k_plus_1 = model_runner.config.speculate_k + 1 - is_eagle_target = config.use_eagle and not model_runner.is_draft + is_eagle_or_phoenix_target = config.use_eagle_or_phoenix and not model_runner.is_draft # For verify, we need to handle k+1 tokens per sequence, and use cu_seqlens_q and max_seqlen_q input_ids = torch.zeros(max_bs * k_plus_1, dtype=torch.int64) @@ -556,12 +394,14 @@ def capture_verify_cudagraph(model_runner): outputs = torch.zeros(max_bs * k_plus_1, hf_config.hidden_size) cu_seqlens_q = torch.zeros(max_bs + 1, dtype=torch.int32) - # Eagle target: also capture eagle_acts from model forward + # Eagle/Phoenix target: also capture activations from model forward eagle_acts = None - if is_eagle_target: - # eagle_acts has shape [num_tokens, 3 * hidden_size] for 3 layers - eagle_acts = torch.zeros(max_bs * k_plus_1, 3 * hf_config.hidden_size, - dtype=hf_config.torch_dtype) + if is_eagle_or_phoenix_target: + eagle_acts = torch.zeros( + max_bs * k_plus_1, + model_runner.eagle_acts_dim, + dtype=hf_config.torch_dtype, + ) base = [1, 2, 4, 8] dynamic = list(range(16, max_bs+1, 16)) @@ -682,6 +522,7 @@ def run_glue_decode_cudagraph(model_runner, input_ids, positions, last_only, gra outputs = graph_vars["outputs"][:orig_flat] logits = model_runner.model.compute_logits(outputs, last_only) + assert logits.dim() == 2, "ERROR in run_glue_decode_cudagraph: logits must be 2D" if "eagle_hidden_states" in graph_vars: return logits, outputs return logits @@ -706,9 +547,14 @@ def capture_glue_decode_cudagraph(model_runner): outputs = torch.empty(max_flat, hf_config.hidden_size, device=model_runner.device) cu_seqlens_q = torch.zeros(max_bs + 1, dtype=torch.int32, device=model_runner.device) - eagle_hs = None - if config.use_eagle and model_runner.is_draft: - eagle_hs = torch.zeros(max_flat, hf_config.hidden_size, dtype=hf_config.torch_dtype, device=model_runner.device) + eagle_hidden_states = None + if config.use_eagle_or_phoenix and model_runner.is_draft: + eagle_hidden_states = torch.zeros( + max_flat, + model_runner.hidden_states_dim, + dtype=hf_config.torch_dtype, + device=model_runner.device, + ) graph_bs_list = [1] for bs in [2, 4, 8] + list(range(16, max_bs + 1, 16)): @@ -721,7 +567,7 @@ def capture_glue_decode_cudagraph(model_runner): graphs = {} graph_pool = None - print(f'[capture_glue_decode_cudagraph] Capturing for bs={graph_bs_list}', flush=True) + print(f'[cuda_graph_helpers.capture_glue_decode_cudagraph] Capturing for bs={graph_bs_list}', flush=True) for bs in reversed(graph_bs_list): graph = torch.cuda.CUDAGraph() @@ -742,14 +588,14 @@ def capture_glue_decode_cudagraph(model_runner): block_tables=block_tables[:bs], ) - if eagle_hs is not None: - outputs[:flat] = model_runner.model(input_ids[:flat], positions[:flat], eagle_hs[:flat]) + if eagle_hidden_states is not None: + outputs[:flat] = model_runner.model(input_ids[:flat], positions[:flat], eagle_hidden_states[:flat]) else: outputs[:flat] = model_runner.model(input_ids[:flat], positions[:flat]) with torch.cuda.graph(graph, graph_pool): - if eagle_hs is not None: - outputs[:flat] = model_runner.model(input_ids[:flat], positions[:flat], eagle_hs[:flat]) + if eagle_hidden_states is not None: + outputs[:flat] = model_runner.model(input_ids[:flat], positions[:flat], eagle_hidden_states[:flat]) else: outputs[:flat] = model_runner.model(input_ids[:flat], positions[:flat]) @@ -768,8 +614,8 @@ def capture_glue_decode_cudagraph(model_runner): cu_seqlens_q=cu_seqlens_q, outputs=outputs, ) - if eagle_hs is not None: - graph_vars["eagle_hidden_states"] = eagle_hs + if eagle_hidden_states is not None: + graph_vars["eagle_hidden_states"] = eagle_hidden_states return graph_vars, graph_pool, graphs, graph_bs_list @@ -779,8 +625,6 @@ def capture_fi_tree_decode_cudagraph(model_runner): config = model_runner.config hf_config = config.hf_config max_bs = min(model_runner.config.max_num_seqs, 512) - K, F = model_runner.config.speculate_k, model_runner.config.async_fan_out - # MQ_LEN = F * (K+1) MQ_LEN = sum(model_runner.config.fan_out_list) max_flat_batch_size = max_bs * MQ_LEN @@ -789,12 +633,11 @@ def capture_fi_tree_decode_cudagraph(model_runner): input_ids = torch.zeros(max_flat_batch_size, dtype=torch.int64, device=model_runner.device) positions = torch.zeros(max_flat_batch_size, dtype=torch.int64, device=model_runner.device) slot_mapping = torch.zeros(max_flat_batch_size, dtype=torch.int32, device=model_runner.device) - context_lens = torch.full((max_bs,), config.max_model_len, dtype=torch.int32, device=model_runner.device) # make sure these are consistent with our dummy example + context_lens = torch.full((max_bs,), config.max_model_len, dtype=torch.int32, device=model_runner.device) block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32, device=model_runner.device) outputs = torch.empty(max_flat_batch_size, hf_config.hidden_size, device=model_runner.device) logits = torch.empty(max_flat_batch_size, hf_config.vocab_size, device=model_runner.device) - # Create graph_bs_list to match what will be used in cudagraph_helpers.py graph_bs_list = [1] for bs in [2, 4, 8] + list(range(16, max_bs + 1, 16)): if bs <= max_bs: @@ -806,60 +649,39 @@ def capture_fi_tree_decode_cudagraph(model_runner): graphs = {} graph_pool = None - # Eagle draft needs hidden_states for forward (d_model_draft, NOT 3*d_model_target) - # All callers project target acts via fc() BEFORE passing to CG - # MUST be outside the for-loop so all graphs share the same tensor fi_hidden_states = None - if config.use_eagle and model_runner.is_draft: - fi_hidden_states = torch.zeros(max_flat_batch_size, hf_config.hidden_size, - dtype=hf_config.torch_dtype, device=model_runner.device) + if config.use_eagle_or_phoenix and model_runner.is_draft: + fi_hidden_states = torch.zeros( + max_flat_batch_size, + model_runner.hidden_states_dim, + dtype=hf_config.torch_dtype, + device=model_runner.device, + ) - print(f'About to capture FI cudagraphs for bs={graph_bs_list}', flush=True) + # Pre-allocate tree_cu_seqlens_q per batch size bucket (constant values, used by FA4) + tree_cu_seqlens_q_dict = {} + for bs in graph_bs_list: + tree_cu_seqlens_q_dict[bs] = torch.arange( + bs + 1, dtype=torch.int32, device=model_runner.device) * MQ_LEN - for bs in reversed(graph_bs_list): - graph = torch.cuda.CUDAGraph() + # Pre-allocate tree mask bias at max size (shared across all batch sizes, updated before replay) + tree_mask_bias = torch.zeros( + max_flat_batch_size * config.max_model_len, + dtype=torch.float32, device=model_runner.device) - # Build a self-consistent fake plan for capture: - # - q_len = MQ_LEN for each request - # - k_len = max_model_len for each request (use maximum context length) + print(f'[cuda_graph_helpers.capture_fi_tree_decode_cudagraph] About to capture FA4 tree decode cudagraphs for bs={graph_bs_list}', flush=True) - cu_seqlens_q = torch.arange( - bs + 1, dtype=torch.int32, device=model_runner.device) * MQ_LEN - # Use max_num_blocks pages per request for maximum context length - kv_indptr = torch.arange( - bs + 1, dtype=torch.int32, device=model_runner.device) * max_num_blocks - kv_indices = torch.zeros(int( - kv_indptr[-1].item()), dtype=torch.int32, device=model_runner.device) # page ids (dummy) - # Last page length for max model len context - last_page_len = config.max_model_len % model_runner.block_size - if last_page_len == 0: - last_page_len = model_runner.block_size - kv_last_page_len = torch.full( - (bs,), last_page_len, dtype=torch.int32, device=model_runner.device) - custom_mask = torch.ones(bs * MQ_LEN * config.max_model_len, - dtype=torch.bool, device=model_runner.device) - - # Set the fi_tensors buffers with our fake data - model_runner.prefill_wrappers[bs].plan( - cu_seqlens_q, - kv_indptr, - kv_indices, - kv_last_page_len, - hf_config.num_attention_heads, - hf_config.num_key_value_heads, - hf_config.head_dim, - model_runner.block_size, - custom_mask=custom_mask, - q_data_type=torch.bfloat16, - kv_data_type=torch.bfloat16, - ) + for bs in reversed(graph_bs_list): + graph = torch.cuda.CUDAGraph() - # Set minimal context needed for run + # Set context with FA4 metadata set_context( is_prefill=False, slot_mapping=slot_mapping[:bs * MQ_LEN], context_lens=context_lens[:bs], - block_tables=block_tables[:bs] + block_tables=block_tables[:bs], + tree_cu_seqlens_q=tree_cu_seqlens_q_dict[bs], + tree_mask_bias=tree_mask_bias, ) # Warmup run @@ -895,6 +717,8 @@ def capture_fi_tree_decode_cudagraph(model_runner): context_lens=context_lens, outputs=outputs, logits=logits, + tree_cu_seqlens_q=tree_cu_seqlens_q_dict, + tree_mask_bias=tree_mask_bias, ) if fi_hidden_states is not None: graph_vars["hidden_states"] = fi_hidden_states diff --git a/ssd/engine/helpers/runner_helpers.py b/ssd/engine/helpers/runner_helpers.py index 8ad0804cc..2e0455e60 100644 --- a/ssd/engine/helpers/runner_helpers.py +++ b/ssd/engine/helpers/runner_helpers.py @@ -1,51 +1,609 @@ +from datetime import datetime +from dataclasses import dataclass +import os +import enum import torch import torch.distributed as dist +from transformers import AutoTokenizer from ssd.engine.sequence import Sequence +from ssd.utils.misc import compress_neg_ones_and_zeros -def prepare_prefill_payload( - input_id_list: list[list[int]], - eagle_acts: torch.Tensor, - device: torch.device, - max_blocks: int, - draft_block_tables: list[list[int]] | torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - input_ids_flat = [] - num_tokens = [] - for input_ids in input_id_list: - input_ids_flat.extend(input_ids) - num_tokens.append(len(input_ids)) - input_ids_flat = torch.tensor(input_ids_flat, dtype=torch.int64, device=device) - num_tokens = torch.tensor(num_tokens, dtype=torch.int64, device=device) - if isinstance(draft_block_tables, list): - draft_block_table = torch.tensor( - [dbt + [-1] * (max_blocks - len(dbt)) for dbt in draft_block_tables], - dtype=torch.int32, device=device, - ) +NCCL_LOG = os.environ.get("SSD_NCCL_LOG", "0") == "1" +BRIEF_LOG = os.environ.get("SSD_BRIEF_LOG", "0") == "1" +RUN_NAME = os.environ.get("SSD_RUN_NAME", "") + +def _ts(): + return datetime.now().strftime('%H:%M:%S.%f')[:-3] + +def _dump_ts(): + if RUN_NAME: + return RUN_NAME + else: + return datetime.now().strftime('%Y-%m-%d_%H-%M-%S.%f') # [:-4] + +def list_to_str(lst: list[float] | list[list[float]], num_decimals: int = 4) -> str: + assert len(lst) > 0 + if isinstance(lst[0], float): + return str([round(v, 4) for v in lst]) else: - assert draft_block_tables.shape == (len(input_id_list), max_blocks), ( - f"draft_block_tables shape mismatch: expected ({len(input_id_list), max_blocks}), got {draft_block_tables.shape}" + assert isinstance(lst[0], list) + return str([[round(v, 4) for v in row] for row in lst]) + + +@enum.unique +class COMMAND(enum.IntEnum): + PREFILL = 0 + SPECULATION = 1 + DRAFT_EXIT = 2 + + +@dataclass +class PrefillRequest: + cmd: torch.Tensor | None + metadata: torch.Tensor + input_ids: torch.Tensor + num_tokens: torch.Tensor + draft_block_table: torch.Tensor + eagle_acts: torch.Tensor + + @classmethod + def prepare( + cls, + input_ids: torch.Tensor, # flat tensor of input ids + num_tokens: torch.Tensor, # tensor of num tokens per sequence + draft_block_table: torch.Tensor, + eagle_acts: torch.Tensor, + max_blocks: int, + device: torch.device, + cmd_buffer: torch.Tensor = None, + metadata_buffer: torch.Tensor = None, + tokenizer: AutoTokenizer = None, + ): + if eagle_acts is not None: + assert eagle_acts.shape[0] == input_ids.shape[0], ( + f"Eagle activations length {eagle_acts.shape[0]} != input_ids_flat length {input_ids.shape[0]}" + ) + + metadata = [ + input_ids.shape[0], + num_tokens.shape[0], + max_blocks, + 1 if eagle_acts is not None else 0, + eagle_acts.shape[1] if eagle_acts is not None else 0, + ] + if metadata_buffer is None: + metadata_buffer = torch.tensor(metadata, dtype=torch.int64, device=device) + else: + metadata_buffer[:] = metadata + + if cmd_buffer is None: + cmd_buffer = torch.tensor([COMMAND.PREFILL], dtype=torch.int64, device=device) + else: + cmd_buffer[0] = COMMAND.PREFILL + + prefill_request = cls( + cmd=cmd_buffer, + metadata=metadata_buffer, + input_ids=input_ids, + num_tokens=num_tokens, + draft_block_table=draft_block_table, + eagle_acts=eagle_acts, + ) + prefill_request.tokenizer = tokenizer + return prefill_request + + def send(self, async_pg: dist.ProcessGroup, draft_rank: int): + if NCCL_LOG: + sep = '=' * 80 + print(f"[{_ts()}] \n{sep}", flush=True) + print(f"[{_ts()}] [NCCL_LOG SEND_PREFILL] cmd={self.cmd.tolist()}", flush=True) + print(f"[{_ts()}] [NCCL_LOG SEND_PREFILL] metadata={self.metadata.tolist()}", flush=True) + print(f"[{_ts()}] [NCCL_LOG SEND_PREFILL] input_ids shape={self.input_ids.shape}, values={self.input_ids.tolist()}", flush=True) + print(f"[{_ts()}] [NCCL_LOG SEND_PREFILL] input_ids decoded='{_decode_ids(self.input_ids, self.tokenizer)}'", flush=True) + print(f"[{_ts()}] [NCCL_LOG SEND_PREFILL] num_tokens={self.num_tokens.tolist()}", flush=True) + draft_block_table_values_str = compress_neg_ones_and_zeros(f"{self.draft_block_table.tolist()}") + print(f"[{_ts()}] [NCCL_LOG SEND_PREFILL] draft_block_table shape={self.draft_block_table.shape}, values={draft_block_table_values_str}", flush=True) + print(f"[{_ts()}] [NCCL_LOG SEND_PREFILL] eagle_acts={'None' if self.eagle_acts is None else f'shape={self.eagle_acts.shape}'}", flush=True) + print(f"[{_ts()}] {sep}\n", flush=True) + send_tensor(self.cmd, async_pg, draft_rank, name="cmd", prefix="TARGET:PrefillRequest.send") + send_tensor(self.metadata, async_pg, draft_rank, name="metadata", prefix="TARGET:PrefillRequest.send") + fused_payload = concat_tensors_as_int64(self.input_ids, self.num_tokens, self.draft_block_table) + send_tensor(fused_payload, async_pg, draft_rank, name="fused payload", prefix="TARGET:PrefillRequest.send") + if self.eagle_acts is not None: + send_tensor(self.eagle_acts, async_pg, draft_rank, name="eagle acts", prefix="TARGET:PrefillRequest.send") + + @classmethod + def receive( + cls, + async_pg: dist.ProcessGroup, + target_rank: int, + device: torch.device, + metadata_buffer: torch.Tensor=None, + eagle_act_dtype: torch.dtype=torch.bfloat16, + tokenizer: AutoTokenizer = None, + ): + # 1) Receive metadata then individual tensors + # First receive prefill metadata to learn sizes + if metadata_buffer is None: + metadata_buffer = torch.empty(5, dtype=torch.int64, device=device) + + metadata = receive_tensor(metadata_buffer, async_pg, target_rank, name="metadata", prefix="DRAFT:PrefillRequest.receive") + total_new_tokens, batch_size, max_blocks, use_eagle, eagle_act_dim = metadata.tolist() + + # 2) receive fused int64 payload (input_ids + num_tokens + draft_block_table) + fused_total = total_new_tokens + batch_size + batch_size * max_blocks + fused = torch.empty(fused_total, dtype=torch.int64, device=device) + fused = receive_tensor(fused, async_pg, target_rank, name="fused payload", prefix="DRAFT:PrefillRequest.receive") + off = 0 + input_ids = fused[off:off + total_new_tokens] + off += total_new_tokens + num_tokens = fused[off:off + batch_size] + off += batch_size + draft_block_table = fused[off:off + batch_size * max_blocks].view(batch_size, max_blocks).to(torch.int32) + off += batch_size * max_blocks + assert off == fused_total + + eagle_acts = None + if use_eagle: + eagle_acts = torch.empty( + total_new_tokens, eagle_act_dim, dtype=eagle_act_dtype, device=device, + ) + eagle_acts = receive_tensor(eagle_acts, async_pg, target_rank, name="eagle acts", prefix="DRAFT:PrefillRequest.receive") + + if BRIEF_LOG: + print(f"[{_ts()}] [PrefillRequest.receive] metadata={metadata.tolist()}", flush=True) + print(f"[{_ts()}] [PrefillRequest.receive] num_tokens={num_tokens.tolist()}", flush=True) + decoded_input_ids = _decode_ids(input_ids, tokenizer) + print(f"[{_ts()}] [PrefillRequest.receive] input_ids shape={input_ids.shape}, values={input_ids.tolist()}, decoded='{decoded_input_ids}'", flush=True) + if eagle_acts is not None: + print(f"[{_ts()}] [PrefillRequest.receive] eagle_acts shape={eagle_acts.shape}, eagle_acts[:3, :3]={list_to_str(eagle_acts[:3, :3].tolist())}", flush=True) + + received_request = cls( + cmd=None, + metadata=metadata, + input_ids=input_ids, + num_tokens=num_tokens, + draft_block_table=draft_block_table, + eagle_acts=eagle_acts, + ) + received_request.dump() + return received_request + + def dump(self): + dump_dir = os.environ.get("SSD_DUMP_TENSORS_DIR", "") + if dump_dir: + torch.save({ + 'metadata': self.metadata.cpu(), + 'input_ids': self.input_ids.cpu(), + 'num_tokens': self.num_tokens.cpu(), + 'draft_block_table': self.draft_block_table.cpu(), + 'eagle_acts': self.eagle_acts.cpu() if self.eagle_acts is not None else None, + }, f"{dump_dir}/prefill_request_{_dump_ts()}.pt") + + +@dataclass +class SpeculationRequest: + cmd: torch.Tensor | None + metadata: torch.Tensor + cache_keys: torch.Tensor + num_tokens: torch.Tensor + block_tables: torch.Tensor + temps: torch.Tensor # .view(torch.int32).to(torch.int64) + recovery_activations: torch.Tensor | None + extend_activations: torch.Tensor | None + extend_counts: torch.Tensor | None + extend_token_ids: torch.Tensor | None + + @classmethod + def prepare( + cls, + batch_size: int, + lookahead: int, + max_blocks: int, + vocab_size: int, + draft_dtype: torch.dtype, + device: torch.device, + eagle: bool = False, + eagle_act_dim: int = 0, + tokenizer: AutoTokenizer = None, + ): + speculation_request = cls(*([None] * 10)) + speculation_request.batch_size = batch_size + speculation_request.lookahead = lookahead + speculation_request.max_blocks = max_blocks + speculation_request.vocab_size = vocab_size + speculation_request.draft_dtype = draft_dtype + speculation_request.eagle = eagle + speculation_request.eagle_act_dim = eagle_act_dim + speculation_request.device = device + speculation_request.tokenizer = tokenizer + speculation_request._alloc_buffers() + return speculation_request + + def _alloc_buffers(self): + B, K = self.batch_size, self.lookahead + self.cmd = torch.tensor([COMMAND.SPECULATION], dtype=torch.int64, device=self.device) + self.metadata = torch.tensor([B, K, self.max_blocks, self.eagle_act_dim, self.vocab_size], dtype=torch.int64, device=self.device) + self.cache_keys = torch.empty(B, 3, dtype=torch.int64, device=self.device) + self.num_tokens = torch.empty(B, dtype=torch.int64, device=self.device) + self.temps = torch.zeros(B, dtype=torch.float32, device=self.device) + if self.max_blocks > 0: + self.block_tables = torch.full((B, self.max_blocks), -1, dtype=torch.int32, device=self.device) + else: + self.block_tables = None + if self.eagle: + self.recovery_activations = torch.empty(B, self.eagle_act_dim, dtype=self.draft_dtype, device=self.device) + self.extend_activations = torch.empty(B, K, self.eagle_act_dim, dtype=self.draft_dtype, device=self.device) + self.extend_counts = torch.zeros(B, dtype=torch.int64, device=self.device) + self.extend_token_ids = torch.empty(B, K, dtype=torch.int64, device=self.device) + else: + self.recovery_activations = None + self.extend_activations = None + self.extend_counts = None + self.extend_token_ids = None + + def maybe_update_buffers(self, batch_size: int, max_blocks: int = -1): + if batch_size != self.batch_size: + self.batch_size = batch_size + if max_blocks > 0: + self.max_blocks = max_blocks + self._alloc_buffers() + + def send(self, async_pg: dist.ProcessGroup, draft_rank: int): + send_tensor(self.cmd, async_pg, draft_rank, name="cmd", prefix="TARGET:SpeculationRequest.send") + send_tensor(self.metadata, async_pg, draft_rank, name="metadata", prefix="TARGET:SpeculationRequest.send") + # Fuse all payload fields (including EAGLE) into a single NCCL send + int64_parts = [ + self.cache_keys.reshape(-1), + self.num_tokens.reshape(-1), + self.block_tables.to(torch.int64).reshape(-1), + self.temps.view(torch.int32).to(torch.int64).reshape(-1), + ] + if self.eagle: + int64_parts.extend([ + self.recovery_activations.contiguous().reshape(-1).view(torch.int64), + self.extend_counts.reshape(-1), + self.extend_activations.contiguous().reshape(-1).view(torch.int64), + self.extend_token_ids.reshape(-1), + ]) + fused_payload = torch.cat(int64_parts) + send_tensor(fused_payload, async_pg, draft_rank, name="fused payload", prefix="TARGET:SpeculationRequest.send") + + @classmethod + def receive( + cls, + async_pg: dist.ProcessGroup, + target_rank: int, + device: torch.device, + draft_dtype: torch.dtype, + tokenizer: AutoTokenizer = None, + verbose: bool = False, + ): + meta = torch.empty(5, dtype=torch.int64, device=device) + meta = receive_tensor(meta, async_pg, target_rank, name="metadata", prefix="DRAFT:SpeculationRequest.receive") + B, K, max_blocks, eagle_act_dim, vocab_size = meta.tolist() + if NCCL_LOG: + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_SPEC] SPECULATION REQUEST META RECEIVED, B={B}, K={K}, max_blocks={max_blocks}", flush=True) + + eagle = eagle_act_dim > 0 + speculation_request = cls.prepare( + batch_size=B, + lookahead=K, + max_blocks=max_blocks, + vocab_size=vocab_size, + draft_dtype=draft_dtype, + device=device, + eagle=eagle, + eagle_act_dim=eagle_act_dim, + tokenizer=tokenizer, + ) + + # Receive all payload (including EAGLE tensors) in one fused int64 burst + _dsz = torch.finfo(draft_dtype).bits // 8 if eagle else 0 # draft dtype element size + fused_total = (3 * B) + B + (B * max_blocks) + B # cache_keys + num_tokens + block_tables + temps + if eagle: + fused_total += B * eagle_act_dim * _dsz // 8 # recovery_activations as int64 + fused_total += B # extend_counts + fused_total += B * K * eagle_act_dim * _dsz // 8 # extend_activations as int64 + fused_total += B * K # extend_token_ids + fused_req = torch.empty(fused_total, dtype=torch.int64, device=device) + fused_req = receive_tensor(fused_req, async_pg, target_rank, name="fused payload", prefix="DRAFT:SpeculationRequest.receive") + off = 0 + speculation_request.cache_keys = fused_req[off:off + (3 * B)].view(B, 3) + off += 3 * B + speculation_request.num_tokens = fused_req[off:off + B].to(torch.int64) + off += B + speculation_request.block_tables = fused_req[off:off + B * max_blocks].view(B, max_blocks).to(torch.int32) + off += B * max_blocks + temps_as_int64 = fused_req[off:off + B] + off += B + speculation_request.temps = temps_as_int64.to(torch.int32).view(torch.float32) + if eagle: + n_rec = B * eagle_act_dim * _dsz // 8 + speculation_request.recovery_activations = fused_req[off:off + n_rec].view(draft_dtype).view(B, eagle_act_dim) + off += n_rec + speculation_request.extend_counts = fused_req[off:off + B] + off += B + n_ext = B * K * eagle_act_dim * _dsz // 8 + speculation_request.extend_activations = fused_req[off:off + n_ext].view(draft_dtype).view(B, K, eagle_act_dim) + off += n_ext + speculation_request.extend_token_ids = fused_req[off:off + B * K].view(B, K) + off += B * K + assert off == fused_total + + cache_keys, draft_block_tables, temperatures, num_tokens = ( + speculation_request.cache_keys, speculation_request.block_tables, speculation_request.temps, speculation_request.num_tokens + ) + if NCCL_LOG: + sep = '=' * 80 + print(f"[{_ts()}] \n{sep}", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_SPEC] meta=[B={B}, K={K}]", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_SPEC] cache_keys shape={cache_keys.shape}", flush=True) + for i in range(B): + seq_id, accept_len, verified_id = cache_keys[i].tolist() + if tokenizer is not None: + verified_text = f" (f'{tokenizer.decode([int(verified_id)])}')" + else: + verified_text = "" + print(f"[{_ts()}] req[{i}]: seq_id={seq_id}, accept_len={accept_len}, verified_id={int(verified_id)}{verified_text}", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_SPEC] num_tokens={num_tokens.tolist()}", flush=True) + draft_block_table_values_str = compress_neg_ones_and_zeros(f"{draft_block_tables.tolist()}") + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_SPEC] draft_block_tables shape={draft_block_tables.shape}, values={draft_block_table_values_str}", flush=True) + print(f"[{_ts()}] [NCCL_LOG DRAFT_RECV_SPEC] temperatures={temperatures.tolist()}", flush=True) + print(f"[{_ts()}] {sep}\n", flush=True) + + if eagle and verbose: + target_recovery_activations = speculation_request.recovery_activations + extend_counts = speculation_request.extend_counts + extend_eagle_acts = speculation_request.extend_activations + extend_token_ids = speculation_request.extend_token_ids + print(f"[{_ts()}] [CACHE REQUEST] target_recovery_activations.shape={target_recovery_activations.shape}", flush=True) + print(f"[{_ts()}] [CACHE REQUEST] extend_counts.shape={extend_counts.shape}, {extend_counts.tolist()}", flush=True) + print(f"[{_ts()}] [CACHE REQUEST] extend_eagle_acts.shape={extend_eagle_acts.shape}", flush=True) + print(f"[{_ts()}] [CACHE REQUEST] extend_token_ids.shape={extend_token_ids.shape}, {extend_token_ids.tolist()}", flush=True) + recovery_tokens_target = cache_keys[:, 2].clone() + print(f"[{_ts()}] \n{'='*80}", flush=True) + print(f"[{_ts()}] [CACHE REQUEST] Batch size: {B}, Spec depth: {K}", flush=True) + for i in range(B): + seq_id = cache_keys[i, 0].item() + keep_idx = cache_keys[i, 1].item() + rec_token_target = recovery_tokens_target[i].item() + if tokenizer is not None: + rec_token_text = f" (f'{tokenizer.decode([rec_token_target])}')" + else: + rec_token_text = "" + n_ext = extend_counts[i].item() + print(f"[{_ts()}] Seq {seq_id}: keep_idx={keep_idx}, recovery_token={rec_token_target}{rec_token_text}, n_ext={n_ext}", flush=True) + print(f"[{_ts()}] {'='*80}\n", flush=True) + + if BRIEF_LOG: + cache_keys = speculation_request.cache_keys + num_tokens = speculation_request.num_tokens + # block_tables = speculation_request.block_tables + # temps = speculation_request.temps + recovery_activations = speculation_request.recovery_activations + extend_activations = speculation_request.extend_activations + extend_counts = speculation_request.extend_counts + extend_token_ids = speculation_request.extend_token_ids + print(f"[{_ts()}] [SpeculationRequest.receive] {B=}, {K=}, {max_blocks=}, {eagle_act_dim=}", flush=True) + for i in range(B): + seq_id, accept_len, verified_id = cache_keys[i].tolist() + verified_text = _decode_ids(verified_id, tokenizer) + # print(f"[{_ts()}] req[{i}]: seq_id={seq_id}, accept_len={accept_len}, verified_id={int(verified_id)} ({verified_text})", flush=True) + print(f"[{_ts()}] req[{i}]: ACCEPT_LENGTH={accept_len}, VERIFIED_TEXT={verified_text}", flush=True) + if eagle: + print(f"[{_ts()}] req[{i}]: recovery_activations shape={recovery_activations.shape}, values[i, :3]={list_to_str(recovery_activations[i, :3].tolist())}", flush=True) + print(f"[{_ts()}] req[{i}]: extend_activations shape={extend_activations.shape}, values[i, :, :3]={list_to_str(extend_activations[i, :, :3].tolist())}", flush=True) + num_extend = extend_counts[i].item() + print(f"[{_ts()}] req[{i}]: extend_counts shape={extend_counts.shape}, values[i]={num_extend}", flush=True) + decoded_extend_token_ids = _decode_ids(extend_token_ids[i, :num_extend], tokenizer) + print(f"[{_ts()}] req[{i}]: extend_token_ids shape={extend_token_ids.shape}, values={extend_token_ids[i].tolist()}, decoded[:, :{num_extend}]='{decoded_extend_token_ids}'", flush=True) + + speculation_request.dump() + return speculation_request + + def dump(self): + dump_dir = os.environ.get("SSD_DUMP_TENSORS_DIR", "") + if dump_dir: + torch.save({ + 'metadata': self.metadata.cpu(), + 'cache_keys': self.cache_keys.cpu(), + 'num_tokens': self.num_tokens.cpu(), + 'block_tables': self.block_tables.cpu() if self.block_tables is not None else None, + 'temps': self.temps.cpu(), + 'recovery_activations': self.recovery_activations.cpu() if self.recovery_activations is not None else None, + 'extend_activations': self.extend_activations.cpu() if self.extend_activations is not None else None, + 'extend_counts': self.extend_counts.cpu() if self.extend_counts is not None else None, + 'extend_token_ids': self.extend_token_ids.cpu() if self.extend_token_ids is not None else None, + }, f"{dump_dir}/speculation_request_{_dump_ts()}.pt") + + +@dataclass +class SpeculationResponse: + speculations: torch.Tensor + logits_q: torch.Tensor | None + cache_hits: torch.Tensor | None + + @classmethod + def prepare( + cls, + lookahead: int, + device: torch.device, + draft_dtype: torch.dtype = torch.bfloat16, + batch_size: int = 1, + vocab_size: int = -1, + communicate_logits: bool = False, + communicate_cache_hits: bool = False, + tokenizer: AutoTokenizer = None, + ): + response = cls( + speculations=None, + logits_q=None, + cache_hits=None, ) - draft_block_table = draft_block_tables - - # 3) send cmd=1 - cmd = torch.tensor([1], dtype=torch.int64, device=device) - - # 4) send metadata for tensor reconstruction - metadata = torch.tensor([ - input_ids_flat.size(0), - len(input_id_list), # batch_size - max_blocks, - 1 if eagle_acts is not None else 0, - eagle_acts.shape[1] if eagle_acts is not None else 0, - ], dtype=torch.int64, device=device) - - if eagle_acts is not None: - assert eagle_acts.shape[0] == input_ids_flat.shape[0], ( - f"Eagle activations length {eagle_acts.shape[0]} != input_ids_flat length {input_ids_flat.shape[0]}" + response.batch_size = batch_size + response.lookahead = lookahead + response.draft_dtype = draft_dtype + response.device = device + response.vocab_size = vocab_size + response.communicate_logits = communicate_logits + response.communicate_cache_hits = communicate_cache_hits + response.tokenizer = tokenizer + if response.communicate_logits: + assert response.vocab_size > 0, "vocab_size must be set when communicate_logits is True" + response._alloc_buffers() + return response + + def _alloc_buffers(self): + self.speculations = torch.empty(self.batch_size, self.lookahead, dtype=torch.int64, device=self.device) + if getattr(self, 'communicate_logits', False): + self.logits_q = torch.empty(self.batch_size, self.lookahead, self.vocab_size, dtype=self.draft_dtype, device=self.device) + if getattr(self, 'communicate_cache_hits', False): + self.cache_hits = torch.zeros(self.batch_size, dtype=torch.int64, device=self.device) + + def maybe_update_buffers(self, batch_size: int = -1): + if batch_size > 0 and batch_size != self.batch_size: + self.batch_size = batch_size + self._alloc_buffers() + + def send(self, async_pg: dist.ProcessGroup, target_rank: int, tokenizer: AutoTokenizer = None): + send_tensor(self.speculations, async_pg, target_rank, name="speculations", prefix="DRAFT:SpeculationResponse.send") + + if BRIEF_LOG: + decoded_speculations = _decode_ids(self.speculations, tokenizer) + print(f"[{_ts()}] [SpeculationResponse.send] SPECULATION: '{decoded_speculations}'", flush=True) + print(f"[{_ts()}] {'='*80}\n", flush=True) + + if self.logits_q is not None: + assert getattr(self, 'communicate_logits', True), "logits_q is not None but communicate_logits is False" + send_tensor(self.logits_q, async_pg, target_rank, name="logits", prefix="DRAFT:SpeculationResponse.send") + if self.cache_hits is not None: + assert getattr(self, 'communicate_cache_hits', True), "cache_hits is not None but communicate_cache_hits is False" + send_tensor(self.cache_hits, async_pg, target_rank, name="cache hits", prefix="DRAFT:SpeculationResponse.send") + + self.dump() + + def dump(self): + dump_dir = os.environ.get("SSD_DUMP_TENSORS_DIR", "") + if dump_dir: + torch.save({ + 'speculations': self.speculations.cpu(), + 'logits': self.logits_q.cpu() if self.logits_q is not None else None, + 'cache_hits': self.cache_hits.cpu() if self.cache_hits is not None else None, + }, f"{dump_dir}/speculation_response_{_dump_ts()}.pt") + + @classmethod + def receive( + cls, + async_pg: dist.ProcessGroup, + draft_rank: int, + batch_size: int, + lookahead: int, + device: torch.device, + draft_dtype: torch.dtype = torch.bfloat16, + receive_logits: bool = False, + receive_cache_hits: bool = False, + vocab_size: int = -1, + tokenizer: AutoTokenizer = None, + ): + speculation_response = cls.prepare( + batch_size=batch_size, + lookahead=lookahead, + device=device, + draft_dtype=draft_dtype, + communicate_logits=receive_logits, + communicate_cache_hits=receive_cache_hits, + vocab_size=vocab_size, + tokenizer=tokenizer, ) + speculation_response.receive(async_pg, draft_rank, batch_size=batch_size) + return speculation_response + + def receive(self, async_pg: dist.ProcessGroup, draft_rank: int, batch_size: int=-1): + self.maybe_update_buffers(batch_size=batch_size) + self.speculations = receive_tensor(self.speculations, async_pg, draft_rank, name="speculations", prefix="TARGET:SpeculationResponse.receive") + if self.communicate_logits: + self.logits_q = receive_tensor(self.logits_q, async_pg, draft_rank, name="logits", prefix="TARGET:SpeculationResponse.receive") + if self.communicate_cache_hits: + self.cache_hits = receive_tensor(self.cache_hits, async_pg, draft_rank, name="cache hits", prefix="TARGET:SpeculationResponse.receive") + + +def _decode_ids(ids_tensor, tokenizer: AutoTokenizer = None): + if tokenizer is None: + return "" + if isinstance(ids_tensor, int): + ids = [ids_tensor] + else: + ids = ids_tensor.cpu().tolist() + if isinstance(ids, int): + ids = [ids] + return tokenizer.decode(ids) + + +def concat_tensors_as_int64(*tensors: torch.Tensor) -> torch.Tensor: + """Concatenate tensors into a single flat int64 payload.""" + parts = [] + for t in tensors: + if t is None: + continue + if t.dtype != torch.int64: + t = t.to(torch.int64) + parts.append(t.reshape(-1)) + if not parts: + return torch.empty(0, dtype=torch.int64) + return torch.cat(parts, dim=0) + + +def receive_tensor( + tensor: torch.Tensor, + async_pg: dist.ProcessGroup, + draft_runner_rank: int, + name: str = "", + prefix: str = "", + print_shape: bool = True, + print_values: bool = False, +) -> torch.Tensor: + prefix = f"[{prefix:>35}]" if prefix else "" + if NCCL_LOG: + tensor_str = f"{name:>30}" if name else "" + if print_shape: + tensor_str += (", " if tensor_str else "") + f"shape={tensor.shape}" + print(f"[{_ts()}][NCCL:START_RECEIVE_TENSOR]{prefix} {tensor_str}", flush=True) + + dist.recv(tensor, src=draft_runner_rank, group=async_pg) + + if NCCL_LOG: + if print_values: + tensor_str += (", " if tensor_str else "") + f"values={tensor.tolist()}" + print(f"[{_ts()}][NCCL: END_RECEIVE_TENSOR]{prefix} {tensor_str}", flush=True) + + return tensor + + +def send_tensor( + tensor: torch.Tensor, + async_pg: dist.ProcessGroup, + draft_runner_rank: int, + name: str = "", + prefix: str = "", + print_shape: bool = True, + print_values: bool = False, +) -> None: + prefix = f"[{prefix:>35}]" if prefix else "" + if NCCL_LOG: + tensor_str = f"{name:>30}" if name else "" + if print_shape: + tensor_str += (", " if tensor_str else "") + f"shape={tensor.shape}" + print(f"[{_ts()}][NCCL: START_SEND_TENSOR]{prefix} {tensor_str}", flush=True) + + dist.send(tensor, dst=draft_runner_rank, group=async_pg) + + if NCCL_LOG: + if print_values: + tensor_str += (", " if tensor_str else "") + f"values={tensor.tolist()}" + print(f"[{_ts()}][NCCL: END_SEND_TENSOR]{prefix} {tensor_str}", flush=True) - return cmd, metadata, input_ids_flat, num_tokens, draft_block_table, eagle_acts def prepare_decode_tensors_from_seqs( seqs: list[Sequence], @@ -96,6 +654,7 @@ def prepare_decode_tensors_from_seqs( slot_mapping.append( block_id * block_size + pos_in_block) + input_ids = torch.tensor( input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True) positions = torch.tensor( diff --git a/ssd/engine/llm_engine.py b/ssd/engine/llm_engine.py index a1015989b..7b40ed071 100644 --- a/ssd/engine/llm_engine.py +++ b/ssd/engine/llm_engine.py @@ -14,14 +14,15 @@ from ssd.engine.verifier import Verifier import atexit +import weakref from dataclasses import fields from time import perf_counter from tqdm.auto import tqdm from transformers import AutoTokenizer +import torch.distributed as dist import torch.multiprocessing as mp - METRICS = { "cache_hits": [], "accepted_suffix_lens_with_recovery": [], @@ -33,7 +34,11 @@ "decode_total_tokens": 0, "target_step_times": [], "target_verify_times": [], + # Per-step accept trace: enabled by tests when SSD_TRACE_ACCEPTS=1. + # See verifier.verify(); each step is a list of (seq_id, suffix, recovery). } +if os.environ.get("SSD_TRACE_ACCEPTS", "0") == "1": + METRICS["per_step_accepts"] = [] class LLMEngine: @@ -45,8 +50,6 @@ def __init__(self, model, **kwargs): self.config = config Sequence.block_size = config.kvcache_block_size - assert config.kvcache_block_size >= ( - 2 * config.speculate_k + 2), "ERROR: support for block size < 2*k+2 is not implemented" assert config.num_gpus > 1 or not config.draft_async, "ERROR: draft_async requires at least 2 gpus" # Check that target and draft are from the same family @@ -83,7 +86,12 @@ def __init__(self, model, **kwargs): init_q = ctx.Queue() draft_rank = config.num_gpus - 1 self.draft_ps = ctx.Process( - target=DraftRunner, args=(config, draft_rank, init_q)) + target=DraftRunner, args=( + DraftRunner.create_draft_config(config), + draft_rank, + init_q, + ), + ) self.draft_ps.start() print( f'Draft runner created on rank {draft_rank} (async)!', flush=True) @@ -94,11 +102,25 @@ def __init__(self, model, **kwargs): # do this after so we can launch model runner above so that the q is actually populated if config.speculate and config.draft_async: + _timeout_s = 1200 # 20 minutes + _banner = "=" * 80 + print( + f'\n{_banner}\n' + f'>>> TARGET: WAITING for draft runner to send kv_cache_size (timeout={_timeout_s}s) ...\n' + f'{_banner}\n', + flush=True, + ) try: - num_blocks = init_q.get(timeout=180) # seconds + num_blocks = init_q.get(timeout=_timeout_s) except Exception as e: raise RuntimeError( - "ERROR: Timed out waiting for draft kv cache size") from e + f"ERROR: Timed out after {_timeout_s}s waiting for draft kv cache size") from e + print( + f'\n{_banner}\n' + f'>>> TARGET: Received draft kv_cache_size={num_blocks}!\n' + f'{_banner}\n', + flush=True, + ) init_q.close() self.draft_cfg = DraftRunner.create_draft_config(config) @@ -109,7 +131,7 @@ def __init__(self, model, **kwargs): if config.speculate and not config.draft_async: # keep it colocated on rank 0, process/dist agnostic in this case - self.draft_runner = DraftRunner(config) + self.draft_runner = DraftRunner(DraftRunner.create_draft_config(config)) self.draft_cfg = self.draft_runner.draft_cfg print(f'Draft runner created on rank 0 (no async)', flush=True) @@ -121,7 +143,15 @@ def __init__(self, model, **kwargs): print(f"[LLMEngine] finished llm_engine init", flush=True) self._exiting = False - atexit.register(lambda: self.exit(hard=True)) + # Use a weakref so `del llm` can actually release the engine (and its + # GPU tensors on target rank 0) before process exit. A direct closure + # over `self` keeps the engine alive for the whole process lifetime. + _weak_self = weakref.ref(self) + def _atexit_cleanup(): + obj = _weak_self() + if obj is not None: + obj.exit(hard=True) + atexit.register(_atexit_cleanup) def exit(self, hard: bool = True): print(f"[LLMEngine] Exiting (hard={hard})", flush=True) @@ -135,10 +165,11 @@ def exit(self, hard: bool = True): self.model_runner.send_draft_exit_signal() except Exception: pass - # 2) Tell all target ranks (including rank 0 self) to exit (non-blocking cleanup, no os._exit inside) + # 2) Tell all target ranks (including rank 0 self) to exit (non-blocking cleanup, no os._exit inside). + # Forward `hard` so soft exits actually destroy process groups; otherwise the next test + # in the same process gets "trying to initialize the default process group twice". try: - self.model_runner.call("exit", - True if not self.config.draft_async else True) + self.model_runner.call("exit", hard) except Exception: pass # 3) Wait briefly for TP workers; terminate if still around @@ -190,11 +221,13 @@ def add_request(self, prompt: str | list[int], sampling_params: SamplingParams): self.scheduler.add(seq) - def step(self, step: InferenceStep): + def step(self, step: InferenceStep, step_num: int): t = perf_counter() seqs, is_prefill = self.scheduler.schedule() - ttl_tokens = step.prefill(seqs) if is_prefill else step.decode(seqs) - + ttl_tokens = ( + step.prefill(seqs, step_num=step_num) if is_prefill else + step.decode(seqs, step_num=step_num) + ) time_taken = perf_counter() - t if is_prefill: @@ -239,35 +272,48 @@ def log_metrics(self): print( f"[metrics] Avg target verify time (ms): {sum(METRICS['target_verify_times']) * 1000 / len(METRICS['target_verify_times']):.2f}", flush=True) if self.config.draft_async: - print( - f"[metrics] Avg Cache Hits: {sum(METRICS['cache_hits']) / len(METRICS['cache_hits']):.2f}", flush=True) - # Log separate metrics for cache hits - if METRICS['accepted_suffix_lens_on_hit']: - avg_suffix_len_on_hit = sum( - METRICS['accepted_suffix_lens_on_hit']) / len(METRICS['accepted_suffix_lens_on_hit']) - print( - f"[metrics] Avg Tokens per step on Cache Hit: {avg_suffix_len_on_hit:.2f}", flush=True) - - # Calculate empirical frequencies of accepted_suffix_lens_on_hit - 1 - adjusted_lens = [length - 1 for length in METRICS['accepted_suffix_lens_on_hit']] - total_count = len(adjusted_lens) - freq_counts = {} - for length in adjusted_lens: - freq_counts[length] = freq_counts.get(length, 0) + 1 - - # Print normalized empirical probabilities for range [0, K] - print(f"[metrics] Empirical frequencies of accepted_suffix_lens_on_hit - 1:", flush=True) - for k in range(self.config.speculate_k + 1): - prob = freq_counts.get(k, 0) / total_count - print(f" {k}: {prob:.3f}", flush=True) - if METRICS['accepted_suffix_lens_on_miss']: - avg_suffix_len_on_miss = sum( - METRICS['accepted_suffix_lens_on_miss']) / len(METRICS['accepted_suffix_lens_on_miss']) - print( - f"[metrics] Avg Tokens per step on Cache Miss: {avg_suffix_len_on_miss:.2f}", flush=True) + if METRICS['accepted_suffix_lens_with_recovery']: + print(f"[metrics] Avg Tokens per step (incl recovery): {sum(METRICS['accepted_suffix_lens_with_recovery']) / len(METRICS['accepted_suffix_lens_with_recovery']):.2f}", flush=True) + else: + print(f"[metrics] Avg Tokens per step (incl recovery): N/A (THIS MAY INDICATE A BUG)", flush=True) + + if not self.config.communicate_cache_hits: + # TODO: Compute these metrics on the draft side? + print(f"Skipping metrics based on cache hits vs misses because communicate_cache_hits is False", flush=True) else: print( - f"[metrics] Avg Tokens per step on Cache Hit: N/A (no cache hits)", flush=True) + f"[metrics] Avg Cache Hits: {sum(METRICS['cache_hits']) / len(METRICS['cache_hits']):.2f}", flush=True) + # Log separate metrics for cache hits + if METRICS['accepted_suffix_lens_on_hit']: + avg_suffix_len_on_hit = sum( + METRICS['accepted_suffix_lens_on_hit']) / len(METRICS['accepted_suffix_lens_on_hit']) + print( + f"[metrics] Avg Tokens per step on Cache Hit: {avg_suffix_len_on_hit:.2f}", flush=True) + + # Calculate empirical frequencies of accepted_suffix_lens_on_hit - 1 + adjusted_lens = [length - 1 for length in METRICS['accepted_suffix_lens_on_hit']] + total_count = len(adjusted_lens) + freq_counts = {} + for length in adjusted_lens: + freq_counts[length] = freq_counts.get(length, 0) + 1 + + # Print normalized empirical probabilities for range [0, K] + print(f"[metrics] Empirical frequencies of accepted_suffix_lens_on_hit - 1:", flush=True) + for k in range(self.config.speculate_k + 1): + prob = freq_counts.get(k, 0) / total_count + print(f" {k}: {prob:.3f}", flush=True) + else: + print( + f"[metrics] Avg Tokens per step on Cache Hit: N/A (no cache hits)", flush=True) + + if METRICS['accepted_suffix_lens_on_miss']: + avg_suffix_len_on_miss = sum( + METRICS['accepted_suffix_lens_on_miss']) / len(METRICS['accepted_suffix_lens_on_miss']) + print( + f"[metrics] Avg Tokens per step on Cache Miss: {avg_suffix_len_on_miss:.2f}", flush=True) + else: + print( + f"[metrics] Avg Tokens per step on Cache Miss: N/A (no cache misses)", flush=True) def create_inference_step(self, config: Config) -> InferenceStep: if config.speculate: @@ -281,6 +327,10 @@ def create_inference_step(self, config: Config) -> InferenceStep: draft_dtype=config.draft_hf_config.torch_dtype, kvcache_block_size=config.kvcache_block_size, max_model_len=config.max_model_len, + eagle=config.use_eagle_or_phoenix, + eagle_act_dim=self.model_runner.eagle_acts_dim if config.use_eagle_or_phoenix else 0, + communicate_logits=config.communicate_logits, + communicate_cache_hits=config.communicate_cache_hits, async_pg=self.model_runner.async_pg, draft_runner_rank=self.num_tp_gpus, tokenizer=self.tokenizer, @@ -307,7 +357,7 @@ def create_inference_step(self, config: Config) -> InferenceStep: scheduler=self.scheduler, speculator=speculator, verifier=verifier, - eagle=config.use_eagle, + eagle=config.use_eagle_or_phoenix, tokenizer=self.tokenizer, async_spec=config.draft_async, ) @@ -325,8 +375,6 @@ def generate( use_tqdm: bool = True, stream_callback=None, ) -> list[str]: - for k in METRICS: - METRICS[k] = [] if isinstance(METRICS[k], list) else 0 if use_tqdm: pbar = tqdm(total=len(prompts), @@ -349,7 +397,7 @@ def generate( ) i += 1 t = perf_counter() - output = self.step(inference_step) + output = self.step(inference_step, i - 1) time_taken = perf_counter() - t METRICS["target_step_times"].append(time_taken) diff --git a/ssd/engine/model_runner.py b/ssd/engine/model_runner.py index 1f268c8e5..f945ff4d1 100644 --- a/ssd/engine/model_runner.py +++ b/ssd/engine/model_runner.py @@ -1,25 +1,29 @@ import pickle import time +from datetime import datetime, timedelta import torch import torch.distributed as dist from multiprocessing.synchronize import Event from multiprocessing.shared_memory import SharedMemory from transformers import AutoTokenizer, AutoConfig import os -import flashinfer from ssd.config import Config from ssd.engine.sequence import Sequence from ssd.models.qwen3 import Qwen3ForCausalLM from ssd.models.llama3 import LlamaForCausalLM from ssd.models.eagle3_draft_llama3 import Eagle3DraftForCausalLM +from ssd.models.phoenix_draft_llama3 import PhoenixLlamaForCausalLM from ssd.layers.sampler import Sampler from ssd.utils.context import set_context, reset_context, get_context from ssd.utils.loader import load_model from ssd.engine.helpers.runner_helpers import ( + COMMAND, prepare_decode_tensors_from_seqs, prepare_block_tables_from_seqs, - prepare_prefill_tensors_from_seqs + prepare_prefill_tensors_from_seqs, + receive_tensor, + send_tensor, ) from ssd.engine.helpers.cudagraph_helpers import ( run_verify_cudagraph, @@ -30,9 +34,13 @@ capture_verify_cudagraph, capture_fi_tree_decode_cudagraph, capture_glue_decode_cudagraph, - get_custom_mask, ) - + +NCCL_LOG = os.environ.get("SSD_NCCL_LOG", "0") == "1" + +def _ts(): + return f'[[{datetime.now().strftime('%H:%M:%S.%f')[:-3]}]]' + class ModelRunner: @@ -48,18 +56,18 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event], is_dra print(f"Warning: Draft dtype {config.draft_hf_config.torch_dtype} differs from target {config.hf_config.torch_dtype}. Casting draft to {config.hf_config.torch_dtype}.") config.draft_hf_config.torch_dtype = config.hf_config.torch_dtype assert (config.draft_hf_config.vocab_size == config.hf_config.vocab_size) or config.use_eagle, "ERROR in ModelRunner: draft_hf_config.vocab_size != hf_config.vocab_size" - + self.hf_config = config.hf_config if not is_draft else config.draft_hf_config self.block_size = config.kvcache_block_size self.enforce_eager = config.enforce_eager - self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path if config.tokenizer_path else config.model, use_fast=True) + self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path if config.tokenizer_path else config.model, use_fast=True, trust_remote_code=True) self.max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size assert self.hf_config is not None, "ERROR in ModelRunner: hf_config is None" # this implies boundedness to the end # TODO: Get rid of this. if self.is_draft: - should_use_dist = self.config.draft_async + should_use_dist = self.config.draft_async and self.config.async_nccl_port is None else: should_use_dist = self.config.num_gpus > 1 @@ -67,6 +75,7 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event], is_dra self.world_size = config.num_gpus if should_use_dist else 1 self.rank = rank self.use_eagle = config.use_eagle + self.use_phoenix = config.use_phoenix if config.draft_async: self.draft_rank = config.num_gpus - 1 @@ -86,12 +95,10 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event], is_dra self._exiting = False torch.cuda.set_device(self.rank) - self.device = torch.device(f'cuda:{self.rank}') - - # cudagraph logic for FlashInfer kernels, need diff wrapper for each batch size we make a graph for - if is_draft and config.draft_async: - self._init_flashinfer_wrappers() - + self.device = torch.device(f'cuda:{self.rank}') + self._cmd = torch.empty(1, dtype=torch.int64, device=self.device) + + if self.verbose: print(f'INSIDE MODEL RUNNER INIT, DRAFT={is_draft}', flush=True) self.tp_pg = None @@ -114,7 +121,7 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event], is_dra assert num_tp_gpus == 1, "ERROR in ModelRunner: draft should have tp_size=1" self.tp_pg = None # every rank is given an object from self.tp_pg, even tho draft doesnt participate it gets GROUP_NON_MEMBER object != None back, so we can't assert None here, we - print(f'[model_runner] about to setup and warmup model and cudagraphs, is use_eagle={self.use_eagle}', flush=True) + print(f'[model_runner] about to setup and warmup model and cudagraphs, is use_eagle={self.use_eagle}, is use_phoenix={self.use_phoenix}', flush=True) model_type = self.setup_and_warmup_model_and_cudagraphs(config, self.hf_config, init_q, is_draft) if self.verbose: print(f'-----CAPTURED {model_type}CUDAGRAPH----', flush=True) @@ -156,56 +163,6 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event], is_dra if self.verbose: print(f'-----{model_type}MODEL RUNNER INITIALIZED----', flush=True) - def _init_flashinfer_wrappers(self): - """Initialize FlashInfer wrappers for draft async mode.""" - self.workspace_buffer = torch.zeros( - 512 * 1024 * 1024, dtype=torch.uint8, device=f"cuda:{self.rank}") - - if self.config.enforce_eager: - self.only_prefill_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(self.workspace_buffer, "NHD") - else: - max_bs = min(self.config.max_num_seqs, 512) - max_num_blocks = (self.config.max_model_len + self.block_size - 1) // self.block_size - - # FlashInfer kernel tensors - # pages_for_max_len = (self.config.max_model_len + self.block_size - 1) // self.block_size - last_page_len_max_len = self.config.max_model_len % self.block_size - last_page_len_max_len = self.block_size if last_page_len_max_len == 0 else last_page_len_max_len - MQ_LEN = self.config.async_fan_out * (self.config.speculate_k + 1) - - cu_seqlens_q = torch.empty(max_bs + 1, dtype=torch.int32, device=self.device) - kv_indptr = torch.empty(max_bs + 1, dtype=torch.int32, device=self.device) - kv_indices = torch.empty(max_bs * max_num_blocks, dtype=torch.int32, device=self.device) - kv_last_page_len = torch.empty(max_bs, dtype=torch.int32, device=self.device) - custom_mask_buf = torch.empty(max_bs * MQ_LEN * self.config.max_model_len, dtype=torch.uint8, device=self.device) - mask_indptr_buf = torch.empty(max_bs + 1, dtype=torch.int32, device=self.device) - - # Create graph_bs_list to match what will be used in cudagraph_helpers.py - graph_bs_list = [1] - for bs in [2, 4, 8] + list(range(16, max_bs + 1, 16)): - if bs <= max_bs: - graph_bs_list.append(bs) - if max_bs not in graph_bs_list: - graph_bs_list.append(max_bs) - graph_bs_list.sort() - - # Create a dict of wrappers, one for each bs we will touch in cudagraph_helpers.py - self.prefill_wrappers = {} - print(f'[model_runner about to wrapper.init()] graph_bs_list={graph_bs_list}', flush=True) - for bs in graph_bs_list: - self.prefill_wrappers[bs] = flashinfer.BatchPrefillWithPagedKVCacheWrapper( - self.workspace_buffer, "NHD", - use_cuda_graph=True, - qo_indptr_buf=cu_seqlens_q[:bs + 1], - paged_kv_indptr_buf=kv_indptr[:bs + 1], - paged_kv_indices_buf=kv_indices[:bs * max_num_blocks], - paged_kv_last_page_len_buf=kv_last_page_len[:bs], - custom_mask_buf=custom_mask_buf[:bs * MQ_LEN * self.config.max_model_len], - mask_indptr_buf=mask_indptr_buf[:bs + 1], - ) - print(f'wrapper backend is {self.prefill_wrappers[bs]._backend}', flush=True) - - def setup_and_warmup_model_and_cudagraphs(self, config: Config, hf_config: AutoConfig, init_q=None, is_draft=False): # cudagraphs self.graph_vars = {} @@ -217,6 +174,9 @@ def setup_and_warmup_model_and_cudagraphs(self, config: Config, hf_config: AutoC if config.use_eagle and is_draft: print(f'[EAGLE3] Loading Eagle3DraftForCausalLM as model_class', flush=True) model_class = Eagle3DraftForCausalLM + elif config.use_phoenix and is_draft: + print(f'[PHOENIX] Loading PhoenixDraftForCausalLM as model_class', flush=True) + model_class = PhoenixLlamaForCausalLM elif hf_config.model_type == 'llama': model_class = LlamaForCausalLM elif hf_config.model_type == 'qwen3': @@ -236,11 +196,12 @@ def setup_and_warmup_model_and_cudagraphs(self, config: Config, hf_config: AutoC tp_size=self.num_tp_gpus, ) - if config.use_eagle: - kwargs['use_eagle'] = True + if config.use_eagle_or_phoenix: + kwargs['use_eagle'] = config.use_eagle + kwargs['use_phoenix'] = config.use_phoenix kwargs['eagle_layers'] = self.config.eagle_layers - - if model_class == Eagle3DraftForCausalLM: + + if model_class in [Eagle3DraftForCausalLM, PhoenixLlamaForCausalLM]: kwargs['d_model_target'] = config.d_model_target kwargs['debug_mode'] = config.debug_mode @@ -256,7 +217,37 @@ def setup_and_warmup_model_and_cudagraphs(self, config: Config, hf_config: AutoC load_model(self.model, config.model, target_path=target_path, target_hidden_size=target_hidden_size) if config.draft_async: # move this here so we don't get a timeout waiting for draft rank while load_model happens? - self.async_pg = dist.new_group(ranks=[0, self.draft_rank]) + if config.async_nccl_port is not None: + _nccl_timeout = timedelta(minutes=20) + _banner = "=" * 80 + print( + f'\n{_banner}\n' + f'>>> DRAFT: WAITING for target server at ' + f'{config.async_nccl_host}:{config.async_nccl_port} ' + f'to form NCCL process group (timeout={_nccl_timeout}) ...\n' + f'{_banner}\n', + flush=True, + ) + from torch.distributed import TCPStore + from ssd.utils.dist_utils import init_custom_process_group + store = TCPStore(config.async_nccl_host, port=config.async_nccl_port, + world_size=2, is_master=False, + timeout=_nccl_timeout) + with torch.cuda.device(self.device): + self.async_pg = init_custom_process_group( + backend="nccl", store=store, world_size=2, rank=1, + group_name="async_spec", timeout=_nccl_timeout) + print(f'\n{_banner}\n>>> DRAFT: NCCL process group formed! Now receiving kv_cache_size...\n{_banner}\n', flush=True) + # Cross-node: receive kv_cache_size from target so draft + # allocates the same number of KV cache blocks. + kv_buf = torch.empty(1, dtype=torch.int64, device=self.device) + kv_buf = receive_tensor(kv_buf, self.async_pg, 0, name="target kv_cache_size") + target_kv_cache_size = kv_buf.item() + print(f'[model_runner] Received target kv_cache_size={target_kv_cache_size} via NCCL', flush=True) + if target_kv_cache_size > 0: + config.num_kvcache_blocks = target_kv_cache_size + else: + self.async_pg = dist.new_group(ranks=[0, self.draft_rank]) if self.verbose: print(f'-----{model_type}MODEL LOADED----', flush=True) if config.sampler_x is not None: @@ -264,25 +255,20 @@ def setup_and_warmup_model_and_cudagraphs(self, config: Config, hf_config: AutoC assert sum(config.fan_out_list) == sum(config.fan_out_list_miss) == config.async_fan_out * (config.speculate_k + 1), "ERROR in ModelRunner: fancy sampling only supported for constant fan out for now." self.sampler = Sampler(sampler_x=config.sampler_x, async_fan_out=config.async_fan_out) - if self.verbose: - print(f'-----WARMING UP {model_type}MODEL----', flush=True) + print(f'[model_runner] Warming up {model_type}model...', flush=True) self.warmup_model() - if self.verbose: - print(f'-----ALLOCATING {model_type}KV CACHE----', flush=True) + print(f'[model_runner] Allocating {model_type}KV cache...', flush=True) self.allocate_kv_cache() - if init_q is not None: - # super().__init__() runs warmup and calculates num_kvcache_blocks, pass that up - init_q.put(self.config.num_kvcache_blocks) - init_q.close() if not self.enforce_eager: - # if not self.is_draft or (self.is_draft and self.config.draft_async and self.config.speculate): + print(f'[model_runner] Capturing CUDA graphs for {model_type}model...', flush=True) + # if not self.is_draft or (self.is_draft and self.config.draft_async and self.config.speculate): decode_graph_vars, decode_graph_pool, decode_graphs, decode_graph_bs_list = capture_cudagraph(self) # decode cudagraph, draft needs in spec and target in normal self.graph_vars["decode"] = decode_graph_vars self.graph_pools["decode"] = decode_graph_pool self.graphs["decode"] = decode_graphs self.graph_bs_list["decode"] = decode_graph_bs_list - if self.config.speculate and not (self.is_draft and self.config.use_eagle): # verify CG: target always, non-EAGLE draft for fan-out; EAGLE draft uses glue_decode CG instead + if self.config.speculate and not (self.is_draft and self.config.use_eagle_or_phoenix): # verify CG: target always, non-EAGLE draft for fan-out; EAGLE draft uses glue_decode CG instead verify_graph_vars, verify_graph_pool, verify_graphs, verify_graph_bs_list = capture_verify_cudagraph(self) self.graph_vars["verify"] = verify_graph_vars self.graph_pools["verify"] = verify_graph_pool @@ -294,13 +280,27 @@ def setup_and_warmup_model_and_cudagraphs(self, config: Config, hf_config: AutoC self.graph_pools["fi_tree_decode"] = fi_tree_decode_graph_pool self.graphs["fi_tree_decode"] = fi_tree_decode_graphs self.graph_bs_list["fi_tree_decode"] = fi_tree_decode_graph_bs_list - if self.config.speculate and self.is_draft and self.config.draft_async and self.config.use_eagle: + if self.config.speculate and self.is_draft and self.config.draft_async and self.config.use_eagle_or_phoenix: glue_gv, glue_pool, glue_graphs, glue_bs_list = capture_glue_decode_cudagraph(self) self.graph_vars["glue_decode"] = glue_gv self.graph_pools["glue_decode"] = glue_pool self.graphs["glue_decode"] = glue_graphs self.graph_bs_list["glue_decode"] = glue_bs_list + print(f'[model_runner] {model_type}model initialization complete.', flush=True) + if init_q is not None: + # Signal the scheduler that we're fully initialized (model loaded, + # KV cache allocated, CUDA graphs captured). Must happen after + # CUDA graph capture so the scheduler doesn't send NCCL requests + # before the draft runner enters its recv loop. + init_q.put(self.config.num_kvcache_blocks) + init_q.close() + elif self.is_draft and self.draft_async and hasattr(self, 'async_pg'): + # Cross-node mode: no mp.Queue available, signal readiness via NCCL. + ready_buf = torch.tensor([self.config.num_kvcache_blocks], dtype=torch.int64, device=self.device) + send_tensor(ready_buf, self.async_pg, 0, name="num_kvcache_blocks") + print(f'[model_runner] Cross-node init: sent num_kvcache_blocks={self.config.num_kvcache_blocks} via NCCL', flush=True) + return model_type def exit(self, hard: bool = True): @@ -315,20 +315,23 @@ def exit(self, hard: bool = True): self.send_draft_exit_signal() except Exception: pass - # 2) Best-effort local cleanup (no collectives; avoid group destroys in hard mode) + # 2) Best-effort local cleanup (no collectives; avoid group destroys in hard mode). + # Drop GPU tensors so main-process ranks (target rank 0) actually release + # model weights and KV cache — otherwise a subsequent engine or subprocess + # on the same GPU will OOM. try: - if not self.enforce_eager and hasattr(self, "graphs"): - del self.graphs - if hasattr(self, "graph_pool"): - del self.graph_pool - if hasattr(self, "verify_graphs"): - del self.verify_graphs - if hasattr(self, "verify_graph_pool"): - del self.verify_graph_pool - if hasattr(self, "glue_graphs"): - del self.glue_graphs - if hasattr(self, "glue_graph_pool"): - del self.glue_graph_pool + for attr in ( + "graphs", "graph_pools", "graph_vars", "graph_bs_list", + "prefill_wrappers", "only_prefill_wrapper", "workspace_buffer", + "verify_graphs", "verify_graph_pool", + "glue_graphs", "glue_graph_pool", + "model", "kv_cache", "sampler", + ): + if hasattr(self, attr): + setattr(self, attr, None) + import gc + gc.collect() + torch.cuda.empty_cache() except Exception: pass # Close SHM on all ranks that have it @@ -356,7 +359,7 @@ def exit(self, hard: bool = True): pass try: # Default group - if self.world_size > 1 or (self.draft_async and self.is_draft): + if (self.world_size > 1 or (self.draft_async and self.is_draft)) and self.config.async_nccl_port is None: dist.destroy_process_group() except Exception: pass @@ -378,16 +381,6 @@ def loop(self): self.call(method_name, *args) if method_name == "exit": break - - def recv_cmd(self): - t = torch.empty(1, dtype=torch.int64, device=self.device) - dist.recv(t, src=0, group=self.async_pg) - return int(t.item()) - - def recv_tensor(self, shape, dtype=torch.int64): - t = torch.empty(shape, dtype=dtype, device=self.device) - dist.recv(t, src=0, group=self.async_pg) - return t def send_draft_exit_signal(self): """ @@ -398,9 +391,30 @@ def send_draft_exit_signal(self): return try: cmd = torch.tensor([2], dtype=torch.int64, device=self.device) - dist.send(cmd, dst=self.draft_rank, group=self.async_pg) + send_tensor(cmd, self.async_pg, self.draft_rank, name="draft exit signal") except Exception: + if NCCL_LOG: + print(f"[{_ts()}] [NCCL_LOG SEND_DRAFT_EXIT_SIGNAL] ERROR SENDING DRAFT EXIT SIGNAL", flush=True) pass + + def _wait_for_cmd(self, handle_entry=None): + """Waits for a command, using the provided handle if available.""" + if handle_entry: + if NCCL_LOG: + print(f"[{_ts()}] [NCCL_LOG WAIT_FOR_CMD] WAITING FOR CMD", flush=True) + + work_handle, cmd_tensor = handle_entry + # block until the irecv completes and the buffer is filled + work_handle.wait() + else: + # no pending irecv, fall back to the normal recv path + cmd_tensor = receive_tensor(self._cmd, self.async_pg, 0, name="cmd", prefix="DRAFT:wait_for_cmd") + + command = COMMAND(cmd_tensor.item()) + if NCCL_LOG: + print(f"[{_ts()}] [NCCL_LOG WAIT_FOR_CMD] CMD RECEIVED: {command}", flush=True) + return command, None + def read_shm(self): assert self.world_size > 1 and self.rank self.event.wait() @@ -435,10 +449,15 @@ def warmup_model(self): seqs = [Sequence([0] * max_model_len) for _ in range(num_seqs)] hidden_states = None - if self.config.use_eagle and self.is_draft: + if self.config.use_eagle_or_phoenix and self.is_draft: num_tokens = num_seqs * max_model_len d_model_target = self.config.d_model_target or 4096 - hidden_states = torch.zeros(num_tokens, 3 * d_model_target, dtype=self.hf_config.torch_dtype, device=self.device) + if self.config.use_eagle: + hidden_states = torch.zeros(num_tokens, 3 * d_model_target, dtype=self.hf_config.torch_dtype, device=self.device) + elif self.config.use_phoenix: + hidden_states = torch.zeros(num_tokens, d_model_target, dtype=self.hf_config.torch_dtype, device=self.device) + else: + raise ValueError(f"Unsupported model type: {self.config.use_eagle_or_phoenix}") self.run(seqs, True, hidden_states=hidden_states) torch.cuda.empty_cache() @@ -472,7 +491,10 @@ def allocate_kv_cache(self): usable_bytes = max(usable_bytes - reserved_bytes, 0) assert usable_bytes > 0, "ERROR: Not enough memory for draft KV cache after accounting for tree_cache for logits storage" - config.num_kvcache_blocks = int(usable_bytes) // block_bytes + if config.num_kvcache_blocks is not None and config.num_kvcache_blocks > 0: + config.num_kvcache_blocks = min(config.num_kvcache_blocks, int(usable_bytes) // block_bytes) + else: + config.num_kvcache_blocks = int(usable_bytes) // block_bytes if self.verbose: print(f'KV CACHE ALLOCATION for {"TARGET" if not self.is_draft else "DRAFT"} model', flush=True) print(f' free={free/1e9:.2f}GB, util={config.gpu_memory_utilization:.2f}', flush=True) @@ -489,17 +511,23 @@ def allocate_kv_cache(self): num_kv_heads, hf_config.head_dim, ) - + print(f"allocate_kv_cache(): kv_cache shape = {self.kv_cache.shape}", flush=True) + + # Create tree_score_mod once (shared across all attention layers) + tree_score_mod = None + if self.is_draft and self.draft_async: + from ssd.layers.tree_mask import create_tree_score_mod + tree_score_mod = create_tree_score_mod(config.max_model_len) + layer_id = 0 for module in self.model.modules(): if hasattr(module, "k_cache") and hasattr(module, "v_cache"): module.k_cache = self.kv_cache[0, layer_id] module.v_cache = self.kv_cache[1, layer_id] - if self.is_draft and self.draft_async and not self.enforce_eager: - module.prefill_wrappers = self.prefill_wrappers - elif self.is_draft and self.draft_async and self.enforce_eager: - module.only_prefill_wrapper = self.only_prefill_wrapper # this will make it not None so it can be used on fwd + if self.is_draft and self.draft_async: + module.max_seqlen_k = config.max_model_len + module.tree_score_mod = tree_score_mod layer_id += 1 @@ -550,47 +578,38 @@ def prepare_sample(self, seqs: list[Sequence]): return temperatures def eager_tree_decode_plan(self, input_ids, positions, step, cache_hits): - """Plan FlashInfer for tree decode in eager mode""" + """Set up context metadata for FA4 tree decode in eager mode.""" assert self.is_draft and self.config.draft_async, "ERROR in eager_tree_decode_plan: not a draft async model" + from ssd.layers.tree_mask import build_tree_mask_bias context = get_context() - - K, F = self.config.speculate_k, self.config.async_fan_out - # MQ_LEN = F * (K+1) + K = self.config.speculate_k MQ_LEN = self.config.MQ_LEN - flat_batch_size = input_ids.size(0) - B = flat_batch_size // MQ_LEN # [N] tokens = B * sum(fan_out_list) - - # Convert block_tables to FlashInfer format - block_tables = context.block_tables # [B, M] - context_lens = context.context_lens # [B] - - counts = (context_lens + self.block_size - 1) // self.block_size # [B] - kv_indptr = torch.cat([torch.tensor([0], device=block_tables.device), - counts.cumsum(dim=0)]).to(torch.int32) - mask = torch.arange(block_tables.size(1), device=block_tables.device)[None, :] < counts[:, None] - kv_indices = block_tables[mask] # flattened page ids - - # Last-page actual token count per request - kv_last_page_len = (context_lens % self.block_size) - kv_last_page_len[kv_last_page_len == 0] = self.block_size - kv_last_page_len = kv_last_page_len.to(torch.int32) - cu_seqlens_q = torch.arange(B + 1, device=self.device, dtype=torch.int32) * MQ_LEN # assumes same MQ_LEN across batch dimension - custom_mask = get_custom_mask(self.config, context_lens, step, K, F, B, device=self.device, cache_hits=cache_hits) - - self.only_prefill_wrapper.plan( - cu_seqlens_q, - kv_indptr, - kv_indices, - kv_last_page_len, - self.hf_config.num_attention_heads, - self.hf_config.num_key_value_heads, - self.hf_config.head_dim, - self.block_size, - custom_mask=custom_mask, - q_data_type=self.hf_config.torch_dtype, - kv_data_type=self.hf_config.torch_dtype, + B = input_ids.size(0) // MQ_LEN + context.tree_cu_seqlens_q = torch.arange(B + 1, device=self.device, dtype=torch.int32) * MQ_LEN + context.tree_mask_bias = build_tree_mask_bias( + context.context_lens, step=step, K=K, MQ_LEN=MQ_LEN, + fan_out_list=self.config.fan_out_list, + fan_out_list_miss=self.config.fan_out_list_miss, + cache_hits=cache_hits, + max_kv_stride=self.config.max_model_len, + device=self.device, ) + @property + def hidden_states_dim(self): + # The dimension of the hidden states that are concatenated with the draft tokens embeddings + # as the input to the Eagle/Phoenix draft model. + assert self.config.use_eagle_or_phoenix and self.is_draft + return self.config.hf_config.hidden_size if self.config.use_eagle else self.config.d_model_target + + @property + def eagle_acts_dim(self): + assert self.config.use_eagle_or_phoenix and not self.is_draft + if self.config.eagle_layers: + return len(self.config.eagle_layers) * self.config.hf_config.hidden_size + else: + return self.config.hf_config.hidden_size + @torch.inference_mode() def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool, last_only: bool = True, tree_decode_step: int = -1, cache_hits: torch.Tensor | None = None, hidden_states: torch.Tensor | None = None): is_tree_decode = self.is_draft and self.config.draft_async and tree_decode_step >= 0 @@ -603,10 +622,10 @@ def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill if is_tree_decode: self.eager_tree_decode_plan(input_ids, positions, tree_decode_step, cache_hits) - if self.config.use_eagle: + if self.config.use_eagle_or_phoenix: if self.is_draft: assert hidden_states is not None, "hidden_states required for EAGLE draft" - assert isinstance(self.model, Eagle3DraftForCausalLM) + assert isinstance(self.model, Eagle3DraftForCausalLM) or isinstance(self.model, PhoenixLlamaForCausalLM) prenorm = self.model(input_ids, positions, hidden_states) logits = self.model.compute_logits(prenorm, last_only) return logits, prenorm # return prenorm as conditioning vector for next iteration @@ -656,7 +675,7 @@ def run( # Handle EAGLE returning (logits, conditioning_vector for next iter) conditioning = None - if self.config.use_eagle: + if self.config.use_eagle_or_phoenix: logits, conditioning = self.run_model( input_ids, positions, is_prefill, last_only, hidden_states=hidden_states) else: @@ -665,7 +684,7 @@ def run( if _pt: torch.cuda.synchronize() _r2 = time.perf_counter() - print(f"[PROFILE target_run] prepare_decode={(_r1-_r0)*1000:.2f}ms run_model={(_r2-_r1)*1000:.2f}ms eagle={self.config.use_eagle} n_ids={input_ids.shape[0]}", flush=True) + print(f"[PROFILE target_run] prepare_decode={(_r1-_r0)*1000:.2f}ms run_model={(_r2-_r1)*1000:.2f}ms eagle={self.config.use_eagle}, phoenix={self.config.use_phoenix}, n_ids={input_ids.shape[0]}", flush=True) if last_only: token_ids = self.sampler(logits, temperatures).tolist() if self.rank == 0 else None @@ -678,5 +697,3 @@ def run( if conditioning is not None: return logits, conditioning return logits - - diff --git a/ssd/engine/scheduler.py b/ssd/engine/scheduler.py index b8c667aab..2907f7647 100644 --- a/ssd/engine/scheduler.py +++ b/ssd/engine/scheduler.py @@ -304,6 +304,7 @@ def postprocess_speculate( if eagle_acts is not None: accepted_len = len(new_suffix) idx = min(accepted_len - 1, eagle_acts.shape[1] - 1) + # TODO: Get rid of last_target_hidden_state field, just use extend_eagle_acts instead. seq.last_target_hidden_state = eagle_acts[i, idx] # Store extend data for next glue decode diff --git a/ssd/engine/speculator_async.py b/ssd/engine/speculator_async.py index 2334fd93a..f61d1212d 100644 --- a/ssd/engine/speculator_async.py +++ b/ssd/engine/speculator_async.py @@ -3,10 +3,9 @@ from transformers import AutoTokenizer from ssd.engine.helpers.speculate_types import SpeculateResult, VerifyResult, SpeculatorBase -from ssd.engine.helpers.runner_helpers import prepare_prefill_payload +from ssd.engine.helpers.runner_helpers import PrefillRequest, SpeculationRequest, SpeculationResponse from ssd.engine.sequence import Sequence from ssd.utils.misc import decode_tokens -from ssd.utils.async_helpers.nccl_pack import send_int64 class SpeculatorAsync(SpeculatorBase): @@ -21,6 +20,10 @@ def __init__( draft_dtype: torch.dtype, kvcache_block_size: int, max_model_len: int, + eagle: bool, + eagle_act_dim: int, + communicate_logits: bool, + communicate_cache_hits: bool, async_pg: dist.ProcessGroup, draft_runner_rank: int, tokenizer: AutoTokenizer, @@ -33,60 +36,90 @@ def __init__( self.draft_dtype = draft_dtype self.kvcache_block_size = kvcache_block_size self.max_model_len = max_model_len + self.eagle = eagle + self.eagle_act_dim = eagle_act_dim + self.communicate_logits = communicate_logits + self.communicate_cache_hits = communicate_cache_hits self.async_pg = async_pg self.draft_runner_rank = draft_runner_rank + self.target_rank = 0 self.tokenizer = tokenizer self.verbose = verbose self.K = lookahead # Pre-allocate handshake send/recv buffers (reused every step) - self._alloc_handshake_bufs(1) - - # Pre-allocate speculate() output buffers (avoid torch.tensor(device=cuda) sync) - self._recovery_buf = torch.empty(1, dtype=torch.int64, device=device) - self._speculations_buf = torch.empty(1, lookahead + 1, dtype=torch.int64, device=device) - - def _alloc_handshake_bufs(self, B): - self._hs_B = B - d = self.device - self._cmd = torch.zeros(1, dtype=torch.int64, device=d) - self._meta = torch.tensor([B, self.K, self.async_fan_out], dtype=torch.int64, device=d) - self._cache_keys = torch.empty(B, 3, dtype=torch.int64, device=d) - self._num_tokens_buf = torch.empty(B, dtype=torch.int64, device=d) - self._temps_buf = torch.empty(B, dtype=torch.float32, device=d) - self._block_tables_buf = torch.full((B, self.max_blocks), -1, dtype=torch.int32, device=d) - self._fused_response = torch.empty(B + B * self.K, dtype=torch.int64, device=d) - self._logits_q = torch.empty(B, self.K, self.vocab_size, dtype=self.draft_dtype, device=d) - self._extend_counts = torch.zeros(B, dtype=torch.int64, device=d) + B = 1 + self._speculation_request = SpeculationRequest.prepare( + batch_size=B, + lookahead=lookahead, + max_blocks=max_blocks, + vocab_size=vocab_size, + draft_dtype=draft_dtype, + device=device, + eagle=eagle, + eagle_act_dim=eagle_act_dim, + ) + self._speculation_response = SpeculationResponse.prepare( + batch_size=B, + lookahead=lookahead, + device=device, + draft_dtype=draft_dtype, + communicate_logits=communicate_logits, + communicate_cache_hits=communicate_cache_hits, + vocab_size=vocab_size, + ) + self._recovery_buf = torch.empty(B, dtype=torch.int64, device=self.device) + self._speculations_buf = torch.empty(B, self.K + 1, dtype=torch.int64, device=self.device) - def prefill(self, seqs: list[Sequence], verify_result: VerifyResult) -> SpeculateResult: + def _prepare_prefill_request(self, seqs: list[Sequence], verify_result: VerifyResult) -> PrefillRequest: eagle_acts = verify_result.eagle_acts input_id_list = [seq.token_ids for seq in seqs] - # EAGLE token-conditioning shift: token at position j gets conditioning - # from target act at position j-1. Skip first token per seq and drop - # last eagle_act per seq so they align correctly. + # EAGLE/Phoenix token-conditioning shift: we duplicate the first target activation for each sequence. + # [t0, h0], [t1, h0], [t2, h1], [t3, h2], ... if eagle_acts is not None: sliced = [] offset = 0 for ids in input_id_list: seq_len = len(ids) + sliced.append(eagle_acts[offset:offset + 1]) sliced.append(eagle_acts[offset:offset + seq_len - 1]) offset += seq_len eagle_acts = torch.cat(sliced, dim=0) - input_id_list = [ids[1:] for ids in input_id_list] max_blocks = (self.max_model_len + self.kvcache_block_size - 1) // self.kvcache_block_size - cmd, metadata, input_ids, num_tokens, draft_block_table, eagle_acts = prepare_prefill_payload( - input_id_list, eagle_acts, self.device, max_blocks, - [seq.draft_block_table for seq in seqs], + input_ids_flat = [] + num_tokens = [] + for input_ids in input_id_list: + input_ids_flat.extend(input_ids) + num_tokens.append(len(input_ids)) + + draft_block_tables = [seq.draft_block_table for seq in seqs] + input_ids_flat = torch.tensor(input_ids_flat, dtype=torch.int64, device=self.device) + num_tokens = torch.tensor(num_tokens, dtype=torch.int64, device=self.device) + if isinstance(draft_block_tables, list): + draft_block_table = torch.tensor( + [dbt + [-1] * (max_blocks - len(dbt)) for dbt in draft_block_tables], + dtype=torch.int32, device=self.device, + ) + else: + assert draft_block_tables.shape == (len(input_id_list), max_blocks), ( + f"draft_block_tables shape mismatch: expected ({len(input_id_list), max_blocks}), got {draft_block_tables.shape}" + ) + draft_block_table = draft_block_tables + + return PrefillRequest.prepare( + input_ids_flat, + num_tokens, + draft_block_table, + eagle_acts, + max_blocks, + self.device, ) - dist.send(cmd, dst=self.draft_runner_rank, group=self.async_pg) - dist.send(metadata, dst=self.draft_runner_rank, group=self.async_pg) - send_int64(self.async_pg, self.draft_runner_rank, - input_ids, num_tokens, draft_block_table.to(torch.int64)) - if eagle_acts is not None: - dist.send(eagle_acts, dst=self.draft_runner_rank, group=self.async_pg) + + def prefill(self, seqs: list[Sequence], verify_result: VerifyResult) -> SpeculateResult: + prefill_request = self._prepare_prefill_request(seqs, verify_result) + prefill_request.send(self.async_pg, self.draft_runner_rank) return SpeculateResult([], []) def speculate(self, seqs: list[Sequence], verify_result: VerifyResult) -> SpeculateResult: @@ -106,9 +139,24 @@ def speculate(self, seqs: list[Sequence], verify_result: VerifyResult) -> Specul print(f"{sep}\n", flush=True) eagle = verify_result.eagle_acts is not None - speculations_tokens, logits_q, cache_hits = self._speculation_request(seqs, eagle) + assert self.eagle == eagle, "Eagle status mismatch" + speculation_response = self._make_speculation_request(seqs, eagle) + speculation_tokens = speculation_response.speculations + logits_q = speculation_response.logits_q + cache_hits = speculation_response.cache_hits # Build speculations using pre-allocated buffers (avoids torch.tensor(device=cuda) sync) + speculations = self._prepend_recovery_tokens(seqs, speculation_tokens) + + for i, seq in enumerate(seqs): + seq.token_ids.extend(speculation_tokens[i].tolist()) + seq.num_tokens = len(seq.token_ids) + seq.last_token = seq.token_ids[-1] + seq.num_draft_cached_tokens += len(speculation_tokens[i]) + 1 + + return SpeculateResult(speculations, logits_q, cache_hits) + + def _prepend_recovery_tokens(self, seqs: list[Sequence], speculation_tokens: torch.Tensor) -> torch.Tensor: B = len(seqs) if B != self._recovery_buf.shape[0]: self._recovery_buf = torch.empty(B, dtype=torch.int64, device=self.device) @@ -116,72 +164,42 @@ def speculate(self, seqs: list[Sequence], verify_result: VerifyResult) -> Specul _rec_cpu = torch.tensor([seq.recovery_token_id for seq in seqs], dtype=torch.int64) self._recovery_buf.copy_(_rec_cpu, non_blocking=True) self._speculations_buf[:, 0] = self._recovery_buf - self._speculations_buf[:, 1:] = speculations_tokens - speculations = self._speculations_buf + self._speculations_buf[:, 1:] = speculation_tokens + return self._speculations_buf - for i, seq in enumerate(seqs): - seq.token_ids.extend(speculations_tokens[i].tolist()) - seq.num_tokens = len(seq.token_ids) - seq.last_token = seq.token_ids[-1] - seq.num_draft_cached_tokens += len(speculations_tokens[i]) + 1 - - return SpeculateResult(speculations, logits_q, cache_hits) - - def _speculation_request(self, seqs: list[Sequence], eagle: bool): + def _prepare_speculation_request(self, seqs: list[Sequence], eagle: bool) -> SpeculationRequest: B = len(seqs) - if B != self._hs_B: - self._alloc_handshake_bufs(B) + self._speculation_request.maybe_update_buffers(B) # Fill send buffers in-place (avoids torch.tensor from Python lists) for i, seq in enumerate(seqs): - self._cache_keys[i, 0] = seq.seq_id - self._cache_keys[i, 1] = seq.last_spec_step_accepted_len - 1 - self._cache_keys[i, 2] = seq.recovery_token_id - self._num_tokens_buf[i] = seq.num_tokens - self._temps_buf[i] = seq.draft_temperature if seq.draft_temperature is not None else seq.temperature + self._speculation_request.cache_keys[i, 0] = seq.seq_id + self._speculation_request.cache_keys[i, 1] = seq.last_spec_step_accepted_len - 1 + self._speculation_request.cache_keys[i, 2] = seq.recovery_token_id + self._speculation_request.num_tokens[i] = seq.num_tokens + self._speculation_request.temps[i] = seq.draft_temperature if seq.draft_temperature is not None else seq.temperature bt = seq.draft_block_table bt_len = len(bt) if bt_len > 0: - self._block_tables_buf[i, :bt_len] = torch.tensor(bt, dtype=torch.int32, device=self.device) - self._block_tables_buf[i, bt_len:] = -1 - - # Send cmd + meta + fused payload (temps fused into int64 burst) - dist.send(self._cmd, dst=self.draft_runner_rank, group=self.async_pg) - dist.send(self._meta, dst=self.draft_runner_rank, group=self.async_pg) - temps_as_int64 = self._temps_buf.view(torch.int32).to(torch.int64) - send_int64( - self.async_pg, self.draft_runner_rank, - self._cache_keys, self._num_tokens_buf, - self._block_tables_buf.to(torch.int64), temps_as_int64, - ) + self._speculation_request.block_tables[i, :bt_len] = torch.tensor(bt, dtype=torch.int32, device=self.device) + self._speculation_request.block_tables[i, bt_len:] = -1 if eagle: - recovery_activations = torch.stack( - [seq.last_target_hidden_state for seq in seqs], dim=0, - ).to(self.device) - dist.send(recovery_activations.to(self.draft_dtype), - dst=self.draft_runner_rank, group=self.async_pg) - - # Send extend data for glue decode with fused extend - K = self.K - act_dim = recovery_activations.shape[-1] - for i, seq in enumerate(seqs): - self._extend_counts[i] = seq.extend_count - extend_eagle_acts = torch.zeros(B, K, act_dim, dtype=self.draft_dtype, device=self.device) - extend_token_ids = torch.zeros(B, K, dtype=torch.int64, device=self.device) - for i, seq in enumerate(seqs): + self._prepare_eagle_payload(seqs) + + return self._speculation_request + + def _prepare_eagle_payload(self, seqs: list[Sequence]): + for i, seq in enumerate(seqs): + self._speculation_request.recovery_activations[i, :] = seq.last_target_hidden_state + self._speculation_request.extend_counts[i] = seq.extend_count + if seq.extend_count > 0 and seq.extend_eagle_acts is not None: n = seq.extend_count - if n > 0 and seq.extend_eagle_acts is not None: - extend_eagle_acts[i, :n] = seq.extend_eagle_acts[:n].to(self.draft_dtype) - extend_token_ids[i, :n] = seq.extend_token_ids[:n] - dist.send(self._extend_counts, dst=self.draft_runner_rank, group=self.async_pg) - dist.send(extend_eagle_acts, dst=self.draft_runner_rank, group=self.async_pg) - dist.send(extend_token_ids, dst=self.draft_runner_rank, group=self.async_pg) - - # Recv into pre-allocated buffers - dist.recv(self._fused_response, src=self.draft_runner_rank, group=self.async_pg) - cache_hits = self._fused_response[:B] - speculations = self._fused_response[B:].view(B, self.K) - dist.recv(self._logits_q, src=self.draft_runner_rank, group=self.async_pg) - - return speculations, self._logits_q, cache_hits + self._speculation_request.extend_activations[i, :n] = seq.extend_eagle_acts[:n].to(self.draft_dtype) + self._speculation_request.extend_token_ids[i, :n] = seq.extend_token_ids[:n] + + def _make_speculation_request(self, seqs: list[Sequence], eagle: bool): + speculation_request = self._prepare_speculation_request(seqs, eagle) + speculation_request.send(self.async_pg, self.draft_runner_rank) + self._speculation_response.receive(self.async_pg, self.draft_runner_rank, batch_size=len(seqs)) + return self._speculation_response diff --git a/ssd/engine/step.py b/ssd/engine/step.py index f60939c31..68c461089 100644 --- a/ssd/engine/step.py +++ b/ssd/engine/step.py @@ -18,39 +18,40 @@ def __init__(self, scheduler: Scheduler): self.scheduler = scheduler @abstractmethod - def decode(self, seqs: list[Sequence]) -> int: + def prefill(self, seqs: list[Sequence], step_num: int = 0) -> int: pass @abstractmethod - def prefill(self, seqs: list[Sequence]) -> int: + def decode(self, seqs: list[Sequence], step_num: int = 0) -> int: pass class AutoRegressiveStep(InferenceStep): - def __init__(self, scheduler: Scheduler, model_runner: ModelRunner, tokenizer: AutoTokenizer): + def __init__(self, scheduler: Scheduler, model_runner: ModelRunner, tokenizer: AutoTokenizer, verbose: bool = False): super().__init__(scheduler) self.model_runner = model_runner self.tokenizer = tokenizer + self.verbose = verbose - def step(self, seqs: list[Sequence], is_prefill: bool) -> int: - if __debug__: + def step(self, seqs: list[Sequence], is_prefill: bool, step_num: int = 0) -> int: + if self.verbose: print(f'[auto_regressive_step] is_prefill={is_prefill}', flush=True) token_ids = self.model_runner.call("run", seqs, is_prefill) - if __debug__: + if self.verbose: decoded_tokens = decode_tokens(token_ids, self.tokenizer) print(f"[auto_regressive_step] generated tokens: {decoded_tokens}", flush=True) self.scheduler.postprocess(seqs, token_ids, is_prefill) return len(seqs) if not is_prefill else sum(len(seq) for seq in seqs) - def prefill(self, seqs: list[Sequence]) -> int: - return self.step(seqs, is_prefill=True) + def prefill(self, seqs: list[Sequence], step_num: int = 0) -> int: + return self.step(seqs, is_prefill=True, step_num=step_num) - def decode(self, seqs: list[Sequence]) -> int: - return self.step(seqs, is_prefill=False) + def decode(self, seqs: list[Sequence], step_num: int = 0) -> int: + return self.step(seqs, is_prefill=False, step_num=step_num) class SpecDecodeStep(InferenceStep): @@ -63,6 +64,7 @@ def __init__( eagle: bool, tokenizer: AutoTokenizer, async_spec: bool, + verbose: bool = False, ): super().__init__(scheduler) self.speculator = speculator @@ -70,16 +72,26 @@ def __init__( self.eagle = eagle self.tokenizer = tokenizer self.async_spec = async_spec + self.verbose = verbose - def prefill(self, seqs: list[Sequence]) -> int: + def prefill(self, seqs: list[Sequence], step_num: int = 0) -> int: # When doing async speculation and not Eagle, we can do draft and target prefills in parallel. - if not self.eagle and self.async_spec: - empty_verify_result = VerifyResult([], [], None) - self.speculator.prefill(seqs, empty_verify_result) - verify_result = self.verifier.prefill(seqs, eagle=False) - else: - verify_result = self.verifier.prefill(seqs, eagle=self.eagle) - self.speculator.prefill(seqs, verify_result) + # TEMPORARY: Disable prefill optimization of running draft and target prefills in parallel. + # if not self.eagle and self.async_spec: + # empty_verify_result = VerifyResult([], [], None) + # self.speculator.prefill(seqs, empty_verify_result) + # verify_result = self.verifier.prefill(seqs, eagle=False) + # else: + if self.verbose: + print(f"[SpecDecodeStep] Verifier prefill {step_num}", flush=True) + verify_result = self.verifier.prefill(seqs, eagle=self.eagle) + + if self.verbose: + print(f"[SpecDecodeStep] Speculator prefill {step_num}", flush=True) + self.speculator.prefill(seqs, verify_result) + + if self.verbose: + print(f"[SpecDecodeStep] Prefill {step_num} complete", flush=True) for seq in seqs: assert seq.recovery_token_id is not None @@ -88,11 +100,15 @@ def prefill(self, seqs: list[Sequence]) -> int: return sum(len(seq) for seq in seqs) - def decode(self, seqs: list[Sequence]) -> int: + def decode(self, seqs: list[Sequence], step_num: int = 0) -> int: _prof = os.environ.get("SSD_PROFILE", "0") == "1" + _prof_ev = os.environ.get("SSD_PROFILE_EVENTS", "0") == "1" if _prof: torch.cuda.synchronize() _t0 = perf_counter() + if _prof_ev: + _ev = [torch.cuda.Event(enable_timing=True) for _ in range(4)] + _ev[0].record() # Save lightweight state instead of expensive clone_spec deep copy. # speculate() modifies: token_ids (append+extend), num_tokens, last_token, num_draft_cached_tokens @@ -112,15 +128,17 @@ def decode(self, seqs: list[Sequence]) -> int: if _prof: torch.cuda.synchronize() _t1 = perf_counter() + if _prof_ev: + _ev[1].record() - if __debug__: + if self.verbose: speculations = speculate_result.speculations - print(f"[SpecDecodeStep] speculations: {speculations}", flush=True) + print(f"[SpecDecodeStep] speculations {step_num}: {speculations}", flush=True) speculations_list = speculations.tolist() for i, speculation in enumerate(speculations_list): decoded_tokens = decode_tokens(speculation, self.tokenizer) - print(f"[SpecDecodeStep] speculation {i}: {decoded_tokens}", flush=True) + print(f"[SpecDecodeStep] speculation {step_num},{i}: {decoded_tokens}", flush=True) #### STEP 2: VERIFY #### out_verify_result = self.verifier.verify(seqs, speculate_result, eagle=self.eagle) @@ -128,13 +146,15 @@ def decode(self, seqs: list[Sequence]) -> int: if _prof: torch.cuda.synchronize() _t2 = perf_counter() + if _prof_ev: + _ev[2].record() - if __debug__: + if self.verbose: recovery_tokens = out_verify_result.recovery_tokens new_suffixes = out_verify_result.new_suffixes for i, new_suffix in enumerate(new_suffixes): decoded_tokens = decode_tokens(new_suffix + [recovery_tokens[i]], self.tokenizer) - print(f"[SpecDecodeStep] verification {i}: {decoded_tokens}", flush=True) + print(f"[SpecDecodeStep] verification {step_num},{i}: {decoded_tokens}", flush=True) # Restore original seq state before postprocess (undo speculate + verify modifications) for seq, (orig_len, orig_nt, orig_lt, orig_ndc, orig_nct) in zip(seqs, saved): @@ -159,5 +179,12 @@ def decode(self, seqs: list[Sequence]) -> int: hits_str = f"hits={cache_hits.sum().item()}/{len(cache_hits)}" if cache_hits is not None else "" toks = sum(len(s) for s in out_verify_result.new_suffixes) print(f"[PROFILE target] handshake={(_t1-_t0)*1000:.2f}ms verify={(_t2-_t1)*1000:.2f}ms postprocess={(_t3-_t2)*1000:.2f}ms total={(_t3-_t0)*1000:.2f}ms {hits_str} toks={toks}", flush=True) + if _prof_ev: + _ev[3].record() + _ev[3].synchronize() + cache_hits = speculate_result.cache_hits + hits_str = f"hits={cache_hits.sum().item()}/{len(cache_hits)}" if cache_hits is not None else "" + toks = sum(len(s) for s in out_verify_result.new_suffixes) + print(f"[PROFILE_EVENTS target] handshake={_ev[0].elapsed_time(_ev[1]):.2f}ms verify={_ev[1].elapsed_time(_ev[2]):.2f}ms postprocess={_ev[2].elapsed_time(_ev[3]):.2f}ms total={_ev[0].elapsed_time(_ev[3]):.2f}ms {hits_str} toks={toks}", flush=True) return sum(len(s) for s in out_verify_result.new_suffixes) diff --git a/ssd/engine/verifier.py b/ssd/engine/verifier.py index c5412b6a9..d423e7710 100644 --- a/ssd/engine/verifier.py +++ b/ssd/engine/verifier.py @@ -20,6 +20,7 @@ def __init__( jit_speculate: bool = False, tokenizer: AutoTokenizer = None, metrics: dict = None, + verbose: bool = False, ): super().__init__(lookahead, device) self.target_model_runner = target_model_runner @@ -28,6 +29,7 @@ def __init__( self.jit_speculate = jit_speculate self.tokenizer = tokenizer self.metrics = metrics + self.verbose = verbose def prefill(self, seqs: list[Sequence], eagle: bool = False) -> VerifyResult: result = self.target_model_runner.call("run", seqs, True) @@ -114,7 +116,7 @@ def verify(self, seqs: list[Sequence], speculate_result: SpeculateResult, eagle: # # Debug: print recovery tokens detokenized - if __debug__ and recovery_tokens is not None and len(recovery_tokens) > 0: + if self.verbose and recovery_tokens is not None and len(recovery_tokens) > 0: recovery_texts = [] for token in recovery_tokens: try: @@ -127,6 +129,15 @@ def verify(self, seqs: list[Sequence], speculate_result: SpeculateResult, eagle: self.metrics["accepted_suffix_lens_with_recovery"].extend( [len(s) for s in new_suffixes]) + # Full per-step accept trace for correctness tests (tier 1). + # Each entry is a list of (seq_id, accepted_suffix, new_recovery_token) + # covering every sequence in that verify step's batch. + if "per_step_accepts" in self.metrics: + self.metrics["per_step_accepts"].append([ + (seq.seq_id, list(suffix), int(rec)) + for seq, suffix, rec in zip(seqs, new_suffixes, recovery_tokens) + ]) + # For async mode, also track accepted suffix lengths only for cache hits if speculate_result.cache_hits is not None: _ch_cpu = speculate_result.cache_hits.cpu() @@ -138,7 +149,7 @@ def verify(self, seqs: list[Sequence], speculate_result: SpeculateResult, eagle: self.metrics["accepted_suffix_lens_on_miss"].append(suffix_len) # Print mean length of new suffixes for monitoring - if __debug__ and new_suffixes: + if self.verbose and new_suffixes: mean_suffix_len = sum([len(suffix) for suffix in new_suffixes]) / len(new_suffixes) print(f"[verify] mean new suffix length: {mean_suffix_len:.2f}", flush=True) diff --git a/ssd/layers/attention.py b/ssd/layers/attention.py index ed5ec7b3a..6b1f61c7c 100644 --- a/ssd/layers/attention.py +++ b/ssd/layers/attention.py @@ -3,7 +3,8 @@ import triton import triton.language as tl -from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache +from flash_attn.cute.interface import flash_attn_varlen_func as fa4_varlen_func +from ssd.layers.tree_mask import create_tree_score_mod from ssd.utils.context import get_context @@ -65,10 +66,10 @@ def __init__( self.speculate = speculate self.draft_async = draft_async self.use_eagle = use_eagle - self.prefill_wrappers = {} self.F = F # async_fan_out self.K = K # speculate_k - self.only_prefill_wrapper = None + self.max_seqlen_k = 0 # set during KV cache allocation to config.max_model_len + self.tree_score_mod = None # set during KV cache allocation def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): o: torch.Tensor @@ -87,7 +88,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): k, v = k_cache, v_cache k, v = k.view(-1, self.num_kv_heads, self.head_dim), v.view(-1, self.num_kv_heads, self.head_dim) - o = flash_attn_varlen_func(q, k, v, + o, _ = fa4_varlen_func(q, k, v, max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q, max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k, softmax_scale=self.scale, causal=True) @@ -104,29 +105,45 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): if verify_or_glue: assert context.context_lens is not None - o = flash_attn_with_kvcache(q, k_cache, v_cache, - cache_seqlens=context.context_lens, page_table=context.block_tables, + o, _ = fa4_varlen_func(q, k_cache, v_cache, + cu_seqlens_q=context.cu_seqlens_q, + cu_seqlens_k=None, + max_seqlen_q=context.max_seqlen_q, + max_seqlen_k=self.max_seqlen_k, + seqused_k=context.context_lens, + page_table=context.block_tables, softmax_scale=self.scale, causal=True, - cu_seqlens_q=context.cu_seqlens_q, max_seqlen_q=context.max_seqlen_q, ) elif tree_decode: - if self.only_prefill_wrapper is not None: - prefill_wrapper = self.only_prefill_wrapper - else: - mq_len = self.F * (self.K+1) - bs = q.shape[0] // mq_len - wrapper_bs = None - for available_bs in sorted(self.prefill_wrappers.keys()): - if available_bs >= bs: - wrapper_bs = available_bs - break - prefill_wrapper = self.prefill_wrappers[wrapper_bs] - o = prefill_wrapper.run(q, (self.k_cache, self.v_cache)) + score_mod_kwargs = {} + if self.tree_score_mod is not None and context.tree_mask_bias is not None: + score_mod_kwargs["score_mod"] = self.tree_score_mod + score_mod_kwargs["aux_tensors"] = [context.tree_mask_bias] + o, _ = fa4_varlen_func( + q, + self.k_cache, + self.v_cache, + cu_seqlens_q=context.tree_cu_seqlens_q, + cu_seqlens_k=None, + max_seqlen_q=self.F * (self.K + 1), + max_seqlen_k=self.max_seqlen_k, + seqused_k=context.context_lens, + page_table=context.block_tables, + softmax_scale=self.scale, + causal=False, + **score_mod_kwargs, + ) else: # single query decode - q = q.unsqueeze(1) - o = flash_attn_with_kvcache(q, k_cache, v_cache, - cache_seqlens=context.context_lens, page_table=context.block_tables, + batch_size = context.context_lens.shape[0] + cu_seqlens_q = torch.arange(0, batch_size + 1, dtype=torch.int32, device=q.device) + o, _ = fa4_varlen_func(q, k_cache, v_cache, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=None, + max_seqlen_q=1, + max_seqlen_k=self.max_seqlen_k, + seqused_k=context.context_lens, + page_table=context.block_tables, softmax_scale=self.scale, causal=True, ) diff --git a/ssd/layers/embed_head.py b/ssd/layers/embed_head.py index c50174d2e..51f841579 100644 --- a/ssd/layers/embed_head.py +++ b/ssd/layers/embed_head.py @@ -43,7 +43,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): shard_size = param_data.size(0) start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(0, start_idx, shard_size) - assert param_data.size() == loaded_weight.size() + assert param_data.size() == loaded_weight.size(), f"param_data.size()={param_data.size()}, loaded_weight.size()={loaded_weight.size()}" param_data.copy_(loaded_weight) def forward(self, x: torch.Tensor): diff --git a/ssd/layers/linear.py b/ssd/layers/linear.py index b25824172..d605caaa5 100755 --- a/ssd/layers/linear.py +++ b/ssd/layers/linear.py @@ -89,6 +89,9 @@ def __init__( def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): param_data = param.data + if param_data.dim() == 1: # bias — no sharding needed + param_data.copy_(loaded_weight) + return shard_size = param_data.size(self.tp_dim) start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size) @@ -115,6 +118,9 @@ def __init__( def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: int): param_data = param.data + if param_data.dim() == 1: # bias — no sharding needed + param_data.copy_(loaded_weight) + return shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size shard_size = self.output_sizes[loaded_shard_id] // self.tp_size param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size) @@ -147,6 +153,9 @@ def __init__( def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: str): param_data = param.data + if param_data.dim() == 1: # bias — no sharding needed + param_data.copy_(loaded_weight) + return assert loaded_shard_id in ["q", "k", "v"] if loaded_shard_id == "q": shard_size = self.num_heads * self.head_size @@ -187,6 +196,9 @@ def __init__( def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): param_data = param.data + if param_data.dim() == 1: # bias — no sharding needed + param_data.copy_(loaded_weight) + return shard_size = param_data.size(self.tp_dim) start_idx = self.tp_rank * shard_size loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size) diff --git a/ssd/layers/tree_mask.py b/ssd/layers/tree_mask.py new file mode 100644 index 000000000..d44a7ec14 --- /dev/null +++ b/ssd/layers/tree_mask.py @@ -0,0 +1,100 @@ +"""Tree decode mask for FA4 via score_mod + aux_tensors. + +The tree mask is stored as a dense float32 bias tensor of shape +(max_total_q, max_kv_stride), flattened to 1D. Unmasked positions have +value 0.0; masked positions have a large negative value (-1e6). + +score_mod adds the bias to each attention score, effectively masking out +positions where the bias is -1e6. +""" + +import torch +import numpy as np +import cutlass +import cutlass.cute as cute + +# Large negative value used to mask attention scores. +_MASK_VAL = -1.0e6 + + +def create_tree_score_mod(max_kv_stride: int): + """Return a @cute.jit score_mod that reads a mask bias from aux_tensors[0]. + + The aux_tensor is a 1D float32 tensor indexed by: + (offset_q + q_idx) * max_kv_stride + kv_idx + + where offset_q comes from seqlen_info for varlen sequences. + """ + + @cute.jit + def tree_score_mod(tSrS_ssa, b_idx, h_idx, q_idx, kv_idx, seqlen_info, aux_tensors): + mask_bias = aux_tensors[0] + dtype = mask_bias.element_type + global_q = seqlen_info.offset_q + q_idx + flat_idx = global_q * max_kv_stride + kv_idx + idx_frag = cute.make_rmem_tensor(1, cutlass.Int32) + idx_frag.store(flat_idx) + val_frag = cute.make_rmem_tensor(1, dtype) + val_frag[0] = mask_bias[idx_frag[0]] + bias = (val_frag.load()).to(cutlass.Float32) + return tSrS_ssa + bias + + return tree_score_mod + + +def build_tree_mask_bias( + context_lens: torch.Tensor, + step: int, + K: int, + MQ_LEN: int, + fan_out_list: list[int], + fan_out_list_miss: list[int], + cache_hits: torch.Tensor, + max_kv_stride: int, + device: torch.device, +) -> torch.Tensor: + """Build the dense mask bias tensor for one tree decode step. + + Returns a 1D float32 tensor of shape (B * MQ_LEN * max_kv_stride,) + with 0.0 for attend and _MASK_VAL for masked positions. + """ + B = context_lens.shape[0] + context_lens_list = context_lens.tolist() + cache_hits_list = cache_hits[:B].tolist() + + # Pre-compute glue patterns + tril = np.tril(np.ones((K + 1, K + 1), dtype=np.float32)) + fol = np.array(fan_out_list) + fol_miss = np.array(fan_out_list_miss) + glue_hit = np.repeat(tril, fol, axis=0) # (MQ_LEN, K+1) + glue_miss = np.repeat(tril, fol_miss, axis=0) + + ttl_added = (step + 1) * MQ_LEN + (K + 1) + rows = np.arange(MQ_LEN) + + # Build mask as numpy, then convert + bias = np.full((B * MQ_LEN, max_kv_stride), _MASK_VAL, dtype=np.float32) + + for b in range(B): + cols_b = int(context_lens_list[b]) + prefix_len_b = cols_b - ttl_added + row_offset = b * MQ_LEN + + # Prefix: attend to all + if prefix_len_b > 0: + bias[row_offset:row_offset + MQ_LEN, :prefix_len_b] = 0.0 + + # Glue pattern + glue = glue_hit if int(cache_hits_list[b]) == 1 else glue_miss + glue_start = prefix_len_b + glue_bias = np.where(glue > 0, 0.0, _MASK_VAL).astype(np.float32) + bias[row_offset:row_offset + MQ_LEN, glue_start:glue_start + K + 1] = glue_bias + + # Diagonal blocks + diag_start = prefix_len_b + K + 1 + for blk in range(step + 1): + col_indices = diag_start + blk * MQ_LEN + rows + valid = col_indices < max_kv_stride + bias[row_offset + rows[valid], col_indices[valid]] = 0.0 + + return torch.from_numpy(bias.reshape(-1)).to(device, non_blocking=True) diff --git a/ssd/models/eagle3_draft_llama3.py b/ssd/models/eagle3_draft_llama3.py index 4f5ec7da0..71c19a1b9 100644 --- a/ssd/models/eagle3_draft_llama3.py +++ b/ssd/models/eagle3_draft_llama3.py @@ -219,6 +219,7 @@ def __init__( draft: bool = False, speculate: bool = False, use_eagle: bool = False, + use_phoenix: bool = False, eagle_layers: list[int] | None = None, d_model_target: int = 4096, spec_k: int = 1, @@ -233,6 +234,7 @@ def __init__( assert draft, "ERROR in Eagle3DraftForLlama3: draft must be True" assert use_eagle, "ERROR in Eagle3DraftForLlama3: config.use_eagle must be True" assert eagle_layers is not None, "ERROR in Eagle3DraftForLlama3: eagle_layers must be set" + assert not use_phoenix, "ERROR in Eagle3DraftForLlama3: config.use_phoenix must be False" # this will be the draft that does tree decode, just needs a modified fwd pass that takes in hidden states and uses fc and dicts to sample, etc self.config = config @@ -242,7 +244,7 @@ def __init__( self.tp_group = tp_group self.tp_size = tp_size self.use_eagle = use_eagle - self.eagle_layers = eagle_layers if eagle_layers is not None else [] + self.eagle_layers = eagle_layers self.d_model_target = d_model_target self.d2t = {} # loaded by loader.py, converted to tensor after load_model self.t2d = {} # loaded by loader.py, converted to tensor after load_model diff --git a/ssd/models/llama3.py b/ssd/models/llama3.py index a9934ad5d..091df664e 100755 --- a/ssd/models/llama3.py +++ b/ssd/models/llama3.py @@ -210,6 +210,7 @@ def __init__( async_fan_out: int = 1, draft_async: bool = False, use_eagle: bool = False, + use_phoenix: bool = False, eagle_layers: list[int] | None = None, tp_group: dist.ProcessGroup | None = None, tp_size: int = 1, @@ -221,8 +222,9 @@ def __init__( self.async_fan_out = async_fan_out self.draft_async = draft_async self.use_eagle = use_eagle + self.use_phoenix = use_phoenix self.eagle_layers = eagle_layers - print(f'[LlamaModel] use_eagle={use_eagle}, eagle_layers={eagle_layers}', flush=True) + print(f'[LlamaModel] use_eagle={use_eagle}, use_phoenix={use_phoenix}, eagle_layers={eagle_layers}', flush=True) self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, @@ -249,24 +251,33 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, + hidden_states: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - hidden_states = self.embed_tokens(input_ids) # torch.Size([4096, 2560]) always through residual stream + if hidden_states is None: + hidden_states = self.embed_tokens(input_ids) residual = None # Collect activations if use_eagle - collected_acts = [] if self.use_eagle else None + collected_acts = [] if not self.draft and (self.use_eagle or self.use_phoenix) else None for layer_idx, layer in enumerate(self.layers): - if collected_acts is not None and layer_idx in self.eagle_layers: + if collected_acts is not None and self.eagle_layers is not None and layer_idx in self.eagle_layers: current_act = hidden_states if residual is None else hidden_states + residual collected_acts.append(current_act) hidden_states, residual = layer(positions, hidden_states, residual) - hidden_states, _ = self.norm(hidden_states, residual) - - if collected_acts: - eagle_acts = torch.cat(collected_acts, dim=-1) + + if not self.draft and self.use_phoenix: + assert self.eagle_layers is None, "ERROR in LlamaModel: use_phoenix and eagle_layers are not compatible" + collected_acts.append(hidden_states) + + if collected_acts is not None: + if len(collected_acts) > 1: + eagle_acts = torch.cat(collected_acts, dim=-1) + else: + assert len(collected_acts) == 1 + eagle_acts = collected_acts[0] print(f'[LlamaModel] eagle_acts shape={eagle_acts.shape}', flush=True) return hidden_states, eagle_acts else: @@ -284,9 +295,11 @@ class LlamaForCausalLM(nn.Module): def __init__( self, - config: LlamaConfig, draft: bool = False, + config: LlamaConfig, + draft: bool = False, speculate: bool = False, use_eagle: bool = False, + use_phoenix: bool = False, eagle_layers: list[int] | None = None, spec_k: int = 1, async_fan_out: int = 1, @@ -301,6 +314,7 @@ def __init__( self.async_fan_out = async_fan_out self.draft_async = draft_async self.use_eagle = use_eagle + self.use_phoenix = use_phoenix self.eagle_layers = eagle_layers self.tp_group = tp_group self.tp_size = tp_size @@ -310,7 +324,19 @@ def __init__( print(f'Starting LlamaForCausalLM init, draft={draft}, speculate={speculate}, spec_k={spec_k}') print(f'[LlamaForCausalLM] use_eagle={use_eagle}, eagle_layers={eagle_layers}', flush=True) - self.model = LlamaModel(config, draft, speculate, spec_k, async_fan_out, draft_async, use_eagle=use_eagle, eagle_layers=eagle_layers, tp_group=tp_group, tp_size=self.tp_size) + self.model = LlamaModel( + config, + draft, + speculate, + spec_k, + async_fan_out, + draft_async, + use_eagle=use_eagle, + use_phoenix=use_phoenix, + eagle_layers=eagle_layers, + tp_group=tp_group, + tp_size=self.tp_size, + ) self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, diff --git a/ssd/models/phoenix_draft_llama3.py b/ssd/models/phoenix_draft_llama3.py new file mode 100644 index 000000000..2b25401cc --- /dev/null +++ b/ssd/models/phoenix_draft_llama3.py @@ -0,0 +1,74 @@ +import torch +import torch.distributed as dist +from transformers import LlamaConfig + +from ssd.layers.linear import RowParallelLinear +from ssd.models.llama3 import LlamaForCausalLM + + +class PhoenixLlamaForCausalLM(LlamaForCausalLM): + def __init__( + self, + config: LlamaConfig, + draft: bool = True, + speculate: bool = True, + use_eagle: bool = False, + use_phoenix: bool = True, + eagle_layers: list[int] | None = None, + d_model_target: int = 4096, + spec_k: int = 1, + async_fan_out: int = 1, + draft_async: bool = False, + tp_group: dist.ProcessGroup | None = None, + tp_size: int = 1, + debug_mode: bool = False, + ) -> None: + assert draft, "ERROR in PhoenixLlamaForCausalLM: draft must be True" + assert use_phoenix, "ERROR in PhoenixLlamaForCausalLM: config.use_phoenix must be True" + assert not use_eagle, "ERROR in PhoenixLlamaForCausalLM: config.use_eagle must be False" + super().__init__( + config, + draft=True, + speculate=True, + use_eagle=False, + use_phoenix=True, + eagle_layers=None, + spec_k=spec_k, + async_fan_out=async_fan_out, + draft_async=draft_async, + tp_group=tp_group, + tp_size=tp_size, + ) + self.d_model_target = d_model_target + self.debug_mode = debug_mode + self.eh_proj = RowParallelLinear( + self.d_model_target + config.hidden_size, + config.hidden_size, + bias=True, + tp_group=tp_group, + tp_size=tp_size, + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + input_embeds = self.model.embed_tokens(input_ids) + hidden_states = torch.cat((input_embeds, hidden_states), dim=-1) + hidden_states = self.eh_proj(hidden_states.to(self.eh_proj.weight.dtype)) + out = self.model(input_ids, positions, hidden_states) + return out + + def compute_logits( + self, + hidden_states: torch.Tensor, + last_only: bool = True, + ) -> torch.Tensor: + logits = self.lm_head(hidden_states, last_only=last_only) + + if logits.dim() == 3: + logits = logits.view(-1, logits.shape[-1]) + + return logits diff --git a/ssd/paths.py b/ssd/paths.py index 98fbb851d..c4b6a3a7e 100644 --- a/ssd/paths.py +++ b/ssd/paths.py @@ -6,19 +6,18 @@ os.environ.setdefault("TORCH_CUDA_ARCH_LIST", CUDA_ARCH) -def _required_env(var_name: str, note: str) -> str: - value = os.environ.get(var_name) - if value: - return value - raise RuntimeError(f"Missing required env var {var_name}. {note}") - - # root directory where huggingface model snapshots are stored. each model # lives under this as models--org--name/snapshots//. if you downloaded # models with `huggingface-cli download`, this is your HF_HOME/hub directory. -HF_CACHE_DIR = _required_env( +HF_CACHE_DIR = os.environ.get( "SSD_HF_CACHE", - "Set it to your HuggingFace cache hub directory (for example: /path/to/huggingface/hub).", + os.environ.get( + "HF_HUB_CACHE", + os.environ.get( + "HF_HOME", + os.path.expanduser("~/.cache/huggingface"), + ) + ) ) # default target and draft model snapshot paths. these are full paths to the @@ -50,9 +49,15 @@ def _required_env(var_name: str, note: str) -> str: # directory containing preprocessed benchmark datasets (jsonl files). # each dataset is a subdirectory with a file like humaneval_data_10000.jsonl. # you can generate these with scripts/get_data_from_hf.py. -DATASET_DIR = _required_env( +DATASET_DIR = os.environ.get( "SSD_DATASET_DIR", - "Set it to your processed dataset directory (for example: /path/to/processed_datasets).", + os.environ.get( + "HF_DATASETS_CACHE", + os.environ.get( + "HF_HOME", + os.path.expanduser("~/.cache/huggingface"), + ) + ) ) DATASET_PATHS = { "humaneval": f"{DATASET_DIR}/humaneval/humaneval_data_10000.jsonl", diff --git a/ssd/utils/async_helpers/async_spec_helpers.py b/ssd/utils/async_helpers/async_spec_helpers.py index c1793ae46..8c64b1356 100644 --- a/ssd/utils/async_helpers/async_spec_helpers.py +++ b/ssd/utils/async_helpers/async_spec_helpers.py @@ -40,16 +40,17 @@ def get_forked_recovery_tokens_from_logits(config: Config, logits: torch.Tensor, assert logits.shape[0] == B and logits.shape[1] == K+1, f"logits must have shape (B, K+1, V), got {logits.shape}" assert len(fan_out_list) == K + 1, f"fan_out_list must have length K+1={K+1}, got {len(fan_out_list)}" assert returned_tokens.shape == (B, K+1), f"returned_tokens must have shape (B, K+1), got {returned_tokens.shape}" - - # Use scatter_ to set returned tokens to -inf so we don't include those in forked tokens + + # Use scatter_ to set returned tokens to -inf so we don't include those in forked tokens # Don't touch the last sequence position, only scatter the first K positions + # Clone required: logits is an inference-mode tensor (from model forward under torch.inference_mode) logits = logits.clone() - logits[:, :-1, :] = logits[:, :-1, :].scatter( + logits[:, :-1, :].scatter_( dim=2, index=returned_tokens[:, 1:].unsqueeze(2), value=float('-inf'), ) - + # Compute top-k once at max fanout, then mask per row/position k_max = max(max(fan_out_list), max(fan_out_list_miss)) _, topk_idx = torch.topk(logits, k_max, dim=-1) # [B, K+1, k_max] diff --git a/ssd/utils/async_helpers/nccl_pack.py b/ssd/utils/async_helpers/nccl_pack.py deleted file mode 100644 index 3e592e847..000000000 --- a/ssd/utils/async_helpers/nccl_pack.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch -import torch.distributed as dist - - -def concat_int64(*tensors: torch.Tensor) -> torch.Tensor: - """Concatenate tensors into a single flat int64 payload.""" - parts = [] - for t in tensors: - if t is None: - continue - if t.dtype != torch.int64: - t = t.to(torch.int64) - parts.append(t.reshape(-1)) - if not parts: - return torch.empty(0, dtype=torch.int64) - return torch.cat(parts, dim=0) - - -def send_int64(pg, dst: int, *tensors: torch.Tensor): - """Send many int64-compatible tensors as one fused payload in a fixed order.""" - payload = concat_int64(*tensors) - if payload.numel() == 0: - return - dist.send(payload, dst=dst, group=pg) - - -def recv_int64(pg, src: int, total_length: int, device: torch.device) -> torch.Tensor: - """Receive a fused int64 payload of known total length.""" - t = torch.empty((total_length,), dtype=torch.int64, device=device) - if total_length > 0: - dist.recv(t, src=src, group=pg) - return t - - diff --git a/ssd/utils/context.py b/ssd/utils/context.py index 91c744a27..cccb3459c 100644 --- a/ssd/utils/context.py +++ b/ssd/utils/context.py @@ -13,15 +13,17 @@ class Context: slot_mapping: torch.Tensor | None = None context_lens: torch.Tensor | None = None block_tables: torch.Tensor | None = None + tree_cu_seqlens_q: torch.Tensor | None = None + tree_mask_bias: torch.Tensor | None = None _CONTEXT = Context() def get_context(): return _CONTEXT -def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None, is_jit=False): +def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None, is_jit=False, tree_cu_seqlens_q=None, tree_mask_bias=None): global _CONTEXT - _CONTEXT = Context(is_prefill, is_jit, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, context_lens, block_tables) + _CONTEXT = Context(is_prefill, is_jit, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, context_lens, block_tables, tree_cu_seqlens_q, tree_mask_bias) def reset_context(): global _CONTEXT diff --git a/ssd/utils/dist_utils.py b/ssd/utils/dist_utils.py new file mode 100644 index 000000000..859896cf5 --- /dev/null +++ b/ssd/utils/dist_utils.py @@ -0,0 +1,76 @@ +"""Custom process group helper, copied from sglang to avoid circular dependency.""" + +import torch +from packaging import version as pkg_version + +torch_release = pkg_version.parse(torch.__version__).release + + +def init_custom_process_group( + backend=None, + init_method=None, + timeout=None, + world_size=-1, + rank=-1, + store=None, + group_name=None, + pg_options=None, + device_id=None, +): + from torch.distributed.distributed_c10d import ( + Backend, + PrefixStore, + _new_process_group_helper, + _world, + default_pg_timeout, + rendezvous, + ) + + assert (store is None) or ( + init_method is None + ), "Cannot specify both init_method and store." + + if store is not None: + assert world_size > 0, "world_size must be positive if using store" + assert rank >= 0, "rank must be non-negative if using store" + elif init_method is None: + init_method = "env://" + + if backend: + backend = Backend(backend) + else: + backend = Backend("undefined") + + if timeout is None: + timeout = default_pg_timeout + + # backward compatible API + if store is None: + rendezvous_iterator = rendezvous(init_method, rank, world_size, timeout=timeout) + store, rank, world_size = next(rendezvous_iterator) + store.set_timeout(timeout) + + # Use a PrefixStore to avoid accidental overrides of keys used by + # different systems (e.g. RPC) in case the store is multi-tenant. + store = PrefixStore(group_name, store) + + # NOTE: The pg_options parameter was renamed into backend_options in PyTorch 2.6.0 + # https://github.com/pytorch/pytorch/commit/a0c7029a75628cd5fa8df83c0de0ea98ee7fd844 + pg_options_param_name = ( + "backend_options" if torch_release >= (2, 6) else "pg_options" + ) + pg, _ = _new_process_group_helper( + world_size, + rank, + [], + backend, + store, + group_name=group_name, + **{pg_options_param_name: pg_options}, + timeout=timeout, + device_id=device_id, + ) + + _world.pg_group_ranks[pg] = {i: i for i in range(world_size)} + + return pg diff --git a/ssd/utils/loader.py b/ssd/utils/loader.py index f56ec807f..7169e3198 100644 --- a/ssd/utils/loader.py +++ b/ssd/utils/loader.py @@ -186,6 +186,8 @@ def load_eagle_model(model: nn.Module, path: str, packed_modules_mapping: dict, def load_safetensors_model(model: nn.Module, path: str, packed_modules_mapping: dict): """Load model weights from safetensors files""" safetensor_files = glob(os.path.join(path, "*.safetensors")) + assert safetensor_files, f"No safetensors files found at {path}" + print(f"[load_safetensors_model] Found {len(safetensor_files)} safetensors files at {path}") for file in tqdm(safetensor_files, desc="Loading model files"): with safe_open(file, "pt", "cpu") as f: for weight_name in f.keys(): diff --git a/ssd/utils/misc.py b/ssd/utils/misc.py index 1123718dc..df4f1c649 100644 --- a/ssd/utils/misc.py +++ b/ssd/utils/misc.py @@ -1,3 +1,4 @@ +import re from transformers import AutoTokenizer @@ -22,3 +23,9 @@ def decode_tokens(token_ids: list[int], tokenizer: AutoTokenizer) -> list[str]: except Exception: decoded.append(f"") return decoded + + +def compress_neg_ones_and_zeros(long_str: str) -> str: + sub1 = re.sub(r'-1(?:, -1){2,}', '-1, ..., -1', long_str) + sub2 = re.sub(r'0(?:, 0){2,}', '0, ..., 0', sub1) + return sub2 diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 000000000..81ed3241c --- /dev/null +++ b/tests/README.md @@ -0,0 +1,68 @@ +# SSD testbed + +See `ssd_test_plan_cc.md` for the full plan, invariant list, and tier definitions. +This README is the how-to-run quick reference. + +## Running + +```bash +# Activate the SSD env. +source /work/avner/git/ssd-phnx/.venv/bin/activate + +# Fast subset (tier 0 + smoke): ~1-2 min on H100. Intended for per-commit CI. +./tests/run_fast.sh + +# Full tier 0+1: ~8-10 min on H100. +./tests/run_tier1.sh + +# Ad-hoc: +pytest tests/unit -m tier0 # CPU unit tests only +pytest tests/e2e -m tier1 # all tier 1 +pytest tests -m "tier0 or smoke" # fast subset +pytest tests/unit/test_verify.py -v # one file +``` + +## Current coverage (Tiers 0–1) + +| Tier | Invariant | Test file | +|------|-----------|-----------| +| 0 / I8 | `verify()` correctness across branches | `tests/unit/test_verify.py` | +| 0 / I9 | mask helpers: cached ≡ vectorized + structure | `tests/unit/test_mask_helpers.py` | +| 0 / I10 | BlockManager allocate / deallocate / refcount | `tests/unit/test_block_manager.py` | +| 0 / I7 | tree-cache lookup semantics | `tests/unit/test_tree_cache_semantics.py` | +| 0 / I11 | handshake pack/unpack round-trip | `tests/unit/test_handshake_roundtrip.py` | +| 1 / I1 | async+force-jit ≡ no-spec (greedy, 8B) | `tests/e2e/test_sync_vs_force_jit.py` | +| 1 / I2 | force-jit ≡ jit ≡ fast (greedy, 8B) | `tests/e2e/test_greedy_strategy_equivalence.py` | +| 1 / I3 | cudagraph ≡ eager (greedy, 8B) | `tests/e2e/test_cudagraph_vs_eager.py` | +| 1 / I4 | batch position independence | `tests/e2e/test_batch_independence.py` | +| 1 / I5 | duplicate-prompt prefix-cache correctness | `tests/e2e/test_prefix_cache.py` | +| 1 / I6 | preemption round-trip | `tests/e2e/test_preemption.py` | + +Tiers 2–5 (HF reference, SSD↔TGL fixtures, 70B TP=4, perf regression) are +scoped out of this pass; see plan for details. + +## Environment + +- SSD uses `/work/avner/git/ssd-phnx/.venv` (managed by uv). +- Tier 1 tests assume model snapshots under `/scratch/avner/huggingface/hub/` + — specifically: + - target: `models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249...` + - draft: `models--meta-llama--Llama-3.2-1B-Instruct/snapshots/921317...` + - (Tests auto-skip if a required snapshot is missing.) + +## Implementation notes + +- Tier 1 tests run each LLM config in a fresh subprocess via `tests/e2e/_runner.py`. + This is necessary because `LLMEngine.exit` calls `os._exit(0)` during teardown; + running two LLM instances inside one pytest process would kill the test runner. +- Tier 0 tests run in-process and do not allocate any CUDA memory. + +## Known issue / next steps + +- **Sync-spec (`draft_async=False`) crashes at draft-model load** on the + `cc/sglang-fa4` branch: `AttributeError: ModuleList has no attribute '20'` + — the draft model loader appears to use target-layer indices to traverse the + draft model. I1 was therefore pivoted to compare `async+force-jit` against + `no-spec` (greedy output must match), which is an equally strong correctness + property. When sync-spec is fixed, a direct sync-vs-async test can be added + to `test_sync_vs_force_jit.py`. diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..2bdcbd6c9 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,42 @@ +"""Shared pytest config for the SSD testbed. + +Markers: +- tier0: no GPU / no model weights. Always runnable. +- tier1: single GPU, real 8B weights. Requires CUDA and the 8B model snapshot. +- smoke: a tiny subset of tier1 suitable for per-commit CI. +- tier2..5: reserved for future tiers (HF ref, cross-repo, 70B, perf). + +Run examples (see tests/README.md for more): + pytest tests/unit -m tier0 + pytest tests/e2e -m tier1 + pytest tests -m "tier0 or smoke" +""" +from __future__ import annotations + +import pytest + + +def pytest_configure(config): + for marker in ("tier0", "tier1", "tier2", "tier3", "tier4", "tier5", "smoke"): + config.addinivalue_line("markers", f"{marker}: see tests/ssd_test_plan_cc.md") + + +def _cuda_count() -> int: + try: + import torch + return torch.cuda.device_count() if torch.cuda.is_available() else 0 + except Exception: + return 0 + + +def pytest_collection_modifyitems(config, items): + """Auto-skip GPU-dependent tiers when insufficient GPUs are available.""" + n = _cuda_count() + skip_no_gpu = pytest.mark.skip(reason="requires >=1 CUDA device") + skip_lt4_gpu = pytest.mark.skip(reason="requires >=4 CUDA devices") + for item in items: + if "tier1" in item.keywords or "tier2" in item.keywords or "tier3" in item.keywords: + if n < 1: + item.add_marker(skip_no_gpu) + if "tier4" in item.keywords and n < 4: + item.add_marker(skip_lt4_gpu) diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/e2e/_helpers.py b/tests/e2e/_helpers.py new file mode 100644 index 000000000..a32e309fe --- /dev/null +++ b/tests/e2e/_helpers.py @@ -0,0 +1,95 @@ +"""Helpers used by Tier 1 E2E tests. + +Runs the `_runner.py` subprocess with a given config and returns the parsed +JSON result. Each test invokes this multiple times with different configs and +asserts that the (greedy) token outputs match. +""" +from __future__ import annotations + +import json +import os +import subprocess +import sys +from pathlib import Path + + +# Canonical local model snapshots (8B target + 1B standalone draft). +LLAMA_3_1_8B_SNAPSHOT = "/scratch/avner/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659" +LLAMA_3_2_1B_SNAPSHOT = "/scratch/avner/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6" +EAGLE3_8B_SNAPSHOT = "/scratch/avner/huggingface/hub/models--yuhuili--EAGLE3-LLaMA3.1-Instruct-8B/snapshots/61aa096484ad9752292507b0cc9973bb423abb35" + + +def require_8b_target() -> str: + if not Path(LLAMA_3_1_8B_SNAPSHOT).is_dir(): + import pytest + pytest.skip(f"Llama-3.1-8B snapshot not found at {LLAMA_3_1_8B_SNAPSHOT}") + return LLAMA_3_1_8B_SNAPSHOT + + +def require_1b_draft() -> str: + if not Path(LLAMA_3_2_1B_SNAPSHOT).is_dir(): + import pytest + pytest.skip(f"Llama-3.2-1B snapshot not found at {LLAMA_3_2_1B_SNAPSHOT}") + return LLAMA_3_2_1B_SNAPSHOT + + +def run_llm_subprocess(config: dict, timeout: int = 600, trace_accepts: bool = False) -> dict: + """Run the LLM runner in a fresh subprocess with the given config dict. + + Returns the parsed runner result (see `_runner.py`). + + When `trace_accepts=True`, sets SSD_TRACE_ACCEPTS=1 so the engine records + the per-step accept trace (list of (seq_id, suffix, recovery) per verify + step), which the runner includes in the result under "per_step_accepts". + """ + runner = Path(__file__).parent / "_runner.py" + env = dict(os.environ) + # Ensure no lingering stale NCCL/shm state leaks into this child process. + env.setdefault("SSD_BRIEF_LOG", "0") + env.setdefault("SSD_NCCL_LOG", "0") + if trace_accepts: + env["SSD_TRACE_ACCEPTS"] = "1" + + proc = subprocess.run( + [sys.executable, str(runner), "--config-json", json.dumps(config)], + capture_output=True, + text=True, + env=env, + timeout=timeout, + ) + if proc.returncode != 0: + raise RuntimeError( + f"runner exited with code {proc.returncode}\n" + f"--- stdout ---\n{proc.stdout}\n" + f"--- stderr ---\n{proc.stderr}\n" + ) + # Find the RUNNER_RESULT line + for line in proc.stdout.splitlines(): + if line.startswith("RUNNER_RESULT: "): + return json.loads(line[len("RUNNER_RESULT: "):]) + raise RuntimeError( + f"runner did not emit RUNNER_RESULT\n" + f"--- stdout ---\n{proc.stdout}\n" + f"--- stderr ---\n{proc.stderr}\n" + ) + + +def base_config(prompts: list[str], *, max_new_tokens: int = 32, target: str | None = None) -> dict: + """A default base config that tests customize by adding/overriding fields.""" + return { + "model": target or require_8b_target(), + "prompts": prompts, + "temperature": 0.0, + "max_new_tokens": max_new_tokens, + "ignore_eos": True, + "max_model_len": 2048, + "max_num_seqs": 4, + "enforce_eager": False, + "num_gpus": 1, + } + + +CANONICAL_PROMPTS = [ + "The capital city of France is", + "The largest ocean on Earth is", +] diff --git a/tests/e2e/_runner.py b/tests/e2e/_runner.py new file mode 100644 index 000000000..f40b6ff35 --- /dev/null +++ b/tests/e2e/_runner.py @@ -0,0 +1,71 @@ +"""Subprocess runner used by Tier 1 tests. + +Runs a single LLM configuration and prints a JSON line `RUNNER_RESULT: {...}` +containing output token ids and metrics. This lives behind a subprocess boundary +because `LLMEngine.exit()` calls os._exit(0) on teardown, which would kill pytest. + +Invoked as: + python tests/e2e/_runner.py --config-json '{"model": ..., "speculate": true, ...}' + +The config JSON supports a superset of LLMEngine kwargs plus: +- prompts: list[str] (required) +- max_new_tokens: int (default 32) +- temperature: float (default 0.0) +- seed: int | None (default None — no explicit seed) +""" +from __future__ import annotations + +import argparse +import json +import os +import sys + + +def _load_config() -> dict: + p = argparse.ArgumentParser() + p.add_argument("--config-json", required=True) + args = p.parse_args() + return json.loads(args.config_json) + + +def main(): + cfg = _load_config() + prompts: list[str] = cfg.pop("prompts") + max_new_tokens: int = cfg.pop("max_new_tokens", 32) + temperature: float = cfg.pop("temperature", 0.0) + ignore_eos: bool = cfg.pop("ignore_eos", True) + seed = cfg.pop("seed", None) + + if seed is not None: + os.environ.setdefault("PYTHONHASHSEED", str(seed)) + import random + random.seed(seed) + import torch + torch.manual_seed(seed) + + # Import AFTER seed setup so any CUDA init happens with a stable seed. + from ssd import LLM, SamplingParams # noqa: E402 + + llm = LLM(**cfg) + sp = [SamplingParams(temperature=temperature, max_new_tokens=max_new_tokens, ignore_eos=ignore_eos)] * len(prompts) + outputs, metrics = llm.generate(prompts, sp, use_tqdm=False) + + # Keep only token ids from outputs — text decoding is the tokenizer's job, tested separately. + result = { + "token_ids": [o["token_ids"] for o in outputs], + "n_seqs": len(outputs), + # A few scalar metrics (aggregate) that are safe to compare across runs. + "prefill_total_tokens": metrics.get("prefill_total_tokens", 0), + "decode_total_tokens": metrics.get("decode_total_tokens", 0), + "num_cache_hits": int(sum(metrics.get("cache_hits", []))), + "num_verify_steps": len(metrics.get("accepted_suffix_lens_with_recovery", [])), + } + # Opt-in: include the full per-step accept trace (enabled by SSD_TRACE_ACCEPTS=1 + # — the engine populates this key only when the env var is set). + if "per_step_accepts" in metrics: + result["per_step_accepts"] = metrics["per_step_accepts"] + print("RUNNER_RESULT: " + json.dumps(result), flush=True) + + +if __name__ == "__main__": + main() diff --git a/tests/e2e/_trace_analysis.py b/tests/e2e/_trace_analysis.py new file mode 100644 index 000000000..0c241e41b --- /dev/null +++ b/tests/e2e/_trace_analysis.py @@ -0,0 +1,91 @@ +"""Ad-hoc script: quantify how far sync-spec and async+force-jit traces diverge.""" +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from tests.e2e._helpers import ( # noqa: E402 + CANONICAL_PROMPTS, base_config, require_1b_draft, require_8b_target, run_llm_subprocess, +) + + +def _per_seq(trace): + id_map: dict[int, int] = {} + out: dict[int, list] = {} + for step in trace: + for sid, suf, rec in step: + if sid not in id_map: + id_map[sid] = len(id_map) + out[id_map[sid]] = [] + out[id_map[sid]].append((list(suf), int(rec))) + return out + + +def main(): + target, draft = require_8b_target(), require_1b_draft() + prompts = CANONICAL_PROMPTS + common = dict(speculate=True, speculate_k=2, enforce_eager=True, max_new_tokens=16) + + sync_cfg = {**base_config(prompts), "model": target, "draft": draft, + "draft_async": False, "num_gpus": 1, **common} + async_cfg = {**base_config(prompts), "model": target, "draft": draft, + "draft_async": True, "force_jit_speculate": True, "jit_speculate": True, + "async_fan_out": 2, "num_gpus": 2, **common} + + sync = run_llm_subprocess(sync_cfg, trace_accepts=True) + asn = run_llm_subprocess(async_cfg, trace_accepts=True) + + a = _per_seq(sync["per_step_accepts"]) + b = _per_seq(asn["per_step_accepts"]) + + print(f"final token streams equal: {sync['token_ids'] == asn['token_ids']}") + print() + + for seq_idx in sorted(a.keys()): + ta, tb = a[seq_idx], b[seq_idx] + print(f"=== seq #{seq_idx} ===") + print(f" sync steps: {len(ta)}, async steps: {len(tb)}") + + def stats(trace): + drafts_per_step = [len(suf) - 1 for suf, _ in trace] + total_drafts = sum(drafts_per_step) + completions = total_drafts + len(trace) # each step adds drafts + 1 recovery + proposals = len(trace) * 2 # speculate_k=2 draft proposals per step + return drafts_per_step, total_drafts, completions, proposals + + sda, tda, coma, pra = stats(ta) + sdb, tdb, comb, prb = stats(tb) + + print(f" sync drafts accepted per step: {sda} (total {tda}/{pra} = {tda/pra:.1%})") + print(f" async drafts accepted per step: {sdb} (total {tdb}/{prb} = {tdb/prb:.1%})") + print(f" sync completion tokens (drafts+recoveries): {coma}") + print(f" async completion tokens: {comb}") + + # How many of the sync-trace (suffix, recovery) pairs also appear in async trace? + common = set(map(lambda x: (tuple(x[0]), x[1]), ta)) & set(map(lambda x: (tuple(x[0]), x[1]), tb)) + print(f" shared (suffix, recovery) pairs: {len(common)} " + f"(sync unique={len(ta) - len(common)}, async unique={len(tb) - len(common)})") + + # Recovery tokens alone — match the actual per-recovery token trace. + sync_recs = [r for _, r in ta] + asn_recs = [r for _, r in tb] + print(f" recovery tokens equal (as sequence)? {sync_recs == asn_recs}") + + # If recovery sequences are subsequences of each other (async = sync with extras) + if len(sync_recs) <= len(asn_recs): + shorter, longer = sync_recs, asn_recs + label = "sync subseq of async" + else: + shorter, longer = asn_recs, sync_recs + label = "async subseq of sync" + def is_subseq(s, l): + it = iter(l) + return all(any(x == y for y in it) for x in s) + print(f" {label}: {is_subseq(shorter, longer)}") + print() + + +if __name__ == "__main__": + main() diff --git a/tests/e2e/test_batch_independence.py b/tests/e2e/test_batch_independence.py new file mode 100644 index 000000000..5254911c5 --- /dev/null +++ b/tests/e2e/test_batch_independence.py @@ -0,0 +1,36 @@ +"""Tier 1 / I4: greedy output of a prompt is independent of batch position. + +Running a prompt alone (batch=1) must produce the same greedy tokens as +running the same prompt at any position in a batch of prompts, since greedy +decoding has no cross-sequence dependencies. +""" +from __future__ import annotations + +import pytest + +from ._helpers import ( + CANONICAL_PROMPTS, + base_config, + require_8b_target, + run_llm_subprocess, +) + + +@pytest.mark.tier1 +def test_prompt_output_independent_of_batch_position(): + target = require_8b_target() + p = CANONICAL_PROMPTS[0] + other = CANONICAL_PROMPTS[1] + + solo_cfg = {**base_config([p]), "model": target, "max_new_tokens": 12, "num_gpus": 1, "enforce_eager": True, "max_num_seqs": 1} + batched_cfg = {**base_config([p, other]), "model": target, "max_new_tokens": 12, "num_gpus": 1, "enforce_eager": True, "max_num_seqs": 2} + + solo = run_llm_subprocess(solo_cfg) + batched = run_llm_subprocess(batched_cfg) + + # Output order matches input order (see llm_engine.generate). + assert solo["token_ids"][0] == batched["token_ids"][0], ( + f"prompt output changed with batch position:\n" + f" solo[0] = {solo['token_ids'][0]}\n" + f" batched[0] = {batched['token_ids'][0]}" + ) diff --git a/tests/e2e/test_cudagraph_vs_eager.py b/tests/e2e/test_cudagraph_vs_eager.py new file mode 100644 index 000000000..1ff3ce0cf --- /dev/null +++ b/tests/e2e/test_cudagraph_vs_eager.py @@ -0,0 +1,36 @@ +"""Tier 1 / I3: CUDA-graph decode ≡ eager decode (greedy). + +Target-only decode with enforce_eager=True must produce the same tokens as +with CUDA graphs enabled. Tests catch bugs introduced during graph capture +(e.g. missed variable updates, padding errors). +""" +from __future__ import annotations + +import pytest + +from ._helpers import ( + CANONICAL_PROMPTS, + base_config, + require_8b_target, + run_llm_subprocess, +) + + +@pytest.mark.tier1 +def test_cudagraph_vs_eager_target_only(): + target = require_8b_target() + prompts = CANONICAL_PROMPTS + + common = {**base_config(prompts), "model": target, "max_new_tokens": 16, "num_gpus": 1} + + eager_cfg = {**common, "enforce_eager": True} + graph_cfg = {**common, "enforce_eager": False} + + eager = run_llm_subprocess(eager_cfg) + graph = run_llm_subprocess(graph_cfg) + + assert eager["token_ids"] == graph["token_ids"], ( + f"cudagraph vs eager mismatch (target-only greedy):\n" + f" eager = {eager['token_ids']}\n" + f" graph = {graph['token_ids']}\n" + ) diff --git a/tests/e2e/test_greedy_strategy_equivalence.py b/tests/e2e/test_greedy_strategy_equivalence.py new file mode 100644 index 000000000..c7c16c4da --- /dev/null +++ b/tests/e2e/test_greedy_strategy_equivalence.py @@ -0,0 +1,66 @@ +"""Tier 1 / I2: in greedy mode, force-jit ≡ jit ≡ fast. + +In greedy sampling the target's argmax solely determines the output; what the +draft proposes only changes *speed* and *acceptance rate*. So all three async +backup strategies must produce the same final token stream for the same prompts +with temperature=0. + +Note: `fast` mode returns all-zero speculations on cache misses, which means +the target will reject every speculated token on a miss and sample the recovery +directly. That still yields the same greedy tokens, just one at a time. +""" +from __future__ import annotations + +import pytest + +from ._helpers import ( + CANONICAL_PROMPTS, + base_config, + require_1b_draft, + require_8b_target, + run_llm_subprocess, +) + + +def _async_cfg(prompts, *, target, draft, backup: str): + """Build an async-spec config with the given backup strategy.""" + cfg = { + **base_config(prompts), + "model": target, "draft": draft, + "speculate": True, "draft_async": True, + "speculate_k": 2, "async_fan_out": 2, + "enforce_eager": True, + "num_gpus": 2, + "max_new_tokens": 12, + } + if backup == "force-jit": + cfg["force_jit_speculate"] = True + cfg["jit_speculate"] = True + elif backup == "jit": + cfg["force_jit_speculate"] = False + cfg["jit_speculate"] = True + elif backup == "fast": + cfg["force_jit_speculate"] = False + cfg["jit_speculate"] = False + else: + raise ValueError(backup) + return cfg + + +@pytest.mark.tier1 +def test_force_jit_jit_fast_match_greedy(): + target = require_8b_target() + draft = require_1b_draft() + prompts = [CANONICAL_PROMPTS[0]] + + results = { + b: run_llm_subprocess(_async_cfg(prompts, target=target, draft=draft, backup=b)) + for b in ("force-jit", "jit", "fast") + } + + fj = results["force-jit"]["token_ids"] + jt = results["jit"]["token_ids"] + ft = results["fast"]["token_ids"] + + assert fj == jt, f"force-jit ≠ jit\n force-jit={fj}\n jit={jt}" + assert fj == ft, f"force-jit ≠ fast\n force-jit={fj}\n fast={ft}" diff --git a/tests/e2e/test_preemption.py b/tests/e2e/test_preemption.py new file mode 100644 index 000000000..8adef9335 --- /dev/null +++ b/tests/e2e/test_preemption.py @@ -0,0 +1,48 @@ +"""Tier 1 / I6: preemption round-trip preserves greedy output. + +When KV-cache blocks are scarce, the scheduler preempts running sequences +(deallocates their blocks, moves them back to waiting, then re-prefills). The +final generated tokens must equal those of an un-preempted run. + +We force preemption by configuring `num_kvcache_blocks` to a tight value with +`max_num_seqs > 1`, so the second sequence cannot fit without preempting the +first. Compare to a run with plenty of blocks (no preemption). +""" +from __future__ import annotations + +import pytest + +from ._helpers import ( + CANONICAL_PROMPTS, + base_config, + require_8b_target, + run_llm_subprocess, +) + + +@pytest.mark.tier1 +def test_preemption_matches_unpreempted_output(): + target = require_8b_target() + prompts = CANONICAL_PROMPTS + + # Both runs use the same prompts and sampling; only num_kvcache_blocks differs. + common = { + **base_config(prompts), + "model": target, + "max_new_tokens": 16, + "max_num_seqs": 2, + "num_gpus": 1, + "enforce_eager": True, + "kvcache_block_size": 256, + } + unpreempted = run_llm_subprocess({**common, "num_kvcache_blocks": 512}) + # With block_size=256 and max_model_len=2048, each seq can need up to 8 blocks. + # Setting num_kvcache_blocks=10 with two sequences and prompts of ~16 tokens forces + # preemption when a second sequence's blocks can't be appended. + preempted = run_llm_subprocess({**common, "num_kvcache_blocks": 10}) + + assert unpreempted["token_ids"] == preempted["token_ids"], ( + f"preempted run diverged from unpreempted (same greedy prompts):\n" + f" unpreempted = {unpreempted['token_ids']}\n" + f" preempted = {preempted['token_ids']}" + ) diff --git a/tests/e2e/test_prefix_cache.py b/tests/e2e/test_prefix_cache.py new file mode 100644 index 000000000..3647bb847 --- /dev/null +++ b/tests/e2e/test_prefix_cache.py @@ -0,0 +1,42 @@ +"""Tier 1 / I5: shared-prefix prefix caching. + +When two prompts share a prefix, the block manager must reuse blocks for the +shared region. Operationally: running two identical prompts in one batch must +produce the same output for both, and prefill should account for the shared +blocks (e.g. fewer newly allocated blocks than for a non-sharing batch). + +We check the output-equivalence condition as the primary signal, since +prefix-caching bugs typically manifest as one sequence getting the other's +cached logits and diverging in output. +""" +from __future__ import annotations + +import pytest + +from ._helpers import ( + CANONICAL_PROMPTS, + base_config, + require_8b_target, + run_llm_subprocess, +) + + +@pytest.mark.tier1 +def test_duplicate_prompt_yields_identical_outputs(): + target = require_8b_target() + # A long-ish prompt to ensure at least one full block is shared. + p = "The following is a detailed explanation of the theory of relativity, which was proposed by Albert Einstein in the early twentieth century. It states that" + cfg = { + **base_config([p, p]), + "model": target, + "max_new_tokens": 12, + "max_num_seqs": 2, + "num_gpus": 1, + "enforce_eager": True, + } + out = run_llm_subprocess(cfg) + assert out["token_ids"][0] == out["token_ids"][1], ( + f"duplicate prompts produced different outputs (prefix-cache bug?):\n" + f" [0] = {out['token_ids'][0]}\n" + f" [1] = {out['token_ids'][1]}" + ) diff --git a/tests/e2e/test_sync_vs_force_jit.py b/tests/e2e/test_sync_vs_force_jit.py new file mode 100644 index 000000000..30c1a6a20 --- /dev/null +++ b/tests/e2e/test_sync_vs_force_jit.py @@ -0,0 +1,197 @@ +"""Tier 1 / I1: synchronous speculative decoding ≡ async+force-jit (greedy). + +`force-jit` in async mode always runs the draft synchronously — so the only +difference between it and sync spec (`draft_async=False`) is process topology +(separate target/draft processes vs. colocated on rank 0). In greedy mode the +two must agree on: +1. final generated token stream (bitwise identical), and +2. per-step acceptance trace — for every verify step, the accepted suffix + (previous recovery + accepted draft tokens) and the new recovery token + must match across both configurations for the same seq_id. + +The per-step comparison (2) is the stronger check: it verifies the spec +algorithm's decision trace is identical, not merely the aggregate output. +""" +from __future__ import annotations + +import pytest + +from ._helpers import ( + CANONICAL_PROMPTS, + base_config, + require_1b_draft, + require_8b_target, + run_llm_subprocess, +) + + +def _sync_cfg(prompts, target, draft, max_new_tokens, k=2): + return { + **base_config(prompts), "model": target, "draft": draft, + "speculate": True, "draft_async": False, + "speculate_k": k, + "max_new_tokens": max_new_tokens, "enforce_eager": True, "num_gpus": 1, + } + + +def _async_forcejit_cfg(prompts, target, draft, max_new_tokens, k=2): + return { + **base_config(prompts), "model": target, "draft": draft, + "speculate": True, "draft_async": True, + "force_jit_speculate": True, "jit_speculate": True, + "speculate_k": k, "async_fan_out": 2, + "max_new_tokens": max_new_tokens, "enforce_eager": True, "num_gpus": 2, + } + + +def _per_seq_trace(trace): + """Group a per-step trace into a per-sequence trace. + + Returns dict[canonical_seq_idx, list[(suffix, recovery)]] where + canonical_seq_idx is 0..N-1 assigned in first-appearance order (the raw + seq_ids come from a process-global counter and differ across configs). + + Comparing per-sequence traces is the right level of strictness for + sync-vs-async+force-jit equivalence: different sequences can complete in + different numbers of steps (e.g. one sequence keeps accepting multi-token + suffixes while another accepts single tokens), so the aggregate step count + and per-step batch composition legitimately differ between modes. What must + agree is each individual sequence's trace. + """ + id_map: dict[int, int] = {} + per_seq: dict[int, list[tuple[list[int], int]]] = {} + for step in trace: + for seq_id, suffix, rec in step: + if seq_id not in id_map: + id_map[seq_id] = len(id_map) + per_seq[id_map[seq_id]] = [] + per_seq[id_map[seq_id]].append((list(suffix), int(rec))) + return per_seq + + +def _assert_traces_equal(sync_trace, async_trace, *, context): + a = _per_seq_trace(sync_trace) + b = _per_seq_trace(async_trace) + assert a.keys() == b.keys(), ( + f"{context}: different set of sequences — sync={sorted(a)}, async={sorted(b)}" + ) + for seq_idx in sorted(a.keys()): + assert a[seq_idx] == b[seq_idx], ( + f"{context}: per-sequence trace diverges for seq #{seq_idx}\n" + f" sync ({len(a[seq_idx])} steps) = {a[seq_idx]}\n" + f" async ({len(b[seq_idx])} steps) = {b[seq_idx]}" + ) + + +@pytest.mark.tier1 +@pytest.mark.smoke +def test_single_prompt_greedy_matches_tokens_and_trace(): + """I1 smoke: one prompt, force-jit must match sync-spec on both token stream and per-step trace.""" + target = require_8b_target() + draft = require_1b_draft() + prompts = [CANONICAL_PROMPTS[0]] + + sync_out = run_llm_subprocess( + _sync_cfg(prompts, target, draft, max_new_tokens=12), trace_accepts=True, + ) + async_out = run_llm_subprocess( + _async_forcejit_cfg(prompts, target, draft, max_new_tokens=12), trace_accepts=True, + ) + + # (1) Final token streams agree + assert sync_out["token_ids"] == async_out["token_ids"], ( + f"token_ids mismatch:\n sync = {sync_out['token_ids']}\n async = {async_out['token_ids']}" + ) + # (2) Per-step accept traces agree + assert "per_step_accepts" in sync_out and "per_step_accepts" in async_out, ( + "per_step_accepts missing — trace_accepts=True did not propagate" + ) + _assert_traces_equal( + sync_out["per_step_accepts"], async_out["per_step_accepts"], + context="sync vs async+force-jit (single prompt)", + ) + + +@pytest.mark.tier1 +def test_multi_prompt_greedy_matches_tokens(): + """I1: multiple prompts, final token streams match between sync-spec and async+force-jit.""" + target = require_8b_target() + draft = require_1b_draft() + prompts = CANONICAL_PROMPTS + + sync_out = run_llm_subprocess(_sync_cfg(prompts, target, draft, max_new_tokens=16)) + async_out = run_llm_subprocess(_async_forcejit_cfg(prompts, target, draft, max_new_tokens=16)) + assert sync_out["token_ids"] == async_out["token_ids"] + + +@pytest.mark.tier1 +def test_multi_prompt_first_seq_trace_matches_at_longer_length(): + """I1: in a 2-prompt batch, seq #0 (the first prompt in canonical order) has + an identical per-step accept trace under sync-spec and async+force-jit for a + generation length well beyond max_new_tokens=16. + + Seq #0 equality held at length=16 (see `test_multi_prompt_greedy_matches_tokens` + and the accompanying `_trace_analysis.py`). This test verifies that equality + *continues* to hold as the generation runs longer — ruling out the possibility + that seq #0 was only passing by coincidence for short outputs. + + Seq #1 is known to diverge on per-step traces (same final tokens, different + acceptance schedule); see `test_multi_prompt_greedy_matches_trace` for the + full-batch check that records that divergence. + """ + target = require_8b_target() + draft = require_1b_draft() + prompts = CANONICAL_PROMPTS + long_n = 64 # 4× the default — enough to catch drift that accumulates over time + + sync_out = run_llm_subprocess( + _sync_cfg(prompts, target, draft, max_new_tokens=long_n), trace_accepts=True, + ) + async_out = run_llm_subprocess( + _async_forcejit_cfg(prompts, target, draft, max_new_tokens=long_n), trace_accepts=True, + ) + + a = _per_seq_trace(sync_out["per_step_accepts"]) + b = _per_seq_trace(async_out["per_step_accepts"]) + assert 0 in a and 0 in b, "seq #0 missing from one of the traces" + assert a[0] == b[0], ( + f"seq #0 per-step accept trace diverges at max_new_tokens={long_n}\n" + f" sync ({len(a[0])} steps) = {a[0]}\n" + f" async ({len(b[0])} steps) = {b[0]}" + ) + + +@pytest.mark.tier1 +@pytest.mark.xfail( + reason=( + "Known divergence on multi-prompt batches: async+force-jit and sync-spec " + "produce the same final tokens but diverging per-step acceptance traces " + "for seq #1 (second prompt in the batch). Seq #0 matches exactly — see " + "test_multi_prompt_first_seq_trace_matches_at_longer_length. Hypothesis: " + "tree-attention vs linear-decode produces subtly different draft logits " + "at non-zero batch positions, or KV rollback after partial accepts drifts " + "state for the second sequence." + ), + strict=True, +) +def test_multi_prompt_greedy_matches_trace(): + """I1 (xfail): tighter version of the multi-prompt check — per-step accept trace equality. + + This test is marked xfail (strict) to record the finding; if a future change + to the async path makes this pass, the xfail assertion will flip to a real + failure, flagging the behavioral change for review. + """ + target = require_8b_target() + draft = require_1b_draft() + prompts = CANONICAL_PROMPTS + + sync_out = run_llm_subprocess( + _sync_cfg(prompts, target, draft, max_new_tokens=16), trace_accepts=True, + ) + async_out = run_llm_subprocess( + _async_forcejit_cfg(prompts, target, draft, max_new_tokens=16), trace_accepts=True, + ) + _assert_traces_equal( + sync_out["per_step_accepts"], async_out["per_step_accepts"], + context="sync vs async+force-jit (multi prompt)", + ) diff --git a/tests/hf/__init__.py b/tests/hf/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/hf/eagle3_hf.py b/tests/hf/eagle3_hf.py new file mode 100644 index 000000000..0d6065c84 --- /dev/null +++ b/tests/hf/eagle3_hf.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import argparse +import glob +import os + +import torch +import torch.nn.functional as F +from torch import nn +from safetensors.torch import load_file +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig +from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm + + +EAGLE_LAYERS_LLAMA_8B = [2, 16, 29] # set in ssd/config.py for L=32 +D_MODEL_TARGET_LLAMA_8B = 4096 + + +# --------------------------------------------------------------------------- +# Minimal from-scratch Eagle3 model. SpecForge keys land here cleanly. +# --------------------------------------------------------------------------- +class Eagle3Attention(nn.Module): + def __init__(self, cfg): + super().__init__() + self.nh = cfg.num_attention_heads + self.nkh = cfg.num_key_value_heads + self.hd = getattr(cfg, "head_dim", None) or (cfg.hidden_size // self.nh) + self.scale = self.hd ** -0.5 + # qkv input dim is 2*hidden (concat of embeds and target_hidden, post-norm). + in_dim = 2 * cfg.hidden_size + self.q_proj = nn.Linear(in_dim, self.nh * self.hd, bias=False) + self.k_proj = nn.Linear(in_dim, self.nkh * self.hd, bias=False) + self.v_proj = nn.Linear(in_dim, self.nkh * self.hd, bias=False) + self.o_proj = nn.Linear(self.nh * self.hd, cfg.hidden_size, bias=False) + self.rope_theta = getattr(cfg, "rope_theta", 10000.0) + inv_freq = 1.0 / ( + self.rope_theta ** (torch.arange(0, self.hd, 2, dtype=torch.float32) / self.hd) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def _rope(self, positions, x): + # x: [T, H, D]; positions: [T]. Matches HF Llama's interleaved-pair RoPE. + pos_f = positions.float() + freqs = torch.outer(pos_f, self.inv_freq.to(pos_f.device)) # [T, D/2] + cos = freqs.cos().unsqueeze(1) # [T, 1, D/2] + sin = freqs.sin().unsqueeze(1) + # HF Llama's default RoPE: split the last dim into HALVES (not even/odd). + d = x.shape[-1] + half = d // 2 + x1 = x[..., :half] + x2 = x[..., half:] + rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1) + return rotated.to(x.dtype) + + def forward(self, positions, h): + # h: [T, 2*hidden] (after concat+norms); positions: [T]. + q = self.q_proj(h).view(-1, self.nh, self.hd) + k = self.k_proj(h).view(-1, self.nkh, self.hd) + v = self.v_proj(h).view(-1, self.nkh, self.hd) + q = self._rope(positions, q) + k = self._rope(positions, k) + # Stash post-rotary K and V for per-step dumps (diagnostic only). + self.last_k = k.detach().contiguous() + self.last_v = v.detach().contiguous() + # SDPA: [B=1, H, T, D] + o = F.scaled_dot_product_attention( + q.transpose(0, 1).unsqueeze(0), + k.transpose(0, 1).unsqueeze(0), + v.transpose(0, 1).unsqueeze(0), + is_causal=True, scale=self.scale, enable_gqa=True, + ) + o = o.squeeze(0).transpose(0, 1).contiguous().view(-1, self.nh * self.hd) + return self.o_proj(o) + + +class Eagle3DecoderLayer(nn.Module): + def __init__(self, cfg): + super().__init__() + self.self_attn = Eagle3Attention(cfg) + self.mlp = LlamaMLP(cfg) + self.input_layernorm = LlamaRMSNorm(cfg.hidden_size, eps=cfg.rms_norm_eps) + self.hidden_norm = LlamaRMSNorm(cfg.hidden_size, eps=cfg.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(cfg.hidden_size, eps=cfg.rms_norm_eps) + + def forward(self, positions, embeds, target_h_proj): + # Matches upstream sglang/llama_eagle3.py exactly. + residual = target_h_proj + embeds_n = self.input_layernorm(embeds) + hidden_n = self.hidden_norm(target_h_proj) + combined = torch.cat([embeds_n, hidden_n], dim=-1) + attn_out = self.self_attn(positions, combined) + # Fused add+norm equivalent: return (mlp(norm(attn+res)), attn+res). + new_res = attn_out + residual + normed = self.post_attention_layernorm(new_res) + mlp_out = self.mlp(normed) + return mlp_out + new_res # the "prenorm" sum used for final_norm + + +class Eagle3Model(nn.Module): + def __init__(self, cfg, d_model_target, device: str = "cuda"): + super().__init__() + self.config = cfg + self.device = device + self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.hidden_size) + self.fc = nn.Linear(3 * d_model_target, cfg.hidden_size, bias=False) + self.midlayer = Eagle3DecoderLayer(cfg) + self.norm = LlamaRMSNorm(cfg.hidden_size, eps=cfg.rms_norm_eps) + self.lm_head = nn.Linear(cfg.hidden_size, cfg.draft_vocab_size, bias=False) + self.register_buffer( + "d2t", torch.zeros(cfg.draft_vocab_size, dtype=torch.long), persistent=False, + ) + + def forward(self, input_ids, target_hidden): + # input_ids: [T]; target_hidden: [T, 3*D_target]. + embeds = self.embed_tokens(input_ids) + target_h_proj = self.fc(target_hidden.to(self.fc.weight.dtype)) + positions = torch.arange(input_ids.shape[0], device=input_ids.device) + prenorm = self.midlayer(positions, embeds, target_h_proj) + final = self.norm(prenorm) + return F.linear(final, self.lm_head.weight) # [T, draft_vocab] + + def forward_with_cond(self, input_ids, positions, cond): + """Like forward() but takes a pre-projected conditioning stream + (shape [T, hidden_size]) so callers can mix target-hidden and + draft-hidden conditioning per-position. Returns prenorm (pre- + final_norm hidden states).""" + embeds = self.embed_tokens(input_ids) + return self.midlayer(positions, embeds, cond) + + def draft_tok_to_target(self, draft_idx: int) -> int: + return int(draft_idx) + int(self.d2t[draft_idx].item()) + + +def load_eagle3_specforge( + path: str, target_embed: torch.Tensor, d_model_target: int, device: str = "cuda", dtype=torch.bfloat16, +) -> Eagle3Model: + if not os.path.exists(os.path.join(path, "config.json")): + hits = glob.glob(os.path.join(path, "snapshots", "*", "config.json")) + assert hits, f"no config.json under {path}" + path = os.path.dirname(hits[0]) + + cfg = LlamaConfig.from_pretrained(path) + model = Eagle3Model(cfg, d_model_target, device=device).to(dtype) + + sd = load_file(glob.glob(os.path.join(path, "*.safetensors"))[0]) + with torch.no_grad(): + model.d2t.copy_(sd["d2t"].long()) + model.fc.weight.copy_(sd["fc.weight"]) + model.norm.weight.copy_(sd["norm.weight"]) + model.lm_head.weight.copy_(sd["lm_head.weight"]) + ml = model.midlayer + ml.self_attn.q_proj.weight.copy_(sd["midlayer.self_attn.q_proj.weight"]) + ml.self_attn.k_proj.weight.copy_(sd["midlayer.self_attn.k_proj.weight"]) + ml.self_attn.v_proj.weight.copy_(sd["midlayer.self_attn.v_proj.weight"]) + ml.self_attn.o_proj.weight.copy_(sd["midlayer.self_attn.o_proj.weight"]) + ml.mlp.gate_proj.weight.copy_(sd["midlayer.mlp.gate_proj.weight"]) + ml.mlp.up_proj.weight.copy_(sd["midlayer.mlp.up_proj.weight"]) + ml.mlp.down_proj.weight.copy_(sd["midlayer.mlp.down_proj.weight"]) + ml.input_layernorm.weight.copy_(sd["midlayer.input_layernorm.weight"]) + ml.hidden_norm.weight.copy_(sd["midlayer.hidden_norm.weight"]) + ml.post_attention_layernorm.weight.copy_(sd["midlayer.post_attention_layernorm.weight"]) + # embed_tokens is shared with the target. + model.embed_tokens.weight.copy_(target_embed.to(dtype)) + return model.to(device, dtype=dtype) diff --git a/tests/hf/helpers.py b/tests/hf/helpers.py new file mode 100644 index 000000000..c1d96e667 --- /dev/null +++ b/tests/hf/helpers.py @@ -0,0 +1,185 @@ +"""Helpers used by Tier 1 E2E tests. + +Runs the `_runner.py` subprocess with a given config and returns the parsed +JSON result. Each test invokes this multiple times with different configs and +asserts that the (greedy) token outputs match. +""" +from __future__ import annotations + +import json +import os +from pathlib import Path +import psutil +import requests +import subprocess +import sys +import signal +import time + + +TGL_BASE_DIR = "/work/avner/git/tgl" + +# Canonical local model snapshots (8B target + 1B standalone draft). +LLAMA_3_1_8B_SNAPSHOT = "/data/shared/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659" +LLAMA_3_2_1B_SNAPSHOT = "/data/shared/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6" +EAGLE3_8B_SNAPSHOT = "/data/shared/huggingface/hub/models--yuhuili--EAGLE3-LLaMA3.1-Instruct-8B/snapshots/61aa096484ad9752292507b0cc9973bb423abb35" + +QWEN3_8B_SNAPSHOT = "/data/shared/huggingface/hub/models--Qwen--Qwen3-8B/snapshots/b968826d9c46dd6066d109eabc6255188de91218" +QWEN3_0_6B_SNAPSHOT = "/data/shared/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca" + +# EAGLE3 draft models (for use with `use_eagle=True`). +EAGLE3_LLAMA_8B_SNAPSHOT = "/data/shared/huggingface/hub/models--lmsys--SGLang-EAGLE3-Llama-3.1-8B-Instruct-SpecForge/snapshots/4a8e38f7dbee5d6dc82369f59a58540855fe09af" +EAGLE3_QWEN3_8B_SNAPSHOT = "/data/shared/huggingface/hub/models--AngelSlim--Qwen3-8B_eagle3/snapshots/9629dfce7a4a10564dd48d3e5485c3976095653c" + + +def require_8b_target() -> str: + assert Path(LLAMA_3_1_8B_SNAPSHOT).is_dir(), f"Llama-3.1-8B snapshot not found at {LLAMA_3_1_8B_SNAPSHOT}" + return LLAMA_3_1_8B_SNAPSHOT + + +def require_1b_draft() -> str: + assert Path(LLAMA_3_2_1B_SNAPSHOT).is_dir(), f"Llama-3.2-1B snapshot not found at {LLAMA_3_2_1B_SNAPSHOT}" + return LLAMA_3_2_1B_SNAPSHOT + + +def require_qwen3_8b_target() -> str: + assert Path(QWEN3_8B_SNAPSHOT).is_dir(), f"Qwen3-8B snapshot not found at {QWEN3_8B_SNAPSHOT}" + return QWEN3_8B_SNAPSHOT + + +def require_qwen3_0p6b_draft() -> str: + assert Path(QWEN3_0_6B_SNAPSHOT).is_dir(), f"Qwen3-0.6B snapshot not found at {QWEN3_0_6B_SNAPSHOT}" + return QWEN3_0_6B_SNAPSHOT + + +def require_eagle_llama_8b_draft() -> str: + assert Path(EAGLE3_LLAMA_8B_SNAPSHOT).is_dir(), f"EAGLE3-LLaMA3.1 snapshot not found at {EAGLE3_LLAMA_8B_SNAPSHOT}" + return EAGLE3_LLAMA_8B_SNAPSHOT + + +def require_eagle_qwen3_8b_draft() -> str: + assert Path(EAGLE3_QWEN3_8B_SNAPSHOT).is_dir(), f"EAGLE3 Qwen3 snapshot not found at {EAGLE3_QWEN3_8B_SNAPSHOT}" + return EAGLE3_QWEN3_8B_SNAPSHOT + + +def _get_speculative_algorithm(speculator_type: str) -> str: + if speculator_type == "standalone": + return "ASYNC_STANDALONE" + elif speculator_type == "sync_standalone": + return "STANDALONE" + elif speculator_type == "eagle": + return "ASYNC_EAGLE3" + elif speculator_type == "sync_eagle": + return "EAGLE3" + else: + raise ValueError(f"unknown speculator type: {speculator_type}") + + +def launch_tgl_server( + speculator_type: str, + backup: str, + target: str, + draft: str, + lookahead: int, + fanout: int, + port: int, + cross_node: bool = False, +): + env = os.environ.copy() + env["NCCL_CUMEM_ENABLE"] = "0" # match sglang; avoids P2P/IPC vs P2P/CUMEM mismatch on same-node + cmd = [ + # sys.executable, "-m", "sglang.launch_server", + "sglang", "serve", + "--model-path", target, + "--speculative-algorithm", _get_speculative_algorithm(speculator_type), + "--speculative-draft-model-path", draft, + "--tp", "1", "--mem-fraction-static", "0.7", + "--max-running-requests", "1", + "--log-level", "warning", + "--port", str(port), + "--context-length", "2048", + "--dtype", "bfloat16", + "--skip-server-warmup", + ### THESE ARE FOR DYNAMIC LOOKAHEAD TEST + # "--speculative-num-steps", str(8), + # "--speculative-num-draft-tokens", str(8 + 1), + # "--speculative-num-steps-list", "[3,3,4,5,6,7,8]", + ### ABOVE ARE FOR DYNAMIC LOOKAHEAD TEST + "--speculative-num-steps", str(lookahead), + "--speculative-num-draft-tokens", str(lookahead + 1), + "--speculative-eagle-topk", "1", + "--page-size", "64", + "--speculative-async-communicate-cache-hits", + "--speculative-async-communicate-logits", + # "--disable-cuda-graph", + ] + + if speculator_type in ["standalone", "eagle"]: + if backup == "force-jit": + cmd.append("--speculative-async-jit-speculate") + cmd.append("--speculative-async-force-jit-speculate") + elif backup == "jit": + cmd.append("--speculative-async-jit-speculate") + + if cross_node: + cmd.append("--speculative-async-remote-draft") + + print(f"[tgl] Launching server: {' '.join(cmd)}", flush=True) + server_process = subprocess.Popen(cmd, start_new_session=True, env=env) + draft_process = None + + if cross_node: + draft_cmd = [ + "python", f"{TGL_BASE_DIR}/scripts/launch_remote_draft.py", + "--draft-model-path", draft, + "--target-host", "localhost", + "--gpu-id", "1", + "--speculate-k", str(lookahead), + "--max-model-len", "4096", + "--fan-out", str(fanout), + ] + if backup == "jit" or backup == "force-jit": + draft_cmd.append("--jit-speculate") + if backup == "force-jit": + draft_cmd.append("--force-jit-speculate") + + print(f"[tgl] Launching draft: {' '.join(draft_cmd)}", flush=True) + draft_process = subprocess.Popen(draft_cmd, start_new_session=True, env=env) + return server_process, draft_process + + +def wait_for_server(port: int, timeout: int = 300) -> bool: + deadline = time.time() + timeout + print(f"[tgl] waiting for server", flush=True) + while time.time() < deadline: + try: + if requests.get( + f"http://localhost:{port}/health", timeout=2, + ).status_code == 200: + print(f"[tgl] server health check passed", flush=True) + return True + except Exception: + pass + time.sleep(3) + print(f"[tgl] server health check timed out", flush=True) + return False + + + +def kill_server(proc: subprocess.Popen) -> None: + try: + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) + print(f"[tgl] killed server", flush=True) + except (ProcessLookupError, PermissionError): + print(f"[tgl] failed to kill server", flush=True) + pass + # Close pipes so wait() doesn't block on buffer drainage + for fd in (proc.stdout, proc.stderr, proc.stdin): + if fd: + print(f"[tgl] closing pipe {fd}", flush=True) + try: + fd.close() + print(f"[tgl] closed pipe {fd}", flush=True) + except Exception: + print(f"[tgl] failed to close pipe {fd}", flush=True) + pass diff --git a/tests/hf/test_ssd_vs_hf_reference.py b/tests/hf/test_ssd_vs_hf_reference.py new file mode 100644 index 000000000..e00ae9278 --- /dev/null +++ b/tests/hf/test_ssd_vs_hf_reference.py @@ -0,0 +1,806 @@ +import os +from pathlib import Path + +import pytest +import requests +import torch +import numpy as np + +from transformers import AutoModelForCausalLM, AutoTokenizer + +from ssd import LLM, SamplingParams +from .eagle3_hf import Eagle3Model, load_eagle3_specforge +from .helpers import require_8b_target, require_eagle_llama_8b_draft, require_1b_draft, launch_tgl_server, wait_for_server, kill_server + + +PORT = 40023 +LOGIT_GAP_THRESHOLD = 0.3 +EAGLE_LAYERS = [2, 16, 29] +D_MODEL = 4096 + +ASYNC_BACKUPS = ["force-jit", "jit", "fast"] +SPECULATOR_TYPES = ["standalone", "eagle"] +CROSS_NODE = [True, False] + +# @pytest.mark.parametrize("speculator_type", ["standalone"]) +# @pytest.mark.parametrize("cross_node", [False]) +# @pytest.mark.parametrize("backup", ["force-jit"]) +@pytest.mark.parametrize("backup", ["force-jit","jit"]) # [None]) +@pytest.mark.parametrize("speculator_type", ["eagle"]) +@pytest.mark.parametrize("cross_node", [False]) +@pytest.mark.parametrize("engine", ["ssd"]) +@pytest.mark.parametrize("max_new_tokens", [128]) +def test_ssd_vs_hf_reference(backup, speculator_type, cross_node, engine, max_new_tokens, tmp_path): + lookahead = 4 + fanout = 3 + eagle = speculator_type in ["eagle", "sync_eagle"] + sync_speculator = speculator_type in ["sync_standalone", "sync_eagle"] + dtype = torch.bfloat16 + target_path = require_8b_target() + draft_path = require_eagle_llama_8b_draft() if eagle else require_1b_draft() + trace_dir = tmp_path / "trace" + trace_dir.mkdir(exist_ok=True) + os.environ["SSD_DUMP_TENSORS_DIR"] = str(trace_dir) + print(f"================================================================================") + print(f"[{engine}] Launching {engine} engine with speculator type {speculator_type} and backup {backup}, trace directory {trace_dir}, max new tokens {max_new_tokens}, cross node {cross_node}", flush=True) + print(f"================================================================================") + + tokenizer = AutoTokenizer.from_pretrained(target_path) + prompt_tokens = tokenizer.apply_chat_template( + [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Please tell me about the capital city of France."}], + add_generation_prompt=True, + ) + if isinstance(prompt_tokens, list): + print(f"[{engine}] BANANA: {prompt_tokens=}", flush=True) + else: + prompt_tokens = prompt_tokens["input_ids"] + + # For each engine, we initialize the engine, send a request to it, and then tear down the engine. + if engine == "tgl": + try: + tgl_server, draft_process = launch_tgl_server( + speculator_type, backup, target_path, draft_path, lookahead, fanout, PORT, cross_node=cross_node, + ) + + assert wait_for_server(PORT), "tgl server failed to start" + print(f"[{engine}] server up; sending request", flush=True) + + resp = requests.post( + f"http://localhost:{PORT}/generate", + json={ + "input_ids": prompt_tokens, + "sampling_params": { + "temperature": 0.0, + "max_new_tokens": max_new_tokens, + "ignore_eos": True, + }, + }, + ) + # Fields in the response json: + # 'completion_tokens': 128, 'e2e_latency': 1.4077615810092539, + # 'spec_accept_rate': 0.8166666666666667, 'spec_accept_length': 4.266666666666667, 'spec_accept_histogram': [4, 0, 2, 2, 22], + # 'spec_accept_token_num': 98, 'spec_draft_token_num': 120, 'spec_verify_ct': 30, + + assert resp.status_code == 200, "tgl server failed to generate" + print(f"[{engine}] response received", flush=True) + resp_json = resp.json() + print(f"[{engine}] response json: {resp_json}", flush=True) + # completion_text = resp_json["text"] + completion_tokens = resp_json["output_ids"] + print(f"[{engine}] prompt tokens: {prompt_tokens}", flush=True) + print(f"[{engine}] response tokens: {completion_tokens}", flush=True) + + except Exception as e: + print(f"[{engine}] error: {e}", flush=True) + pytest.fail(f"[{engine}] error: {e}") + + finally: + # TODO: We currently speedup the test by not killing the server; uncomment this when done debugging. + print(f"[{engine}] killing server", flush=True) + kill_server(tgl_server) + assert not wait_for_server(PORT, timeout=3.0), "tgl server failed to stop" + print(f"[{engine}] server stopped", flush=True) + + if cross_node: + print(f"[{engine}] killing draft process", flush=True) + kill_server(draft_process) + print(f"[{engine}] draft process stopped", flush=True) + + elif engine == "ssd": + ssd_kwargs = dict( + enforce_eager=False, + num_gpus=2, + speculate=True, + speculate_k=lookahead, + draft_async=True, + async_fan_out=fanout, + verbose=True, + draft=draft_path, + kvcache_block_size=64, + max_num_seqs=1, + max_model_len=4096, + jit_speculate=(backup == "jit" or backup == "force-jit"), + force_jit_speculate=(backup == "force-jit"), + communicate_cache_hits=True, + communicate_logits=True, + use_eagle=eagle, + eagle_layers=EAGLE_LAYERS if eagle else None, + ) + llm = None + try: + llm = LLM(target_path, **ssd_kwargs) + print(f"[{engine}] generating completion", flush=True) + output, metrics = llm.generate( + [prompt_tokens], + SamplingParams(max_new_tokens=max_new_tokens, temperature=0.0, ignore_eos=True), + use_tqdm=False, + ) + except Exception as e: + print(f"[{engine}] error: {e}", flush=True) + pytest.fail(f"[{engine}] error: {e}") + finally: + # Clean up the engine. + if llm is not None: + llm.exit(hard=False) + del llm + # Defensive: if LLM init raised partway, llm is None and exit() never ran, + # so the default process group set up inside ModelRunner.__init__ is still + # alive in this process. Without this, the next parametrize case fails with + # "trying to initialize the default process group twice". + try: + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + except Exception: + pass + import gc; gc.collect() + torch.cuda.empty_cache() + + completion_text = output[0]["text"] + print(f"[{engine}] completion text: {completion_text}", flush=True) + completion_tokens = output[0]["token_ids"] + print(f"[{engine}] completion tokens: {completion_tokens}", flush=True) + print(f"[{engine}] generation metrics: {metrics}", flush=True) + else: + raise ValueError(f"Unknown engine: {engine}") + + # COMPARE TGL RESPONSE TO HF REFERENCE. Ensure that + target_device = "cuda:4" + draft_device = "cuda:5" + + # Load target + print(f"[{engine}] begin load target model", flush=True) + target_model = AutoModelForCausalLM.from_pretrained(target_path, torch_dtype=dtype) + print(f"[{engine}] target model loaded", flush=True) + target_model.eval() + target_model.to(target_device) + + # COMPARE TGL RESPONSE TO HF REFERENCE. + print(f"====================================================") + print(f"[{engine}] Beginning comparison of completion to hf reference ({speculator_type}, {backup})") + print(f"=====================================================") + gaps, full_target_logits = compare_completion_to_hf_reference( + target_model, + prompt_tokens, + completion_tokens, + 0, + tokenizer, + engine=engine, + ) + assert max(gaps) < LOGIT_GAP_THRESHOLD, f"COMPARE COMPLETION TO HF REFERENCE: max gap {max(gaps)} exceeds threshold {LOGIT_GAP_THRESHOLD}, {gaps=}" + + if sync_speculator: + return + + # Load draft + if eagle: + draft_model = load_eagle3_specforge( + draft_path, target_model.model.embed_tokens.weight, target_model.config.hidden_size, draft_device, + dtype=dtype, + ) + draft_model.eval() + else: + assert speculator_type == "standalone" + draft_model = AutoModelForCausalLM.from_pretrained(draft_path, torch_dtype=dtype).to(draft_device) + draft_model.eval() + + + print(f"====================================================") + print(f"[{engine}] Beginning SSD simulation ({speculator_type}, {backup})") + print(f"=====================================================") + full_ssd_simulation( + target_model, + draft_model, + prompt_tokens, + completion_tokens, + backup=backup, + eagle=eagle, + lookahead=lookahead, + tokenizer=tokenizer, + ) + + # COMPARE SPECULATIONS TO HF REFERENCE + print(f"====================================================") + print(f"[{engine}] Beginning comparison of speculations to hf reference ({speculator_type}, {backup})") + print(f"=====================================================") + compare_speculations_to_hf_reference( + trace_dir, + target_model, + draft_model, + prompt_tokens, + completion_tokens, + eagle=eagle, + backup=backup, + tokenizer=tokenizer, + engine=engine, + full_target_logits=full_target_logits, + ) + + +def compare_completion_to_hf_reference( + model, + prefix: list[int], + completion: list[int], + request_index: int, + tokenizer: AutoTokenizer, + engine: str = "tgl", + full_target_logits: torch.Tensor = None, + verbose: bool = False, +): + completion_length = len(completion) + all_tokens = prefix + completion + hf_logits_for_completion = get_hf_logits_for_completion(model, all_tokens, completion_length) + gaps = [] + for i in range(completion_length): + completion_token = completion[i] + hf_logit = hf_logits_for_completion[i, completion_token] + hf_max_logit = hf_logits_for_completion[i].max() + gaps.append(torch.abs(hf_logit - hf_max_logit).item()) + + max_gap = max(gaps) + + greedy_preds = hf_logits_for_completion.argmax(dim=-1) + matching = tokenizer.decode(greedy_preds) == tokenizer.decode(completion) + match_str = "YES" if matching else " NO" + if verbose: + print("=============") + print(f"[{engine}][{request_index}][{match_str}] completion (hf reference): {tokenizer.decode(greedy_preds)}") + print(f"[{engine}][{request_index}][{match_str}] completion (engine - tgl): {tokenizer.decode(completion)}") + print(f"[{engine}][{request_index}][{match_str}] max gap: {max_gap}, gaps: {gaps}") + + if full_target_logits is not None: + full_target_logits = full_target_logits.to(hf_logits_for_completion.device) + norm_gaps = [] + for i in range(completion_length): + idx = len(prefix) + i + curr_logits = hf_logits_for_completion[i] + if idx > full_target_logits.shape[0] - 1: + break + target_logits = full_target_logits[idx] + target_probs = torch.softmax(target_logits, dim=-1) + curr_probs = torch.softmax(curr_logits, dim=-1) + norm_gaps.append(torch.linalg.norm(curr_probs - target_probs, ord=1).item()) + max_norm_gap = max(norm_gaps) if norm_gaps else 0.0 + print(f"[{engine}][{request_index}] max norm gap: {max_norm_gap}, norm gaps: {norm_gaps}") + + # pytest.set_trace() + return gaps, hf_logits_for_completion + + +def full_ssd_simulation( + target_model: AutoModelForCausalLM, + draft_model: AutoModelForCausalLM | Eagle3Model, + prompt_tokens: list[int], + completion_tokens: list[int], + backup: str = "force-jit", + eagle: bool = False, + lookahead: int = 4, + full_target_logits: torch.Tensor = None, + full_target_activations: torch.Tensor = None, # Note: These should already be projected into the draft space. + duplicate_first_token: bool = True, + tokenizer: AutoTokenizer = None, + fan_out: int = 5, + verbose: bool = False, +): + all_tokens = prompt_tokens + completion_tokens + all_tokens_tensor = torch.tensor([all_tokens], device=draft_model.device, dtype=torch.long) + draft_device = draft_model.device + dtype = draft_model.lm_head.weight.dtype + if full_target_activations is None and eagle: + full_target_activations = get_hf_target_activations_for_eagle(target_model, all_tokens).to(draft_model.device) + if duplicate_first_token: + full_target_activations = torch.cat([ + full_target_activations[:1], + full_target_activations + ]) + full_target_activations = draft_model.fc(full_target_activations.to(dtype=dtype)) + # print(f"[SIMULATION] full_target_activations.shape: {full_target_activations.shape}") + else: + raise ValueError("Unsupported at the moment") + + if full_target_logits is None: + full_target_logits = get_hf_logits(target_model, all_tokens).to(draft_model.device) + + target_preds = full_target_logits.argmax(dim=-1) + + acceptance_lengths = [] + cache_hits = [] + probability_gaps = [] + + cache_hit = False + generated = 1 # bonus token from prefill is already generated + while True: + ## SPECULATE ## + tokens_remaining = all_tokens_tensor.shape[1] - (len(prompt_tokens) + generated) + if tokens_remaining < lookahead: + break + + if eagle: + if backup == "force-jit" or (not cache_hit and backup == "jit") or cache_hit: + + # For cache hits, we don't have the target activations from the previous round. + if cache_hit and backup != "force-jit": + num_generated_last_round = acceptance_lengths[-1] + 1 + base_len = len(prompt_tokens) + generated - num_generated_last_round + # We do one extra draft pass (+1) to get the logits after the last speculated token, + # which are needed to check for cache hits when all tokens are accepted. + num_draft_passes = num_generated_last_round + lookahead + 1 + else: + base_len = len(prompt_tokens) + generated + # We do one extra draft pass (+1) to get the logits after the last speculated token, + # which are needed to check for cache hits when all tokens are accepted. + num_draft_passes = lookahead + 1 + current_activations = full_target_activations[:base_len] + for i in range(num_draft_passes): + curr_len = base_len + i + current_prefix = all_tokens_tensor[0, :curr_len] + # print(f"[SIMULATION] current_activations.shape: {current_activations.shape}") + if i > 0: + # print(f"[SIMULATION] draft_activations.shape: {draft_activations.shape}") + current_activations = torch.cat([current_activations, draft_activations[-1:]]) + draft_activations = draft_model.forward_with_cond(current_prefix, torch.arange(curr_len, device=draft_device), current_activations) + speculation_activations = draft_model.norm(draft_activations[-(lookahead + 1):]) + speculation_logits = draft_model.lm_head(speculation_activations) + speculation_logits = convert_to_full_vocab_logits(draft_model, speculation_logits) + speculation_preds = speculation_logits.argmax(dim=-1) + else: + # TODO: THIS IS NOT CORRECT. + # fast speculation + speculation_logits = torch.full((lookahead + 1, draft_model.config.vocab_size), float("-inf"), device=draft_device, dtype=dtype) + speculation_logits[:, 0] = 0.0 + speculation_preds = torch.zeros(lookahead + 1, device=draft_device, dtype=torch.long) + # # GLUE DECODE: After cache miss, we do a glue decode to get + # assert num_accepted == 0 + # curr_len = len(prompt_tokens) + generated + # current_prefix = all_tokens_tensor[0, :curr_len] + # current_activations = full_target_activations[:curr_len] + # draft_activations = draft_model.forward_with_cond(current_prefix, torch.arange(curr_len, device=draft_device), current_activations) + # speculation_activations = draft_model.norm(draft_activations[-1:]) + # speculation_logits = draft_model.lm_head(speculation_activations) + # speculation_logits = convert_to_full_vocab_logits(draft_model, speculation_logits) + else: + curr_len = len(prompt_tokens) + generated + lookahead + current_prefix = all_tokens_tensor[:, :curr_len] + if backup == "fast" and not cache_hit: + # fast speculation + # TODO: THIS IS NOT CORRECT. + speculation_logits = torch.full((lookahead + 1, draft_model.config.vocab_size), float("-inf"), device=draft_device, dtype=dtype) + speculation_logits[:, 0] = 0.0 + speculation_preds = torch.zeros(lookahead + 1, device=draft_device, dtype=torch.long) + else: + speculation_logits = draft_model.forward(current_prefix).logits[0] + speculation_logits = speculation_logits[-(lookahead + 1):] + # Note: speculation preds has an extra token at the end. + speculation_preds = speculation_logits.argmax(dim=-1) + ### END SPECULATE ### + + ### CHECK HOW MANY TOKENS ARE ACCEPTED ### + num_accepted = lookahead + for i in range(lookahead): + curr_idx = len(prompt_tokens) + generated + i + next_token = all_tokens[curr_idx] + if verbose and target_preds[curr_idx - 1].item() != next_token: + if tokenizer is not None: + target_pred_str = tokenizer.decode(target_preds[curr_idx - 1]) + next_token_str = tokenizer.decode(next_token) + print(f"[SIMULATION] Target prediction `{target_pred_str}` != next token `{next_token_str}` at index {curr_idx}") + else: + print(f"[SIMULATION] Target prediction {target_preds[curr_idx].item()} != next token {next_token} at index {curr_idx}") + + speculated_token = speculation_preds[i].item() + if speculated_token != next_token: + num_accepted = i + break + + acceptance_lengths.append(num_accepted) + ### END CHECK HOW MANY TOKENS ARE ACCEPTED ### + + ### DETERMINE IF THERE IS A CACHE HIT IN THE NEXT ROUND ### + next_token = all_tokens[len(prompt_tokens) + generated + num_accepted] + speculated_token = speculation_preds[num_accepted].item() + draft_logits = speculation_logits[num_accepted].clone() + if num_accepted != lookahead: + draft_logits[speculated_token] = float("-inf") + cache_hit = int(next_token in draft_logits.topk(k=fan_out).indices) + cache_hits.append(cache_hit) + ### END DETERMINE IF THERE IS A CACHE HIT IN THE NEXT ROUND ### + + ### MEASURE PROBABILITY DISTRIBUTION GAPS (DRAFT VS TARGET) ### + curr_probability_gaps = [] + for i in range(lookahead): + curr_idx = len(prompt_tokens) + generated + i + draft_logits = speculation_logits[i] + target_logits = full_target_logits[curr_idx - 1] + draft_probs = torch.softmax(draft_logits, dim=-1) + target_probs = torch.softmax(target_logits, dim=-1) + gap = torch.linalg.norm(draft_probs - target_probs, ord=1).item() + if verbose and gap > 0.5: + prefix = all_tokens_tensor[0, :curr_idx] + decoded_prefix = tokenizer.decode(prefix) + print(f"[SIMULATION][{curr_idx}] Prefix: {decoded_prefix}") + draft_pred = draft_logits.argmax(dim=-1) + target_pred = target_logits.argmax(dim=-1) + draft_pred_str = tokenizer.decode(draft_pred) + target_pred_str = tokenizer.decode(target_pred) + print(f"[SIMULATION][{curr_idx}] |draft_probs - target_probs| = {gap:.4f}, Draft prediction `{draft_pred_str}`. Target prediction `{target_pred_str}`.") + curr_probability_gaps.append(gap) + + probability_gaps.append(curr_probability_gaps) + ### END MEASURE PROBABILITY DISTRIBUTION GAPS (DRAFT VS TARGET) ### + + generated += num_accepted + 1 + + acc_lengths_array = np.array(acceptance_lengths) + 1 + print(f"[SIMULATION] Acceptance lengths: {acc_lengths_array.tolist()}") + print(f"[SIMULATION] Average acceptance length: {acc_lengths_array.mean():.4f}") + print(f"[SIMULATION] Probability gaps: {probability_gaps}") + print(f"[SIMULATION] Average probability gap: {np.array(probability_gaps).mean():.4f}") + if backup != "force-jit": + print(f"[SIMULATION] Cache hits: {cache_hits}") + print(f"[SIMULATION] Average cache hit: {np.array(cache_hits).mean():.4f}") + return acceptance_lengths, probability_gaps + + +def convert_to_full_vocab_logits(draft_model: Eagle3Model, draft_logits: torch.Tensor) -> torch.Tensor: + full_vocab_indices = torch.arange(draft_model.d2t.shape[0], device=draft_logits.device) + draft_model.d2t + full_vocab_logits = draft_logits.new_full((draft_logits.shape[0], draft_model.config.vocab_size), float("-inf")) + full_vocab_logits.index_copy_(-1, full_vocab_indices, draft_logits) + return full_vocab_logits + + +def compare_completion_to_hf_reference_eagle( + draft_model: Eagle3Model, + prefix: list[int], + speculation: list[int], + eagle_acts: torch.Tensor, + eagle_activation_index: int, # where to start forward passes from. + request_index: int, + extend_token_ids: list[torch.Tensor], + extend_counts: list[int], + extend_activations: list[torch.Tensor], + recovery_activations: list[torch.Tensor], + prompt_eagle_acts: torch.Tensor, + jit: bool, + engine_acts: torch.Tensor, + tokenizer: AutoTokenizer, + engine: str = "tgl", + funky: bool = False, + prefixes: list[list[int]] = None, + full_target_logits: torch.Tensor = None, + verbose: bool = False, +): + if funky and jit: + if request_index == 0: + eagle_activation_index = len(prefixes[0]) + else: + eagle_activation_index = len(prefixes[request_index - 1]) + + device = draft_model.device + dtype = draft_model.lm_head.weight.dtype + all_tokens = torch.tensor(prefix + speculation, device=device, dtype=torch.long) + eagle_acts = eagle_acts.to(device=device, dtype=dtype) + # eagle_acts = engine_acts.to(device=device, dtype=dtype) # WE ARE TESTING OUT ENGINE ACTS INSTEAD OF HF ACTS + all_eagle_acts_proj = draft_model.fc(eagle_acts) + + speculation_length = len(speculation) + target_eagle_acts = eagle_acts[:eagle_activation_index] + target_eagle_acts = draft_model.fc(target_eagle_acts) + + draft_eagle_acts = torch.zeros(all_tokens.shape[0] - eagle_activation_index, target_eagle_acts.shape[1], device=device, dtype=dtype) + joint_eagle_acts = torch.cat([target_eagle_acts, draft_eagle_acts], dim=0) + joint_eagle_acts[:eagle_activation_index] = target_eagle_acts + # First we do len(prefix) - eagle_activation_index steps of forward passes to catch up to the current speculation. + for i in range(len(prefix) - eagle_activation_index): + idx = eagle_activation_index + i + with torch.no_grad(): + if funky and idx == len(prefix) - 1: + joint_eagle_acts[idx] = all_eagle_acts_proj[idx] + else: + # teacher-force with the actual speculation tokens. + prenorm = draft_model.forward_with_cond(all_tokens[:idx], torch.arange(idx, device=device), joint_eagle_acts[:idx]) + joint_eagle_acts[idx] = prenorm[-1] + + # Now we do the remaining steps of forward passes to get the logits for the speculation. + for i in range(speculation_length): + idx = len(prefix) + i + with torch.no_grad(): + prenorm = draft_model.forward_with_cond(all_tokens[:idx], torch.arange(idx, device=device), joint_eagle_acts[:idx]) + joint_eagle_acts[idx] = prenorm[-1] + + post_norm_final_draft_acts = draft_model.norm(joint_eagle_acts[-speculation_length:]) + draft_logits = draft_model.lm_head(post_norm_final_draft_acts) + + # Scatter draft-vocab draft_logits into target-vocab space via d2t so argmax / + # indexing by the engine's target-vocab ids is well-defined. Non-draft + # positions stay -inf (the draft cannot produce those tokens). + draft_logits = convert_to_full_vocab_logits(draft_model, draft_logits) + + greedy_preds = draft_logits.argmax(dim=-1) + + # print(f"[{engine}] model moved to cuda", flush=True) + # hf_logits_for_speculation = get_hf_logits_for_speculation(model, all_tokens, speculation_length) + # print(f"[{engine}] hf draft_logits for speculation loaded", flush=True) + gaps = [] + for i in range(speculation_length): + speculation_token = speculation[i] + hf_logit = draft_logits[i, speculation_token] + hf_max_logit = draft_logits[i].max() + # print(f"[{engine}] hf logit {hf_logit}, hf max logit {hf_max_logit}, logit_norm {torch.norm(hf_logits_for_speculation[i])}") + gaps.append(torch.abs(hf_logit - hf_max_logit).item()) + + if verbose: + max_gap = max(gaps) + print("=============") + matching = tokenizer.decode(greedy_preds) == tokenizer.decode(speculation) + match_str = "YES" if matching else " NO" + prefix_str = tokenizer.decode(prefix) + print(f"[{engine}][{request_index}] prefix[-40:]: {prefix_str[-40:]}") + print(f"[{engine}][{request_index}][{match_str}] speculation (hf reference): {tokenizer.decode(greedy_preds)}") + print(f"[{engine}][{request_index}][{match_str}] speculation (engine - tgl): {tokenizer.decode(speculation)}") + print(f"[{engine}][{request_index}][{match_str}] max gap: {max_gap}, gaps: {gaps}") + # if max_gap > 0.0: + # pytest.set_trace() + return gaps + + + +def validate_request_and_response(request, response, request_num, eagle: bool = False): + assert request["cache_keys"].shape[0] == 1 + assert request["num_tokens"].shape[0] == 1 + cache_keys = request["cache_keys"][0] + num_accepted = cache_keys[1].item() + if request_num == 0: + assert num_accepted <= 0 + else: + assert num_accepted >= 0 + + if eagle: + assert request["extend_token_ids"].shape[0] == 1 + assert request["extend_counts"].shape[0] == 1 + assert request["extend_activations"].shape[0] == 1 + assert request["recovery_activations"].shape[0] == 1 + + assert response["cache_hits"].shape[0] == 1 + assert response["logits"].shape[0] == 1 + + +def compare_speculations_to_hf_reference( + trace_dir: Path, + target_model, + draft_model, + prompt_tokens: list[int], + completion_tokens: list[int], + eagle: bool = False, + backup: str = "force-jit", + tokenizer: AutoTokenizer = None, + engine: str = "tgl", + full_target_logits: torch.Tensor = None, + verbose: bool = False, +): + all_tokens = prompt_tokens + completion_tokens + prefill_request_files = list(trace_dir.glob("prefill_request_*.pt")) + speculation_request_files = list(sorted(trace_dir.glob("speculation_request_*.pt"))) + speculation_response_files = list(sorted(trace_dir.glob("speculation_response_*.pt"))) + assert len(prefill_request_files) == 1 + assert len(speculation_request_files) == len(speculation_response_files) + + prefill_request = torch.load(prefill_request_files[0]) + speculation_requests = [torch.load(f) for f in speculation_request_files] + speculation_responses = [torch.load(f) for f in speculation_response_files] + + if not eagle: + prompt_tokens_from_prefill_request = prefill_request["input_ids"].tolist() + assert prompt_tokens_from_prefill_request == prompt_tokens, f"{prompt_tokens_from_prefill_request=} != {prompt_tokens=}" + else: + hf_full_eagle_acts = get_hf_target_activations_for_eagle(target_model, all_tokens).to(draft_model.device) + hf_full_eagle_acts = torch.cat([ + hf_full_eagle_acts[:1], + hf_full_eagle_acts + ]) + prompt_eagle_acts = prefill_request["eagle_acts"].to(draft_model.device) + prompt_len = prompt_eagle_acts.shape[0] + if verbose: + print(f"[{engine}] hf prompt acts vs dumped eagle_acts: {torch.norm(prompt_eagle_acts - hf_full_eagle_acts[:prompt_len])}") + print(f"[{engine}] prompt acts: {prompt_eagle_acts[:5, :5]}") + print(f"[{engine}] full acts: {hf_full_eagle_acts[:5, :5]}") + # print(f"[{engine}] prompt eagle acts.shape: {prompt_eagle_acts.shape}") + # print(f"[{engine}] full eagle acts.shape: {full_eagle_acts.shape}") + + prefixes = [] + speculations = [] + num_accepted = [] + num_tokens = [] + cache_hits = [] + logits = [] + if eagle: + extend_token_ids = [] + extend_counts = [] + extend_activations = [] + extend_activations_accepted = [] + recovery_activations = [] + # TODO: Do this per request, by having a dictionary indexed by sequence ID. + for i in range(len(speculation_requests)): + request = speculation_requests[i] + response = speculation_responses[i] + validate_request_and_response(request, response, i, eagle) + + cache_keys = request["cache_keys"][0] + num_tokens.append(request["num_tokens"][0].item()) + num_accepted.append(cache_keys[1].item()) + rec_token = cache_keys[2].item() + if i == 0: + prefixes.append(prompt_tokens + [rec_token]) + else: + # Does the speculation contain the recovery token? I think it does? + prefixes.append(prefixes[-1] + speculations[-1][:num_accepted[-1]] + [rec_token]) + + if eagle: + extend_token_ids.append(request["extend_token_ids"][0]) + extend_counts.append(request["extend_counts"][0].item()) + extend_activations.append(request["extend_activations"][0]) + recovery_activations.append(request["recovery_activations"][0]) + if verbose: + print(f"[{engine}] extend_activations.shape: {extend_activations[-1].shape}") + + # TODO: It seems speculations is shape [lookahead] instead of [batch_size, lookahead]. Fix this? + speculations.append(response["speculations"].tolist()) + cache_hits.append(response["cache_hits"][0].item()) + logits.append(response["logits"][0].tolist()) + if verbose: + if tokenizer is not None: + prefix_text = tokenizer.decode(prefixes[-1]) + speculations_text = tokenizer.decode(speculations[-1]) + print(f"[{engine}] prefix text: {prefix_text}") + print(f"[{engine}] speculations text: {speculations_text}") + print(f"[{engine}] num accepted: {num_accepted[-1]}") + # print(f"[{engine}] num tokens: {num_tokens[-1]}") + print(f"[{engine}] rec token: {tokenizer.decode([rec_token])}") + else: + print(f"[{engine}] prefix: {prefixes[-1]}, speculation: {speculations[-1]}, num_accepted: {num_accepted[-1]}, num_tokens: {num_tokens[-1]}, rec_token: {rec_token}") + + prompt_len = len(prompt_tokens) + if eagle: + engine_acts = torch.zeros((len(all_tokens), 4096*3), dtype=draft_model.lm_head.weight.dtype, device="cpu") + engine_acts[:prompt_len] = prompt_eagle_acts.cpu() + t = prompt_len + for i in range(len(speculation_requests)): + num_accept = extend_counts[i] + if num_accept > 0: + engine_acts[t: t + num_accept] = extend_activations[i][:num_accept].cpu() + engine_acts[t + num_accept] = recovery_activations[i].cpu() + t += 1 + num_accept + if verbose: + print(f"FINAL OFFSET: {t}") + diffs = [ + (torch.norm(hf_full_eagle_acts[i].cpu() - engine_acts[i]) / torch.norm(hf_full_eagle_acts[i].cpu())).item() + for i in range(t) + ] + for i, diff in enumerate(diffs): + print(f"DIFF {i}: {diff:.4f}") + + print(f"[{engine}] eagle extend counts: {extend_counts}") + + # pytest.set_trace() + all_gaps = [] + if verbose: + print(f"[{engine}] prefix lengths: [{[len(p) for p in prefixes]}") + + for i in range(len(speculation_requests)): + # print(f"BANANA: CHECKING SPECULATION {i}, {cache_hits[i]=}, {backup=}") + prefix = prefixes[i] + speculation = speculations[i] + # num_accepted_i = num_accepted[i] + + + # jit: force_jit or (cache_miss and jit) + # random: fast and cache_miss + # delayed: cache_hit and not force_jit + if backup == "fast" and not cache_hits[i]: + continue + + if not eagle: + gaps, _ = compare_completion_to_hf_reference( + draft_model, + prefix, + speculation, + i, + tokenizer, + engine=engine, + full_target_logits=full_target_logits, + verbose=verbose, + ) + all_gaps.append(gaps) + else: + cache_hit = bool(cache_hits[i]) + jit = backup == "force-jit" or (not cache_hit and backup == "jit") + if jit: + eagle_activation_index = len(prefix) + else: + assert cache_hit and i > 0 + eagle_activation_index = len(prefixes[i-1]) + # if i > 0: + # assert len(prefixes[i-1]) + extend_counts[i-1] == len(prefix) + + gaps = compare_completion_to_hf_reference_eagle( + draft_model, + prefix, + speculation, + hf_full_eagle_acts, + eagle_activation_index, + i, + extend_token_ids, + extend_counts, + extend_activations, + recovery_activations, + prompt_eagle_acts, + jit, + engine_acts, + tokenizer, + engine=engine, + funky=False, + prefixes=prefixes, + full_target_logits=full_target_logits, + verbose=verbose, + ) + all_gaps.append(gaps) + + method_str = "eagle" if eagle else "standalone" + print(f" ****** SUMMARY OF ALL RESULTS (engine={engine}, method={method_str}, backup={backup}) ******") + + if eagle: + # extend counts don't include the recovery token, so we add 1 to the average. + print(f"[{engine},{method_str},{backup}][FINAL_METRIC] Average acceptance lengths: {1 + (sum(extend_counts) / (len(extend_counts) - 1)):.4f}") + print(f"[{engine},{method_str},{backup}] Full list of acceptance lengths: {extend_counts}") + else: + prefix_lengths = np.array([len(p) for p in prefixes]) + acceptance_lengths = prefix_lengths[1:] - prefix_lengths[:-1] + print(f"[{engine},{method_str},{backup}][FINAL_METRIC] Average acceptance lengths: {sum(acceptance_lengths) / len(acceptance_lengths):.4f}") + print(f"[{engine},{method_str},{backup}] Full list of acceptance lengths: {acceptance_lengths}") + + print(f"[{engine},{method_str},{backup}][FINAL_METRIC] Average cache hit rate: {sum(cache_hits) / len(cache_hits)}") + print(f"[{engine},{method_str},{backup}] Full list of cache hits: {cache_hits}") + + print(f"[{engine},{method_str},{backup}][FINAL_METRIC] Average gap: {np.array(all_gaps).mean():.4f}") + print(f"[{engine},{method_str},{backup}] Full list of gaps: {all_gaps}") + + max_gap = max(max(gaps) for gaps in all_gaps) + # assert max_gap < LOGIT_GAP_THRESHOLD, f"COMPARE SPECULATIONS TO HF REFERENCE: max gap {max_gap} exceeds threshold {LOGIT_GAP_THRESHOLD}, {all_gaps=}" + + +def get_hf_target_activations_for_eagle(target_model, all_tokens: list[int]) -> torch.Tensor: + with torch.no_grad(): + ids = torch.tensor([all_tokens], device=target_model.device, dtype=torch.long) + out = target_model(ids, output_hidden_states=True, use_cache=False) + acts = [out.hidden_states[li].squeeze(0).float() for li in EAGLE_LAYERS] + return torch.cat(acts, dim=-1).detach() # [N, 3*D] + + +def get_hf_logits(model, all_tokens: list[int]) -> torch.Tensor: + with torch.no_grad(): + output = model.forward(torch.tensor([all_tokens], device=model.device), use_cache=False) + return output.logits[0] + + +def get_hf_logits_for_completion(model, all_tokens: list[int], completion_length: int) -> torch.Tensor: + with torch.no_grad(): + output = model.forward(torch.tensor([all_tokens], device=model.device), use_cache=False) + return output.logits[0, -completion_length-1:-1] diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 000000000..4bf7f167b --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,14 @@ +[pytest] +addopts = --import-mode=importlib +markers = + tier0: CPU-only unit tests (no GPU, no model weights) + tier1: single-GPU E2E tests (8B target) + tier2: reserved for HF greedy reference (future) + tier3: reserved for SSD ↔ TGL fixture equivalence (future) + tier4: reserved for 70B TP=4 (future) + tier5: reserved for perf regression (future) + smoke: tiny subset of tier1 suitable for per-commit CI + +# Suppress HF deprecation noise in test output. +filterwarnings = + ignore:.*HF_HUB_ENABLE_HF_TRANSFER.*:DeprecationWarning diff --git a/tests/run_fast.sh b/tests/run_fast.sh new file mode 100755 index 000000000..ff7bd92a3 --- /dev/null +++ b/tests/run_fast.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Fast subset: Tier 0 + Tier 1 smoke. Designed to run in under ~2 minutes on +# a single H100. Intended for per-commit CI. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +cd "$REPO_ROOT" +source .venv/bin/activate + +pytest tests/unit tests/e2e -m "tier0 or smoke" -v "$@" diff --git a/tests/run_tier1.sh b/tests/run_tier1.sh new file mode 100755 index 000000000..6244b836d --- /dev/null +++ b/tests/run_tier1.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Full Tier 1 suite: all single-GPU E2E tests. Takes ~8-10 minutes on H100. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +cd "$REPO_ROOT" +source .venv/bin/activate + +pytest tests/unit tests/e2e -m "tier0 or tier1" -v "$@" diff --git a/tests/ssd_test_plan.md b/tests/ssd_test_plan.md new file mode 100644 index 000000000..5bc413d22 --- /dev/null +++ b/tests/ssd_test_plan.md @@ -0,0 +1,32 @@ +# Test plan for SSD (for both SSD and TGL repos) + +## System overview. +- We have implemented an LLM inference algorithm called SSD (speculative speculative decoding, described in this paper: https://arxiv.org/pdf/2603.03251), in two repositories: + - SSD (/work/avner/git/ssd): This is a self-contained implementation of the algorithm. + - TGL (/work/avner/git/tgl): This is an integration of the SSD algorithm into a private branch of the open-source inference engine SGLang. For the draft process, as well as communication between the draft and target processes, it imports code from the SSD repo. +- The high-level design of the algorithm is as follows: + - Instead of doing speculative decoding by alternating sequentially between the draft model speculating K tokens, and the target model verifying those tokens, this algorithm does speculation and verification asynchronously, on separate GPUs. + - It does so by letting the draft model predict what it believes to be the most likely outcomes of the ongoing verification (e.g., accept k tokens, reject the k+1 token, and sample token t instead), and then speculating in advance in parallel for each of these outcomes, while the verification is still ongoing. If the actual verification outcome is one that it had prepared for, it can immediately send the speculation for that outcome, which it had precomputed. + - It has two strategies for handling cases where the actual verification outcome is not in the set of outcomes the draft model had prepared for: (1) "JIT": Speculate "just in time" using the draft model (the target model will wait while the draft model is running, like in regular speculative decoding), (2) "Fast": Immediately return all zeros as the speculation. (We additionally implement "force-jit", which ALWAYS runs the draft model synchronously, to aid with debugging and sanity checking). +- We would like to create a thorough testbed for this algorithm (for now, can ). + +## Test plan design criteria +- The primary repos/branches we want to test are: + - The `avner/sglang-fa4` branch of the SSD repo (/work/avner/git/ssd) + - The `avner/ssd-port` branch of the TGL repo (/work/avner/git/tgl) + +The following are properties the SSD async speculation system should have: +- `--force-jit` performance (acceptance rates, which tokens accepted, etc) should be identical to synchronous speculative decoding performance, in both SSD repo (self-contained async spec implementation) and TGL repo (for both Eagle and standalone speculators). +- SSD behavior for a given setting (acceptance rates, which tokens accepted, cache hits vs misses, inputs/outputs, etc) should always match TGL behavior for the same setting (eagle vs standalone, and force-jit vs jit vs fast backup strategies). +- The behavior of the system (inputs/outputs, accept vs reject decisions, cache hits vs misses) should match that of a naive inefficient implementation of the algorithm (e.g., using huggingface). +- All of the above should hold true for Llama 8B with TP=1, and Llama 70B with TP=4, with both Eagle and Standalone speculators. +- The SSD performance (including speed in tokens per second) at branch `avner/sglang-fa4` should be similar to or better than the `avner/main2` branch. +- The SSD speed in the SSD repo should be similar to the SSD speed in the TGL repo. +- These tests should be as simple and efficient as possible, testing individual components whenever possible, and doing end-to-end testing whenever necessary. Perhaps there should be a fast subset of tests we can run frequently, and a slower but more thorough set of tests. +- There should be a test that simply benchmarks the algorithm, and stores the speeds of each important component in a structured format that it uses for visualization (creating plots to visualize the key results, similar to /work/avner/git/ssd/bench/extract_metrics.py), and ideally fails when there has been a regression in performance. +- The results of these tests should ideally be stored in a sub-folder of the ssd repo, and perhaps uploaded automatically to git for visualization/review. Perhaps git actions are a useful tool here, perhaps to run these tests automatically on every commit? + +## Other important details: +- Current benchmarking scripts for both the SSD and TGL repositories are at /work/avner/git/ssd/bench/bench.py and /work/avner/git/ssd/bench/run_sglang_bench.py. +- The python environments for the SSD and TGL repos are uv python environments at /work/avner/git/ssd/.venv and /work/avner/git/tgl/.venv. +- I have access to research-secure-29.cloud.together.ai and research-secure-30.cloud.together.ai for testing, and my username is 'avner'. \ No newline at end of file diff --git a/tests/ssd_test_plan_cc.md b/tests/ssd_test_plan_cc.md new file mode 100644 index 000000000..cf3bb678b --- /dev/null +++ b/tests/ssd_test_plan_cc.md @@ -0,0 +1,173 @@ +# SSD test plan (refined) + +This is a refinement of `ssd_test_plan.md`. The original plan correctly identifies the properties the SSD async-speculation system should have. This refinement makes those properties **operational** (i.e., testable with precise pass/fail criteria), organizes the tests into **tiers** with clear scope and runtime expectations, and identifies the fixture capture points needed for cross-repo (SSD ↔ TGL) equivalence testing. + +## Primary targets under test + +- SSD repo: `/work/avner/git/ssd`, branch `avner/sglang-fa4`. +- TGL repo: `/work/avner/git/tgl`, branch `avner/ssd-port`. + +All work on these targets is done via the sibling worktree `/work/avner/git/ssd-phnx` (branch `cc/sglang-fa4`) so that in-flight experiments on `avner/sglang-fa4` are not disturbed. + +## Key refinements over the original plan + +1. **"Identical" is split into two regimes.** + - *Greedy (temperature == 0)*: bitwise-identical token streams. This is the strict oracle. + - *Sampled (temperature > 0)*: distributional match — acceptance rate and cache-hit rate within a tolerance over N prompts, RNG-seed controlled. + Every equivalence claim below specifies which regime applies. + +2. **SSD-vs-TGL equivalence is framed at the component level, not end-to-end.** The two systems have different schedulers, different prefill ordering, and different tokenization edges; an end-to-end equivalence requirement would force scheduler changes that are out of scope. Instead, we capture fixtures from one repo and replay them in the other, checking that the algorithmic components (draft-tree contents given fixed inputs, accept-longest-prefix logic given fixed logits) agree exactly. + +3. **The HF "naive reference" is scoped narrowly.** HF does not natively do async-speculation, so we do **not** re-implement the async algorithm in HF. Instead: + - HF is used only as a **ground-truth greedy token oracle** for the target model. Target-greedy output of SSD/TGL must equal HF greedy output token-for-token on short prompts. + - The spec-algorithm invariants (accept-longest-prefix, ratio-accept with cache-hit gating, tree-mask shapes, etc.) are tested against a **small pure-python oracle** we write inline in the tests — no HF, no weights. + +4. **Tests are organized into tiers** based on hardware cost and runtime: + + | Tier | Hardware | Model | Typical runtime | What it covers | + |------|--------------------|---------|-----------------|---------------------------------------------------------------------------------| + | 0 | CPU-only | none | seconds | Pure logic: verify(), block manager, mask helpers, oracles | + | 1 | 1× H100 (or A100) | 8B | 1–5 min | E2E correctness w/ real weights, greedy equivalence between modes | + | 2 | 1× H100 | 8B | 5–15 min | HF greedy reference match on short prompts | + | 3 | 1× H100 | 8B | 1–5 min | Fixture-based SSD ↔ TGL component equivalence | + | 4 | 4× H100 | 70B | 15–60 min | Same invariants as tiers 1–3 at TP=4 | + | 5 | 1× or 4× H100 | 8B/70B | 10–30 min | Performance regression — JSON metrics, baseline comparison, plot generation | + + **Fast subset** (for per-commit CI) = Tier 0 + one smoke test from Tier 1. + **This PR implements Tiers 0 and 1.** Tiers 2–5 are tracked but not in scope. + +5. **"Identical across draft strategies" (force-jit / jit / fast)** is greedy-only. + In greedy mode the final token stream is independent of which tokens the draft proposed — the target's argmax always decides. So in greedy mode all three backup strategies must produce the same token stream; only *speed* and *acceptance rate* differ. In sampled mode they will not match token-for-token, and we do not require it. + +## Invariants (operationalized) + +Each invariant below specifies the precise equality used and the oracle it is checked against. + +### I1. `force-jit` ≡ synchronous speculative decoding (greedy) +- **What**: For temperature=0 and fixed prompt, running SSD with `--async --backup force-jit` produces the same token stream as running SSD with `--async=False` (sync spec) using the same speculator. +- **Why it should hold**: `force-jit` always runs the draft synchronously, so the only difference between it and sync spec is the process topology (separate process vs colocated), which must not affect outputs. +- **Tolerance**: Bitwise token match, over a set of canonical prompts. +- **Tier**: 1 (SSD side). TGL side is Tier 4 eventually. + +### I2. Greedy token stream independent of backup strategy +- **What**: For temperature=0, `force-jit`, `jit`, and `fast` produce the same output token stream for the same prompts. +- **Tolerance**: Bitwise token match. +- **Tier**: 1. + +### I3. CUDA-graph ≡ eager +- **What**: Greedy output with `enforce_eager=True` equals output with CUDA graphs enabled. +- **Tolerance**: Bitwise token match. +- **Tier**: 1. + +### I4. Batch independence +- **What**: Greedy output for a prompt is the same whether the prompt is run alone (batch=1) or in a batch at arbitrary position alongside other prompts. +- **Tolerance**: Bitwise token match for the prompt of interest. +- **Tier**: 1. + +### I5. Prefix-caching correctness +- **What**: Running a prompt with a shared prefix twice consecutively produces the same output, and the second run reports `num_cached_tokens > 0` for the shared prefix. +- **Tier**: 1. + +### I6. Preemption round-trip +- **What**: A sequence that gets preempted (blocks freed, moved back to waiting, re-prefilled) produces the same final output as a sequence that was never preempted. Forced by setting `max_num_seqs` and `num_kvcache_blocks` to a value that guarantees preemption. +- **Tier**: 1. + +### I7. Tree-cache invalidation +- **What** (unit): After a sequence's state rolls back (accepted a short suffix, recovery token set), the draft-side tree cache for that `(seq_id, keep_idx, recovery_token)` key must be reused if the same key appears; a different key must miss. +- **Tier**: 0 (tested with a pure-Python model of the tree cache). + +### I8. `verify()` correctness against a pure-Python oracle +- **What**: `ssd.utils.verify.verify` produces the expected `(accepted_suffixes, recovery_tokens)` on synthetic logits_p, logits_q, and speculations, for all branches: + - all-greedy (temp_p=0, temp_q=0) + - target-sampled, draft-greedy (temp_p>0, temp_q=0) + - both-sampled, cache hit (ratio acceptance) + - both-sampled, cache miss (fall back to greedy when `jit_speculate=False`) + - `jit_speculate=True` uses ratio acceptance regardless of cache hit +- **Tolerance**: Exact for greedy branches; probabilistic match on seed-controlled distribution for ratio branches. +- **Tier**: 0. + +### I9. Mask-helper equivalence and structure +- **What**: `get_custom_mask_cached` (B≤8 path) and `get_custom_mask_vectorized` (B>8 path) produce the **same flattened mask** for any given (context_lens, step, K, F, B, fan_out_list, fan_out_list_miss, cache_hits). Separately, the mask shape/semantics match a small reference implementation (`get_mask_iter_i`-style). +- **Tier**: 0. + +### I10. Block-manager semantics +- **What**: `BlockManager` allocate/deallocate/may_append correctly: + - refcount goes to zero → block returns to free pool. + - shared prefix → `hash_to_block_id` reuse; `num_cached_tokens` reflects reuse. + - incomplete last block has `hash == -1` and is never put into `hash_to_block_id`. + - `can_allocate` / `can_append` return false when the pool is empty. + - draft and target managers are independent. +- **Tier**: 0. + +### I11. Handshake pack/unpack round-trip +- **What**: `TargetDraftHandshake.send_request` / `receive_response` tensor shapes and semantics are invertible. We pack a known set of inputs, simulate "wire transfer" by copying to CPU and back, and check that the receiver observes the same values. +- **Tier**: 0 (simulated; no NCCL). + +### I12. SSD ↔ TGL fixture-based equivalence +- **What**: Captured inputs `(cache keys, seqs metadata, block tables, target hidden states)` fed into the SSD draft-tree builder produce the same tree as when fed into TGL's draft-tree builder. Captured `(logits_p, logits_q, speculations)` fed into SSD's `verify()` produce the same accept-count and recovery-token decision as TGL's equivalent. +- **Tier**: 3 (out of scope for this PR; we add the fixture-capture hook so the fixture set can be collected when we get to it). + +### I13. HF target greedy match +- **What**: `LLM(target_only=True).generate(prompt, temperature=0)` output tokens equal `AutoModelForCausalLM.generate(..., do_sample=False)` output tokens on a small set of short prompts. +- **Tier**: 2 (out of scope for this PR). + +### I14. Performance regression +- **What**: For a canonical benchmark config (dataset, batch size, input/output lengths), measured `tokens_per_sec` and per-component `ms` metrics do not regress by more than a threshold (default 5%) vs. a checked-in baseline JSON. +- **Tier**: 5 (out of scope for this PR). + +## This PR (Tiers 0 + 1) — concrete test list + +### Tier 0 (CPU-only, no model weights) + +Files under `tests/unit/`: + +- `test_verify.py` — invariant I8. Constructs synthetic logits and speculations, exercises each branch of `ssd.utils.verify.verify`, asserts accepted suffixes and recovery tokens against a pure-Python oracle. Uses fixed `torch.manual_seed` where sampling is involved. +- `test_block_manager.py` — invariant I10. Exercises allocate/deallocate/shared-prefix/may_append/refcount. Tests both `is_draft=False` and `is_draft=True`. +- `test_mask_helpers.py` — invariant I9. For a matrix of `(K, F, B, context_lens, step, fan_out_list, fan_out_list_miss, cache_hits)`, builds the mask via the cached path and the vectorized path and asserts they agree; also checks shape and causal structure against a reference built from `get_mask_iter_i` primitives. Uses CUDA if available; otherwise CPU. Tier 0 runs with CPU. +- `test_tree_cache_semantics.py` — invariant I7. Pure-Python model of the draft's `prev_fork_keys` / cache hit logic. Verifies key matching, rollback invalidation, collision behavior (same seq_id, different recovery_token → miss). +- `test_handshake_roundtrip.py` — invariant I11. Uses `TargetDraftHandshake`-shaped tensor buffers but substitutes NCCL send/recv with in-memory copies to exercise pack/unpack logic and shape contracts. + +### Tier 1 (1× H100, 8B, real weights, greedy) + +Files under `tests/e2e/`: + +- `test_sync_vs_force_jit.py` — I1. Two LLMs with same config, one sync-spec, one async+force-jit; same prompts, temp=0, assert equal token streams. +- `test_greedy_strategy_equivalence.py` — I2. `force-jit`, `jit`, `fast` all produce the same greedy output. Runs three configs in sequence (one LLM at a time to avoid OOM). +- `test_cudagraph_vs_eager.py` — I3. Same config with `enforce_eager=True` vs `False`, assert equal greedy output. +- `test_batch_independence.py` — I4. Prompt P run solo vs run at each position in a batch of N prompts; greedy output of P must match. +- `test_prefix_cache.py` — I5. Run a prompt with a long shared prefix twice; verify second run hits cache (`num_cached_tokens > 0` reported via METRICS) and produces identical output. +- `test_preemption.py` — I6. Configure KV pool such that preemption is guaranteed; verify final outputs equal those of an unpreempted run. + +All Tier 1 tests default to a short prompt set (≤5 prompts, ≤128 output tokens) so the whole tier finishes in a few minutes on a single H100. + +### Fast subset (per-commit) + +All of Tier 0 plus a single smoke test from Tier 1 (`test_sync_vs_force_jit.py::test_two_prompts_greedy`). + +Invocation (documented in `tests/README.md`): +``` +# fast +pytest tests/unit tests/e2e/test_sync_vs_force_jit.py::test_two_prompts_greedy -m "tier0 or smoke" +# full tier 0+1 +pytest tests/unit tests/e2e -m "tier0 or tier1" +``` + +## Out of scope for this PR (tracked) + +- Tier 2 (HF greedy reference). +- Tier 3 (SSD ↔ TGL fixture equivalence). The fixture format and capture hooks will be designed when we get here; they will live in `tests/fixtures/` and be produced by an opt-in flag in each repo's engine. +- Tier 4 (70B TP=4). Requires a 4-GPU host; same invariants as Tiers 1–3. +- Tier 5 (perf regression). Will reuse the output of `bench/extract_metrics.py` and add baseline JSON checked into `tests/perf_baselines/`. +- EAGLE-3 hidden-state specific tests (captured as a Tier 1 follow-up). +- VLM / non-Llama models / TP mismatch between draft and target — explicit non-goals. + +## Infrastructure + +- **Environments**: SSD tests use `/work/avner/git/ssd-phnx/.venv` (uv-managed). TGL tests (Tier 3+) will use `/work/avner/git/tgl/.venv`. +- **GPU selection**: pytest marker `tier1`/`tier4` auto-skips when `torch.cuda.device_count()` is insufficient. Tier 0 never uses CUDA. +- **Results storage**: Tier 5 metrics JSON lands under `tests/perf_results/.json` and plots under `tests/perf_results/plots/` (gitignored except for baselines). +- **CI** (proposal for future): GitHub Actions self-hosted runner with 1 H100 runs fast subset + Tier 1 per commit; nightly workflow runs Tiers 2, 3, 5; manual dispatch for Tier 4. + +## Open questions for the user + +- (none; aligned on scope: Tier 0 + Tier 1 this pass, fixture-based for SSD↔TGL later.) diff --git a/tests/test_attention_paths.py b/tests/test_attention_paths.py new file mode 100644 index 000000000..8bedf948e --- /dev/null +++ b/tests/test_attention_paths.py @@ -0,0 +1,388 @@ +"""Tests for all Attention code paths after migration from sgl_kernel to FA4. + +Covers: + 1. Prefill (contiguous Q/K/V with cu_seqlens) + 2. Verify/glue decode (paged KV cache with cu_seqlens_q) + 3. Single query decode (paged KV cache, 1 query per sequence) + 4. Tree decode is already covered in test_fa4_tree_decode.py +""" + +import pytest +import torch +from ssd.layers.attention import Attention +from ssd.utils.context import set_context, reset_context + + +DEVICE = "cuda" +DTYPE = torch.bfloat16 + + +@pytest.fixture(autouse=True) +def cleanup_context(): + yield + reset_context() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def make_attention( + num_heads=8, num_kv_heads=2, head_dim=128, + draft=False, speculate=False, draft_async=False, + F=1, K=1, +): + scale = head_dim ** -0.5 + return Attention( + num_heads=num_heads, head_dim=head_dim, scale=scale, + num_kv_heads=num_kv_heads, draft=draft, speculate=speculate, + draft_async=draft_async, use_eagle=False, F=F, K=K, + ) + + +def make_paged_kv_cache(num_pages, page_size, num_kv_heads, head_dim): + k_cache = torch.randn(num_pages, page_size, num_kv_heads, head_dim, dtype=DTYPE, device=DEVICE) + v_cache = torch.randn(num_pages, page_size, num_kv_heads, head_dim, dtype=DTYPE, device=DEVICE) + return k_cache, v_cache + + +def make_block_tables(batch_size, context_lens_list, page_size, max_pages_per_seq, page_offset=0): + block_tables = torch.zeros(batch_size, max_pages_per_seq, dtype=torch.int32, device=DEVICE) + for b in range(batch_size): + n_pages = (context_lens_list[b] + page_size - 1) // page_size + block_tables[b, :n_pages] = torch.arange(n_pages, dtype=torch.int32, device=DEVICE) + b * page_offset + return block_tables + + +# =========================================================================== +# 1. Prefill path +# =========================================================================== + +class TestPrefill: + """context.is_prefill=True, no paged KV cache (contiguous Q/K/V).""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(0) + self.num_heads = 8 + self.num_kv_heads = 2 + self.head_dim = 128 + self.hidden = self.num_heads * self.head_dim + self.kv_hidden = self.num_kv_heads * self.head_dim + + def _run(self, seq_lens): + attn = make_attention( + num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + ) + # No KV cache for prefill without paging + total_tokens = sum(seq_lens) + q = torch.randn(total_tokens, self.hidden, dtype=DTYPE, device=DEVICE) + k = torch.randn(total_tokens, self.kv_hidden, dtype=DTYPE, device=DEVICE) + v = torch.randn(total_tokens, self.kv_hidden, dtype=DTYPE, device=DEVICE) + + cu_seqlens = torch.zeros(len(seq_lens) + 1, dtype=torch.int32, device=DEVICE) + for i, sl in enumerate(seq_lens): + cu_seqlens[i + 1] = cu_seqlens[i] + sl + max_seqlen = max(seq_lens) + slot_mapping = torch.arange(total_tokens, dtype=torch.int32, device=DEVICE) + + set_context( + is_prefill=True, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + slot_mapping=slot_mapping, + ) + + with torch.inference_mode(): + out = attn(q, k, v) + return out + + def test_output_shape(self): + out = self._run([10, 15]) + assert out.shape == (25, self.hidden) + + def test_no_nan_inf(self): + out = self._run([10, 15]) + assert not torch.isnan(out).any(), "Output contains NaN" + assert not torch.isinf(out).any(), "Output contains Inf" + + def test_single_sequence(self): + out = self._run([20]) + assert out.shape == (20, self.hidden) + assert not torch.isnan(out).any() + + def test_different_seq_lens(self): + out = self._run([5, 30]) + out_seq0 = out[:5] + out_seq1 = out[5:] + assert not torch.allclose(out_seq0.mean(), out_seq1.mean()) + + def test_deterministic(self): + torch.manual_seed(0) + out1 = self._run([10, 15]) + torch.manual_seed(0) + out2 = self._run([10, 15]) + assert torch.allclose(out1, out2) + + +# =========================================================================== +# 2. Prefill with paged KV cache +# =========================================================================== + +class TestPrefillPaged: + """context.is_prefill=True with block_tables set (paged KV).""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(1) + self.num_heads = 8 + self.num_kv_heads = 2 + self.head_dim = 128 + self.hidden = self.num_heads * self.head_dim + self.kv_hidden = self.num_kv_heads * self.head_dim + self.page_size = 1 + self.num_pages = 200 + self.max_pages_per_seq = 50 + + def _run(self, seq_lens): + attn = make_attention( + num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + ) + k_cache, v_cache = make_paged_kv_cache( + self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, + ) + attn.k_cache = k_cache + attn.v_cache = v_cache + + total_tokens = sum(seq_lens) + q = torch.randn(total_tokens, self.hidden, dtype=DTYPE, device=DEVICE) + k = torch.randn(total_tokens, self.kv_hidden, dtype=DTYPE, device=DEVICE) + v = torch.randn(total_tokens, self.kv_hidden, dtype=DTYPE, device=DEVICE) + + cu_seqlens = torch.zeros(len(seq_lens) + 1, dtype=torch.int32, device=DEVICE) + for i, sl in enumerate(seq_lens): + cu_seqlens[i + 1] = cu_seqlens[i] + sl + max_seqlen = max(seq_lens) + + slot_mapping = torch.arange(total_tokens, dtype=torch.int32, device=DEVICE) + block_tables = make_block_tables( + len(seq_lens), seq_lens, self.page_size, self.max_pages_per_seq, page_offset=50, + ) + + set_context( + is_prefill=True, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + slot_mapping=slot_mapping, + block_tables=block_tables, + ) + + with torch.inference_mode(): + out = attn(q, k, v) + return out + + def test_output_shape(self): + out = self._run([10, 15]) + assert out.shape == (25, self.hidden) + + def test_no_nan_inf(self): + out = self._run([10, 15]) + assert not torch.isnan(out).any() + assert not torch.isinf(out).any() + + +# =========================================================================== +# 3. Verify/glue decode path +# =========================================================================== + +class TestVerifyGlueDecode: + """speculate=True, cu_seqlens_q is not None → verify_or_glue path.""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(2) + self.num_heads = 8 + self.num_kv_heads = 2 + self.head_dim = 128 + self.hidden = self.num_heads * self.head_dim + self.kv_hidden = self.num_kv_heads * self.head_dim + self.page_size = 1 + self.num_pages = 200 + self.max_pages_per_seq = 50 + self.max_model_len = 100 + + def _make_attn(self): + attn = make_attention( + num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, speculate=True, + ) + k_cache, v_cache = make_paged_kv_cache( + self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, + ) + attn.k_cache = k_cache + attn.v_cache = v_cache + attn.max_seqlen_k = self.max_model_len + return attn + + def _run(self, query_lens, context_lens_list): + """ + query_lens: list of query tokens per sequence (e.g. [K+1, K+1] for verify) + context_lens_list: list of KV context lengths per sequence + """ + attn = self._make_attn() + B = len(query_lens) + total_q = sum(query_lens) + q = torch.randn(total_q, self.hidden, dtype=DTYPE, device=DEVICE) + k = torch.randn(total_q, self.kv_hidden, dtype=DTYPE, device=DEVICE) + v = torch.randn(total_q, self.kv_hidden, dtype=DTYPE, device=DEVICE) + + cu_seqlens_q = torch.zeros(B + 1, dtype=torch.int32, device=DEVICE) + for i, ql in enumerate(query_lens): + cu_seqlens_q[i + 1] = cu_seqlens_q[i] + ql + max_seqlen_q = max(query_lens) + + context_lens = torch.tensor(context_lens_list, dtype=torch.int32, device=DEVICE) + slot_mapping = torch.arange(total_q, dtype=torch.int32, device=DEVICE) + block_tables = make_block_tables( + B, context_lens_list, self.page_size, self.max_pages_per_seq, page_offset=50, + ) + + set_context( + is_prefill=False, + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + slot_mapping=slot_mapping, + context_lens=context_lens, + block_tables=block_tables, + ) + + with torch.inference_mode(): + out = attn(q, k, v) + return out + + def test_output_shape(self): + # 2 sequences, each with K+1=4 query tokens, context 20 and 15 + out = self._run([4, 4], [20, 15]) + assert out.shape == (8, self.hidden) + + def test_no_nan_inf(self): + out = self._run([4, 4], [20, 15]) + assert not torch.isnan(out).any(), "Output contains NaN" + assert not torch.isinf(out).any(), "Output contains Inf" + + def test_single_sequence(self): + out = self._run([8], [30]) + assert out.shape == (8, self.hidden) + assert not torch.isnan(out).any() + + def test_variable_query_lens(self): + out = self._run([3, 6], [25, 10]) + assert out.shape == (9, self.hidden) + assert not torch.isnan(out).any() + + def test_deterministic(self): + torch.manual_seed(2) + out1 = self._run([4, 4], [20, 15]) + torch.manual_seed(2) + out2 = self._run([4, 4], [20, 15]) + assert torch.allclose(out1, out2) + + +# =========================================================================== +# 4. Single query decode path +# =========================================================================== + +class TestSingleQueryDecode: + """decode=True, not verify_or_glue, not tree_decode → single query decode.""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(3) + self.num_heads = 8 + self.num_kv_heads = 2 + self.head_dim = 128 + self.hidden = self.num_heads * self.head_dim + self.kv_hidden = self.num_kv_heads * self.head_dim + self.page_size = 1 + self.num_pages = 200 + self.max_pages_per_seq = 50 + self.max_model_len = 100 + + def _make_attn(self): + # speculate=False (or draft=False, draft_async=False) so we don't enter + # verify_or_glue or tree_decode + attn = make_attention( + num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, speculate=False, + ) + k_cache, v_cache = make_paged_kv_cache( + self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, + ) + attn.k_cache = k_cache + attn.v_cache = v_cache + attn.max_seqlen_k = self.max_model_len + return attn + + def _run(self, batch_size, context_lens_list): + attn = self._make_attn() + # Single query decode: 1 query token per sequence + total_q = batch_size + q = torch.randn(total_q, self.hidden, dtype=DTYPE, device=DEVICE) + k = torch.randn(total_q, self.kv_hidden, dtype=DTYPE, device=DEVICE) + v = torch.randn(total_q, self.kv_hidden, dtype=DTYPE, device=DEVICE) + + context_lens = torch.tensor(context_lens_list, dtype=torch.int32, device=DEVICE) + slot_mapping = torch.arange(total_q, dtype=torch.int32, device=DEVICE) + block_tables = make_block_tables( + batch_size, context_lens_list, self.page_size, self.max_pages_per_seq, page_offset=50, + ) + + set_context( + is_prefill=False, + cu_seqlens_q=None, # None → not verify_or_glue + slot_mapping=slot_mapping, + context_lens=context_lens, + block_tables=block_tables, + ) + + with torch.inference_mode(): + out = attn(q, k, v) + return out + + def test_output_shape(self): + out = self._run(2, [20, 15]) + assert out.shape == (2, self.hidden) + + def test_no_nan_inf(self): + out = self._run(2, [20, 15]) + assert not torch.isnan(out).any(), "Output contains NaN" + assert not torch.isinf(out).any(), "Output contains Inf" + + def test_single_sequence(self): + out = self._run(1, [30]) + assert out.shape == (1, self.hidden) + assert not torch.isnan(out).any() + + def test_large_batch(self): + B = 16 + ctx_lens = [5 + i * 2 for i in range(B)] # max = 5 + 15*2 = 35 < max_pages_per_seq + out = self._run(B, ctx_lens) + assert out.shape == (B, self.hidden) + assert not torch.isnan(out).any() + + def test_different_context_lens_produce_different_outputs(self): + out = self._run(2, [50, 5]) + assert not torch.allclose(out[0], out[1]) + + def test_deterministic(self): + torch.manual_seed(3) + out1 = self._run(2, [20, 15]) + torch.manual_seed(3) + out2 = self._run(2, [20, 15]) + assert torch.allclose(out1, out2) diff --git a/tests/test_fa4_tree_decode.py b/tests/test_fa4_tree_decode.py new file mode 100644 index 000000000..19102ad75 --- /dev/null +++ b/tests/test_fa4_tree_decode.py @@ -0,0 +1,201 @@ +"""Tests for FA4 flash_attn_varlen_func with paged KV cache (tree decode replacement).""" + +import pytest +import torch +from flash_attn.cute.interface import flash_attn_varlen_func as fa4_varlen_func +from ssd.layers.attention import Attention +from ssd.utils.context import set_context, reset_context + + +DEVICE = "cuda" +DTYPE = torch.bfloat16 + + +# --------------------------------------------------------------------------- +# FA4 varlen + page_table: basic correctness +# --------------------------------------------------------------------------- + +class TestFA4VarlenPageTable: + """Test flash_attn_varlen_func with page_table at various page sizes.""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(42) + self.B = 2 + self.MQ_LEN = 6 + self.num_heads = 4 + self.num_kv_heads = 2 + self.head_dim = 128 + self.num_pages = 200 + self.max_pages_per_seq = 20 + + def _run(self, page_size, kv_lens): + total_q = self.B * self.MQ_LEN + q = torch.randn(total_q, self.num_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + k_cache = torch.randn(self.num_pages, page_size, self.num_kv_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + v_cache = torch.randn(self.num_pages, page_size, self.num_kv_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + cu_seqlens_q = torch.arange(self.B + 1, dtype=torch.int32, device=DEVICE) * self.MQ_LEN + + page_table = torch.zeros(self.B, self.max_pages_per_seq, dtype=torch.int32, device=DEVICE) + for b in range(self.B): + n_pages = (kv_lens[b] + page_size - 1) // page_size + page_table[b, :n_pages] = torch.arange(n_pages, dtype=torch.int32, device=DEVICE) + b * 50 + + seqused_k = torch.tensor(kv_lens, dtype=torch.int32, device=DEVICE) + + out, lse = fa4_varlen_func( + q, k_cache, v_cache, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=None, + max_seqlen_q=self.MQ_LEN, + max_seqlen_k=max(kv_lens), + seqused_k=seqused_k, + page_table=page_table, + softmax_scale=self.head_dim ** -0.5, + causal=False, + ) + return out, lse + + @pytest.mark.parametrize("page_size", [1, 16, 128]) + def test_output_shape(self, page_size): + out, _ = self._run(page_size, kv_lens=[10, 5]) + assert out.shape == (self.B * self.MQ_LEN, self.num_heads, self.head_dim) + + @pytest.mark.parametrize("page_size", [1, 16, 128]) + def test_no_nan_inf(self, page_size): + out, _ = self._run(page_size, kv_lens=[10, 5]) + assert not torch.isnan(out).any(), "Output contains NaN" + assert not torch.isinf(out).any(), "Output contains Inf" + + @pytest.mark.parametrize("page_size", [1, 16, 128]) + def test_lse_returned_none_by_default(self, page_size): + _, lse = self._run(page_size, kv_lens=[10, 5]) + assert lse is None, "LSE should be None when return_lse=False (default)" + + def test_variable_kv_lengths(self): + """Sequences with very different KV lengths should both produce valid output.""" + self.max_pages_per_seq = 60 # accommodate kv_len=50 + out, _ = self._run(page_size=1, kv_lens=[50, 3]) + assert not torch.isnan(out).any() + # Check that the two sequences produce different outputs (they have different KV) + out_seq0 = out[:self.MQ_LEN] + out_seq1 = out[self.MQ_LEN:] + assert not torch.allclose(out_seq0, out_seq1), "Different KV should produce different outputs" + + def test_deterministic(self): + """Same inputs should produce same outputs.""" + out1, _ = self._run(page_size=1, kv_lens=[10, 5]) + torch.manual_seed(42) # reset seed to get same random inputs + out2, _ = self._run(page_size=1, kv_lens=[10, 5]) + assert torch.allclose(out1, out2), "Same inputs should produce identical outputs" + + def test_batch_size_1(self): + """Single-sequence batch should work.""" + self.B = 1 + out, _ = self._run(page_size=1, kv_lens=[10]) + assert out.shape == (self.MQ_LEN, self.num_heads, self.head_dim) + assert not torch.isnan(out).any() + + +# --------------------------------------------------------------------------- +# Attention layer integration: tree decode path +# --------------------------------------------------------------------------- + +class TestAttentionTreeDecode: + """Test the Attention module's tree_decode path end-to-end with FA4.""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(42) + self.num_heads = 8 + self.num_kv_heads = 2 + self.head_dim = 128 + self.scale = self.head_dim ** -0.5 + self.F_fan = 2 + self.K_spec = 2 + self.MQ_LEN = self.F_fan * (self.K_spec + 1) + self.page_size = 1 + self.num_pages = 200 + self.max_pages_per_seq = 50 + self.max_model_len = 50 + yield + reset_context() + + def _make_attn(self): + attn = Attention( + num_heads=self.num_heads, head_dim=self.head_dim, scale=self.scale, + num_kv_heads=self.num_kv_heads, draft=True, speculate=True, + draft_async=True, use_eagle=False, F=self.F_fan, K=self.K_spec, + ) + attn.k_cache = torch.randn( + self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, + dtype=DTYPE, device=DEVICE) + attn.v_cache = torch.randn( + self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, + dtype=DTYPE, device=DEVICE) + attn.max_seqlen_k = self.max_model_len + return attn + + def _run(self, attn, B, context_lens_list): + total_tokens = B * self.MQ_LEN + q = torch.randn(total_tokens, self.num_heads * self.head_dim, dtype=DTYPE, device=DEVICE) + k = torch.randn(total_tokens, self.num_kv_heads * self.head_dim, dtype=DTYPE, device=DEVICE) + v = torch.randn(total_tokens, self.num_kv_heads * self.head_dim, dtype=DTYPE, device=DEVICE) + + context_lens = torch.tensor(context_lens_list, dtype=torch.int32, device=DEVICE) + slot_mapping = torch.arange(total_tokens, dtype=torch.int32, device=DEVICE) + + block_tables = torch.zeros(B, self.max_pages_per_seq, dtype=torch.int32, device=DEVICE) + for b in range(B): + n_pages = context_lens_list[b] # page_size=1, so pages == tokens + block_tables[b, :n_pages] = torch.arange(n_pages, dtype=torch.int32, device=DEVICE) + b * 50 + + cu_seqlens_q = torch.arange(B + 1, dtype=torch.int32, device=DEVICE) * self.MQ_LEN + + set_context( + is_prefill=False, + slot_mapping=slot_mapping, + context_lens=context_lens, + block_tables=block_tables, + tree_cu_seqlens_q=cu_seqlens_q, + ) + + with torch.inference_mode(): + out = attn(q, k, v) + return out + + def test_output_shape(self): + attn = self._make_attn() + out = self._run(attn, B=2, context_lens_list=[20, 15]) + expected = (2 * self.MQ_LEN, self.num_heads * self.head_dim) + assert out.shape == expected, f"Expected {expected}, got {out.shape}" + + def test_no_nan_inf(self): + attn = self._make_attn() + out = self._run(attn, B=2, context_lens_list=[20, 15]) + assert not torch.isnan(out).any(), "Output contains NaN" + assert not torch.isinf(out).any(), "Output contains Inf" + + def test_single_sequence(self): + attn = self._make_attn() + out = self._run(attn, B=1, context_lens_list=[30]) + expected = (self.MQ_LEN, self.num_heads * self.head_dim) + assert out.shape == expected + + def test_different_context_lens(self): + """Sequences with different context lengths should produce different outputs.""" + attn = self._make_attn() + out = self._run(attn, B=2, context_lens_list=[40, 10]) + out_seq0 = out[:self.MQ_LEN] + out_seq1 = out[self.MQ_LEN:] + assert not torch.allclose(out_seq0, out_seq1) + + def test_non_tree_decode_paths_unaffected(self): + """Verify that non-tree-decode paths still use the original kernels.""" + attn = Attention( + num_heads=self.num_heads, head_dim=self.head_dim, scale=self.scale, + num_kv_heads=self.num_kv_heads, draft=False, speculate=False, + draft_async=False, use_eagle=False, + ) + # This attention module should NOT take the tree_decode path + assert not (attn.speculate and attn.draft and attn.draft_async) diff --git a/tests/test_score_mod_basic.py b/tests/test_score_mod_basic.py new file mode 100644 index 000000000..e7ea7cdfe --- /dev/null +++ b/tests/test_score_mod_basic.py @@ -0,0 +1,155 @@ +"""Test that score_mod with aux_tensors works with FA4 varlen + page_table.""" + +import torch +import pytest +from flash_attn.cute.interface import flash_attn_varlen_func +from ssd.layers.tree_mask import create_tree_score_mod, build_tree_mask_bias + +DEVICE = "cuda" +DTYPE = torch.bfloat16 + + +class TestScoreModBasic: + """Verify score_mod compiles and runs with FA4 varlen + page_table.""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(42) + self.B = 2 + self.MQ_LEN = 6 + self.num_heads = 4 + self.num_kv_heads = 2 + self.head_dim = 128 + self.num_pages = 200 + self.max_pages_per_seq = 50 + self.page_size = 1 + + def _make_inputs(self, kv_lens): + total_q = self.B * self.MQ_LEN + q = torch.randn(total_q, self.num_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + k_cache = torch.randn(self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + v_cache = torch.randn(self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + cu_seqlens_q = torch.arange(self.B + 1, dtype=torch.int32, device=DEVICE) * self.MQ_LEN + page_table = torch.zeros(self.B, self.max_pages_per_seq, dtype=torch.int32, device=DEVICE) + for b in range(self.B): + n = kv_lens[b] + page_table[b, :n] = torch.arange(n, dtype=torch.int32, device=DEVICE) + b * 50 + seqused_k = torch.tensor(kv_lens, dtype=torch.int32, device=DEVICE) + return q, k_cache, v_cache, cu_seqlens_q, page_table, seqused_k + + def test_zero_bias_matches_no_scoremod(self): + """A score_mod that adds zero should produce identical output.""" + kv_lens = [10, 5] + max_kv_stride = 50 + q, k, v, cu, pt, sk = self._make_inputs(kv_lens) + + out_base, _ = flash_attn_varlen_func( + q, k, v, cu_seqlens_q=cu, cu_seqlens_k=None, + max_seqlen_q=self.MQ_LEN, max_seqlen_k=max(kv_lens), + seqused_k=sk, page_table=pt, + softmax_scale=self.head_dim ** -0.5, causal=False, + ) + + score_mod = create_tree_score_mod(max_kv_stride) + # All-zero bias = no masking + bias = torch.zeros(self.B * self.MQ_LEN * max_kv_stride, dtype=torch.float32, device=DEVICE) + + out_mod, _ = flash_attn_varlen_func( + q, k, v, cu_seqlens_q=cu, cu_seqlens_k=None, + max_seqlen_q=self.MQ_LEN, max_seqlen_k=max(kv_lens), + seqused_k=sk, page_table=pt, + softmax_scale=self.head_dim ** -0.5, causal=False, + score_mod=score_mod, aux_tensors=[bias], + ) + + assert torch.allclose(out_base, out_mod, atol=1e-2), \ + f"Zero bias should match base, max diff: {(out_base - out_mod).abs().max().item()}" + + def test_full_mask_produces_uniform_attention(self): + """Masking all but one KV position should concentrate attention there.""" + kv_lens = [10, 5] + max_kv_stride = 50 + q, k, v, cu, pt, sk = self._make_inputs(kv_lens) + + score_mod = create_tree_score_mod(max_kv_stride) + # Mask everything except KV position 0 for all queries + bias = torch.full((self.B * self.MQ_LEN * max_kv_stride,), -1e6, dtype=torch.float32, device=DEVICE) + for b in range(self.B): + for qi in range(self.MQ_LEN): + flat_idx = (b * self.MQ_LEN + qi) * max_kv_stride + 0 # only attend to kv_idx=0 + bias[flat_idx] = 0.0 + + out, _ = flash_attn_varlen_func( + q, k, v, cu_seqlens_q=cu, cu_seqlens_k=None, + max_seqlen_q=self.MQ_LEN, max_seqlen_k=max(kv_lens), + seqused_k=sk, page_table=pt, + softmax_scale=self.head_dim ** -0.5, causal=False, + score_mod=score_mod, aux_tensors=[bias], + ) + + assert not torch.isnan(out).any(), "Masked output has NaN" + assert not torch.isinf(out).any(), "Masked output has Inf" + + +class TestTreeMaskBuild: + """Test build_tree_mask_bias produces correct mask structure.""" + + def test_prefix_unmasked(self): + """All prefix positions should have bias=0 (attend).""" + B, K, MQ_LEN = 1, 2, 6 + fol = [2, 2, 2] + context_lens = torch.tensor([20], dtype=torch.int32) # prefix = 20 - (1*6 + 3) = 11 + cache_hits = torch.tensor([1]) + max_kv_stride = 50 + + bias = build_tree_mask_bias( + context_lens, step=0, K=K, MQ_LEN=MQ_LEN, + fan_out_list=fol, fan_out_list_miss=fol, + cache_hits=cache_hits, max_kv_stride=max_kv_stride, + device="cpu", + ) + bias_2d = bias.reshape(MQ_LEN, max_kv_stride) + prefix_len = 20 - (1 * MQ_LEN + K + 1) + # All prefix columns should be 0.0 (unmasked) + assert (bias_2d[:, :prefix_len] == 0.0).all(), "Prefix should be unmasked" + + def test_masked_positions_negative(self): + """Positions beyond the valid KV should be masked (large negative).""" + B, K, MQ_LEN = 1, 2, 6 + fol = [2, 2, 2] + context_lens = torch.tensor([20], dtype=torch.int32) + cache_hits = torch.tensor([1]) + max_kv_stride = 50 + + bias = build_tree_mask_bias( + context_lens, step=0, K=K, MQ_LEN=MQ_LEN, + fan_out_list=fol, fan_out_list_miss=fol, + cache_hits=cache_hits, max_kv_stride=max_kv_stride, + device="cpu", + ) + bias_2d = bias.reshape(MQ_LEN, max_kv_stride) + # Beyond context_lens should be masked + assert (bias_2d[:, 20:] < -1e5).all(), "Beyond context_lens should be masked" + + def test_diagonal_pattern(self): + """At step 0, each query should attend to its own diagonal position.""" + B, K, MQ_LEN = 1, 2, 6 + fol = [2, 2, 2] + # context_lens at step 0 needs to be at least ttl_added = 1*MQ_LEN + K+1 = 9 + context_lens = torch.tensor([15], dtype=torch.int32) + cache_hits = torch.tensor([1]) + max_kv_stride = 50 + + bias = build_tree_mask_bias( + context_lens, step=0, K=K, MQ_LEN=MQ_LEN, + fan_out_list=fol, fan_out_list_miss=fol, + cache_hits=cache_hits, max_kv_stride=max_kv_stride, + device="cpu", + ) + bias_2d = bias.reshape(MQ_LEN, max_kv_stride) + prefix_len = 15 - (1 * MQ_LEN + K + 1) # = 6 + diag_start = prefix_len + K + 1 # = 9 + # At step 0, block 0: bias_2d[q, diag_start + q] should be 0.0 + for q in range(MQ_LEN): + col = diag_start + q + assert bias_2d[q, col].item() == 0.0, f"Diagonal at q={q}, col={col} should be unmasked" diff --git a/tests/test_tree_mask_correctness.py b/tests/test_tree_mask_correctness.py new file mode 100644 index 000000000..0f8750c50 --- /dev/null +++ b/tests/test_tree_mask_correctness.py @@ -0,0 +1,164 @@ +"""Correctness tests: verify FA4 tree mask matches the original flashinfer mask logic.""" + +import torch +import numpy as np +import pytest +from flash_attn.cute.interface import flash_attn_varlen_func +from ssd.layers.tree_mask import create_tree_score_mod, build_tree_mask_bias +from ssd.engine.helpers.mask_helpers import get_custom_mask + +DEVICE = "cuda" +DTYPE = torch.bfloat16 + + +class FakeConfig: + """Minimal config for get_custom_mask.""" + def __init__(self, K, fan_out_list, fan_out_list_miss, max_model_len): + self.speculate_k = K + self.fan_out_list = fan_out_list + self.fan_out_list_miss = fan_out_list_miss + self.max_model_len = max_model_len + + +class TestTreeMaskMatchesOriginal: + """Verify that build_tree_mask_bias produces masks equivalent to get_custom_mask.""" + + @pytest.fixture(autouse=True) + def setup(self): + self.K = 2 + self.F = 2 + self.fan_out_list = [2, 2, 2] # F=2, K+1=3 groups + self.fan_out_list_miss = [2, 2, 2] + self.MQ_LEN = sum(self.fan_out_list) # = 6 + + def _compare_masks(self, B, context_lens_list, step, cache_hits_list): + """Compare old (get_custom_mask) vs new (build_tree_mask_bias) for one step.""" + context_lens = torch.tensor(context_lens_list, dtype=torch.int32, device=DEVICE) + cache_hits = torch.tensor(cache_hits_list, dtype=torch.float32, device=DEVICE) + max_model_len = 100 + + config = FakeConfig(self.K, self.fan_out_list, self.fan_out_list_miss, max_model_len) + + # Old mask: 1D bool tensor, concatenation of per-seq (MQ_LEN x kv_len) masks + old_mask = get_custom_mask( + config, context_lens, step, self.K, self.F, B, + device=DEVICE, cache_hits=cache_hits, + ) + + # New mask bias: (B * MQ_LEN * max_model_len,) float32 + new_bias = build_tree_mask_bias( + context_lens, step=step, K=self.K, MQ_LEN=self.MQ_LEN, + fan_out_list=self.fan_out_list, + fan_out_list_miss=self.fan_out_list_miss, + cache_hits=cache_hits, + max_kv_stride=max_model_len, + device=DEVICE, + ) + new_bias_2d = new_bias.reshape(B * self.MQ_LEN, max_model_len) + + # Extract per-batch masks from old format and compare + old_offset = 0 + for b in range(B): + kv_len = context_lens_list[b] + old_mask_b = old_mask[old_offset:old_offset + self.MQ_LEN * kv_len].reshape(self.MQ_LEN, kv_len) + new_mask_b = new_bias_2d[b * self.MQ_LEN:(b + 1) * self.MQ_LEN, :kv_len] + + # Old: True = attend, False = mask + # New: 0.0 = attend, -1e6 = mask + new_attend = (new_mask_b == 0.0) + old_attend = old_mask_b.bool() + + mismatches = (new_attend != old_attend).sum().item() + assert mismatches == 0, ( + f"Mask mismatch at batch={b}, step={step}: {mismatches} positions differ\n" + f" old attend count: {old_attend.sum().item()}, new attend count: {new_attend.sum().item()}\n" + f" context_len={kv_len}, cache_hit={cache_hits_list[b]}" + ) + old_offset += self.MQ_LEN * kv_len + + @pytest.mark.parametrize("step", [0, 1]) + def test_single_seq_cache_hit(self, step): + # context_lens must be >= ttl_added = (step+1)*MQ_LEN + K+1 + cl = 30 + step * self.MQ_LEN + self._compare_masks(B=1, context_lens_list=[cl], step=step, cache_hits_list=[1]) + + @pytest.mark.parametrize("step", [0, 1]) + def test_single_seq_cache_miss(self, step): + cl = 30 + step * self.MQ_LEN + self._compare_masks(B=1, context_lens_list=[cl], step=step, cache_hits_list=[0]) + + @pytest.mark.parametrize("step", [0, 1]) + def test_multi_seq_mixed_hits(self, step): + base = 25 + step * self.MQ_LEN + self._compare_masks( + B=3, + context_lens_list=[base, base + 10, base + 5], + step=step, + cache_hits_list=[1, 0, 1], + ) + + def test_step_2(self): + cl = 40 + 2 * self.MQ_LEN + self._compare_masks(B=2, context_lens_list=[cl, cl - 5], step=2, cache_hits_list=[1, 0]) + + +class TestFA4WithTreeMask: + """End-to-end: verify FA4 attention with tree mask produces valid, masked output.""" + + @pytest.fixture(autouse=True) + def setup(self): + torch.manual_seed(42) + self.B = 2 + self.K = 2 + self.MQ_LEN = 6 + self.num_heads = 4 + self.num_kv_heads = 2 + self.head_dim = 128 + self.num_pages = 200 + self.page_size = 1 + self.max_pages_per_seq = 50 + self.max_kv_stride = 50 + self.fan_out_list = [2, 2, 2] + self.fan_out_list_miss = [2, 2, 2] + + def test_masked_vs_unmasked_differ(self): + """Masked attention should produce different output than unmasked.""" + kv_lens = [20, 15] + total_q = self.B * self.MQ_LEN + q = torch.randn(total_q, self.num_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + k = torch.randn(self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + v = torch.randn(self.num_pages, self.page_size, self.num_kv_heads, self.head_dim, dtype=DTYPE, device=DEVICE) + cu = torch.arange(self.B + 1, dtype=torch.int32, device=DEVICE) * self.MQ_LEN + pt = torch.zeros(self.B, self.max_pages_per_seq, dtype=torch.int32, device=DEVICE) + for b in range(self.B): + pt[b, :kv_lens[b]] = torch.arange(kv_lens[b], dtype=torch.int32, device=DEVICE) + b * 50 + sk = torch.tensor(kv_lens, dtype=torch.int32, device=DEVICE) + + # Unmasked (causal=False, no score_mod) + out_unmasked, _ = flash_attn_varlen_func( + q, k, v, cu_seqlens_q=cu, cu_seqlens_k=None, + max_seqlen_q=self.MQ_LEN, max_seqlen_k=max(kv_lens), + seqused_k=sk, page_table=pt, + softmax_scale=self.head_dim ** -0.5, causal=False, + ) + + # Masked + score_mod = create_tree_score_mod(self.max_kv_stride) + context_lens = torch.tensor(kv_lens, dtype=torch.int32) + cache_hits = torch.tensor([1, 1]) + mask_bias = build_tree_mask_bias( + context_lens, step=0, K=self.K, MQ_LEN=self.MQ_LEN, + fan_out_list=self.fan_out_list, fan_out_list_miss=self.fan_out_list_miss, + cache_hits=cache_hits, max_kv_stride=self.max_kv_stride, device=DEVICE, + ) + out_masked, _ = flash_attn_varlen_func( + q, k, v, cu_seqlens_q=cu, cu_seqlens_k=None, + max_seqlen_q=self.MQ_LEN, max_seqlen_k=max(kv_lens), + seqused_k=sk, page_table=pt, + softmax_scale=self.head_dim ** -0.5, causal=False, + score_mod=score_mod, aux_tensors=[mask_bias], + ) + + assert not torch.isnan(out_masked).any(), "Masked output has NaN" + assert not torch.allclose(out_masked, out_unmasked, atol=1e-2), \ + "Masked and unmasked should produce different outputs" diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/test_block_manager.py b/tests/unit/test_block_manager.py new file mode 100644 index 000000000..42f039cf4 --- /dev/null +++ b/tests/unit/test_block_manager.py @@ -0,0 +1,197 @@ +"""Tier 0 / I10: BlockManager semantics. + +Exercises allocate / deallocate / prefix caching / refcount / may_append / +draft-vs-target independence. +""" +from __future__ import annotations + +import pytest + +from ssd.engine.block_manager import Block, BlockManager +from ssd.engine.sequence import Sequence +from ssd.sampling_params import SamplingParams + +pytestmark = pytest.mark.tier0 + + +# Block_size is a class-var on Sequence set by the engine; we set it for tests. +BLOCK_SIZE = 4 + + +def _seq(token_ids: list[int]) -> Sequence: + Sequence.block_size = BLOCK_SIZE + return Sequence(token_ids, SamplingParams()) + + +def _fresh_bm(num_blocks: int = 16, is_draft: bool = False, max_model_len: int = 4096) -> BlockManager: + return BlockManager( + num_blocks=num_blocks, + block_size=BLOCK_SIZE, + is_draft=is_draft, + max_model_len=max_model_len, + ) + + +# --------------------------------------------------------------------------- +# Allocation invariants +# --------------------------------------------------------------------------- +class TestAllocate: + def test_allocate_fills_block_table(self): + bm = _fresh_bm() + s = _seq([1, 2, 3, 4, 5, 6, 7]) # 7 tokens → 2 blocks (one full, one partial) + assert s.num_blocks == 2 + bm.allocate(s) + assert len(s.block_table) == 2 + # Complete block finalized (hash set), incomplete block not finalized + b0 = bm.blocks[s.block_table[0]] + b1 = bm.blocks[s.block_table[1]] + assert b0.hash != -1 + assert b1.hash == -1 + assert b0.ref_count == 1 + assert b1.ref_count == 1 + assert b0.block_id not in bm.free_block_ids + assert b1.block_id not in bm.free_block_ids + + def test_shared_prefix_hits_cache(self): + """Second sequence with same first-block prefix reuses the same block.""" + bm = _fresh_bm() + s1 = _seq([10, 11, 12, 13, 14, 15, 16, 17]) # 2 full blocks + s2 = _seq([10, 11, 12, 13, 99, 98, 97]) # first block matches s1; second differs + bm.allocate(s1) + bm.allocate(s2) + + assert s1.block_table[0] == s2.block_table[0], "shared first block not reused" + assert s1.block_table[1] != s2.block_table[1], "different second block collided" + # cached_tokens reflects the reuse on s2 + assert s2.num_cached_tokens == BLOCK_SIZE + # Shared block has ref_count == 2 + assert bm.blocks[s1.block_table[0]].ref_count == 2 + + def test_incomplete_last_block_is_not_hashed(self): + bm = _fresh_bm() + s = _seq([1, 2, 3]) # less than a block + bm.allocate(s) + assert len(s.block_table) == 1 + assert bm.blocks[s.block_table[0]].hash == -1 + assert not any(h == bm.blocks[s.block_table[0]].hash for h in bm.hash_to_block_id) + + def test_can_allocate_respects_free_pool(self): + bm = _fresh_bm(num_blocks=2) + s_small = _seq([1, 2, 3]) # 1 block + s_big = _seq([1] * (BLOCK_SIZE * 3)) # 3 blocks + assert bm.can_allocate(s_small) is True + assert bm.can_allocate(s_big) is False + + +# --------------------------------------------------------------------------- +# Deallocation / refcount +# --------------------------------------------------------------------------- +class TestDeallocate: + def test_deallocate_returns_block_to_free_pool(self): + bm = _fresh_bm() + s = _seq([1, 2, 3, 4, 5]) + bm.allocate(s) + freed_ids = list(s.block_table) + free_before = len(bm.free_block_ids) + bm.deallocate(s) + assert s.block_table == [] + assert len(bm.free_block_ids) == free_before + len(freed_ids) + assert s.num_cached_tokens == 0 + for bid in freed_ids: + assert bm.blocks[bid].ref_count == 0 + + def test_shared_block_stays_until_refcount_zero(self): + bm = _fresh_bm() + s1 = _seq([1, 2, 3, 4, 5]) # 2 blocks, shares first with s2 + s2 = _seq([1, 2, 3, 4, 9]) + bm.allocate(s1) + bm.allocate(s2) + shared = s1.block_table[0] + assert bm.blocks[shared].ref_count == 2 + + bm.deallocate(s1) + assert bm.blocks[shared].ref_count == 1 + assert shared not in bm.free_block_ids # still held by s2 + + bm.deallocate(s2) + assert bm.blocks[shared].ref_count == 0 + assert shared in bm.free_block_ids + + def test_deallocate_removes_hash_mapping(self): + bm = _fresh_bm() + s = _seq([1, 2, 3, 4, 5, 6, 7, 8]) # 2 full blocks, both hashed + bm.allocate(s) + hashes = [bm.blocks[b].hash for b in s.block_table] + assert all(h in bm.hash_to_block_id for h in hashes) + bm.deallocate(s) + assert not any(h in bm.hash_to_block_id for h in hashes) + + +# --------------------------------------------------------------------------- +# may_append / lookahead +# --------------------------------------------------------------------------- +class TestMayAppend: + def test_may_append_allocates_more_blocks(self): + bm = _fresh_bm() + s = _seq([1, 2, 3]) # 1 block + bm.allocate(s) + # Simulate appending tokens so num_tokens grows + s.append_token(4) + s.append_token(5) # now 5 tokens → needs 2 blocks + assert s.num_blocks == 2 + bm.may_append(s, lookahead_num_tokens=0) + assert len(s.block_table) == 2 + + def test_can_append_respects_max_model_len(self): + bm = _fresh_bm(max_model_len=10) + s = _seq([1] * 9) + bm.allocate(s) + # lookahead that would push past max_model_len + assert bm.can_append(s, lookahead_num_tokens=2) is False + assert bm.can_append(s, lookahead_num_tokens=1) is True + + +# --------------------------------------------------------------------------- +# Draft-vs-target independence +# --------------------------------------------------------------------------- +class TestDraftTargetIndependence: + def test_draft_bm_uses_draft_block_table(self): + t_bm = _fresh_bm(is_draft=False) + d_bm = _fresh_bm(is_draft=True) + s = _seq([1, 2, 3, 4, 5]) + t_bm.allocate(s) + d_bm.allocate(s) + # Separate tables; can share ids because each bm has its own pool + assert s.block_table and s.draft_block_table + # Deallocating one does not affect the other bm's state + t_bm.deallocate(s) + assert s.block_table == [] + assert s.draft_block_table # untouched + d_bm.deallocate(s) + assert s.draft_block_table == [] + + +# --------------------------------------------------------------------------- +# Hash function sanity +# --------------------------------------------------------------------------- +def test_compute_hash_includes_prefix(): + h_no_prefix = BlockManager.compute_hash([1, 2, 3, 4]) + h_with_prefix = BlockManager.compute_hash([1, 2, 3, 4], prefix=999) + assert h_no_prefix != h_with_prefix + + +def test_compute_hash_is_deterministic(): + a = BlockManager.compute_hash([1, 2, 3, 4], prefix=5) + b = BlockManager.compute_hash([1, 2, 3, 4], prefix=5) + assert a == b + + +def test_block_reset_clears_state(): + b = Block(block_id=7) + b.ref_count = 3 + b.hash = 42 + b.token_ids = [1, 2, 3] + b.reset() + assert b.ref_count == 1 + assert b.hash == -1 + assert b.token_ids == [] diff --git a/tests/unit/test_handshake_roundtrip.py b/tests/unit/test_handshake_roundtrip.py new file mode 100644 index 000000000..d2e754269 --- /dev/null +++ b/tests/unit/test_handshake_roundtrip.py @@ -0,0 +1,210 @@ +"""Tier 0 / I11: handshake pack/unpack round-trip. + +The real handshake in SpeculationRequest.send / .receive uses `dist.send` / +`dist.recv` over NCCL. The packing logic (fuse payload into one int64 tensor) +and parsing logic (slice/view out of the fused tensor) are exercised here +without NCCL by copying the bytes between a "sender" tensor and a "receiver" +tensor in memory. + +If the pack/parse layouts ever diverge (e.g. dtype mismatch, offset drift, +forgetting to include a tensor), this test will fail immediately without +needing a multi-GPU setup. + +What the real send/receive does (paraphrased from helpers/runner_helpers.py): +- pack: torch.cat of [cache_keys, num_tokens, block_tables.to(int64), + temps.view(int32).to(int64), ...eagle bits] +- parse: slice by offsets based on metadata=[B, K, max_blocks, eagle_act_dim, vocab_size] +""" +from __future__ import annotations + +import pytest +import torch + +from ssd.engine.helpers.runner_helpers import concat_tensors_as_int64 + +pytestmark = pytest.mark.tier0 + + +# --------------------------------------------------------------------------- +# PrefillRequest: input_ids + num_tokens + draft_block_table all int64 +# --------------------------------------------------------------------------- +def test_prefill_request_roundtrip_no_eagle(): + B = 3 + max_blocks = 8 + num_tokens_list = [5, 7, 4] + total_new = sum(num_tokens_list) + + input_ids = torch.arange(total_new, dtype=torch.int64) + 1000 + num_tokens = torch.tensor(num_tokens_list, dtype=torch.int64) + draft_block_table = torch.arange(B * max_blocks, dtype=torch.int32).view(B, max_blocks) - 5 # some negatives = padding + + # pack (same order as PrefillRequest.send) + fused = concat_tensors_as_int64(input_ids, num_tokens, draft_block_table) + + # parse (same as PrefillRequest.receive) + metadata = torch.tensor([total_new, B, max_blocks, 0, 0], dtype=torch.int64) + total_new_r, B_r, max_blocks_r, use_eagle_r, eagle_act_dim_r = metadata.tolist() + assert (total_new_r, B_r, max_blocks_r, use_eagle_r, eagle_act_dim_r) == (total_new, B, max_blocks, 0, 0) + + fused_total = total_new_r + B_r + B_r * max_blocks_r + assert fused.numel() == fused_total + + off = 0 + got_input_ids = fused[off:off + total_new_r] + off += total_new_r + got_num_tokens = fused[off:off + B_r] + off += B_r + got_draft_bt = fused[off:off + B_r * max_blocks_r].view(B_r, max_blocks_r).to(torch.int32) + off += B_r * max_blocks_r + assert off == fused_total + + assert torch.equal(got_input_ids, input_ids) + assert torch.equal(got_num_tokens, num_tokens) + assert torch.equal(got_draft_bt, draft_block_table) + + +# --------------------------------------------------------------------------- +# SpeculationRequest: most complex packing (temps reinterpreted via int32 view) +# --------------------------------------------------------------------------- +def _pack_spec_request(cache_keys, num_tokens, block_tables, temps, eagle_bits=None): + """Replicates SpeculationRequest.send's pack step (without dist.send).""" + int64_parts = [ + cache_keys.reshape(-1), + num_tokens.reshape(-1), + block_tables.to(torch.int64).reshape(-1), + temps.view(torch.int32).to(torch.int64).reshape(-1), + ] + if eagle_bits is not None: + recovery_activations, extend_counts, extend_activations, extend_token_ids = eagle_bits + int64_parts.extend([ + recovery_activations.contiguous().reshape(-1).view(torch.int64), + extend_counts.reshape(-1), + extend_activations.contiguous().reshape(-1).view(torch.int64), + extend_token_ids.reshape(-1), + ]) + return torch.cat(int64_parts) + + +def _parse_spec_request(fused, B, K, max_blocks, eagle_act_dim, draft_dtype): + """Replicates SpeculationRequest.receive's parse step (without dist.recv).""" + eagle = eagle_act_dim > 0 + _dsz = torch.finfo(draft_dtype).bits // 8 if eagle else 0 + off = 0 + cache_keys = fused[off:off + 3 * B].view(B, 3) + off += 3 * B + num_tokens = fused[off:off + B].to(torch.int64) + off += B + block_tables = fused[off:off + B * max_blocks].view(B, max_blocks).to(torch.int32) + off += B * max_blocks + temps = fused[off:off + B].to(torch.int32).view(torch.float32) + off += B + if eagle: + n_rec = B * eagle_act_dim * _dsz // 8 + recovery_activations = fused[off:off + n_rec].view(draft_dtype).view(B, eagle_act_dim) + off += n_rec + extend_counts = fused[off:off + B] + off += B + n_ext = B * K * eagle_act_dim * _dsz // 8 + extend_activations = fused[off:off + n_ext].view(draft_dtype).view(B, K, eagle_act_dim) + off += n_ext + extend_token_ids = fused[off:off + B * K].view(B, K) + off += B * K + else: + recovery_activations = extend_counts = extend_activations = extend_token_ids = None + return { + "cache_keys": cache_keys, + "num_tokens": num_tokens, + "block_tables": block_tables, + "temps": temps, + "recovery_activations": recovery_activations, + "extend_counts": extend_counts, + "extend_activations": extend_activations, + "extend_token_ids": extend_token_ids, + "consumed": off, + } + + +def test_speculation_request_roundtrip_no_eagle(): + B, K, max_blocks = 4, 3, 8 + torch.manual_seed(0) + cache_keys = torch.tensor( + [[i, i * 2, 100 + i] for i in range(B)], dtype=torch.int64, + ) + num_tokens = torch.tensor([37, 42, 51, 29], dtype=torch.int64) + block_tables = (torch.arange(B * max_blocks, dtype=torch.int32).view(B, max_blocks) - 3) + temps = torch.tensor([0.0, 0.7, 1.0, 0.5], dtype=torch.float32) + + fused = _pack_spec_request(cache_keys, num_tokens, block_tables, temps) + got = _parse_spec_request(fused, B, K, max_blocks, eagle_act_dim=0, draft_dtype=torch.bfloat16) + + assert got["consumed"] == fused.numel() + assert torch.equal(got["cache_keys"], cache_keys) + assert torch.equal(got["num_tokens"], num_tokens) + assert torch.equal(got["block_tables"], block_tables) + # temps is reinterpreted through int32; value must be preserved + assert torch.equal(got["temps"], temps), f"{got['temps']} vs {temps}" + + +def test_speculation_request_roundtrip_with_eagle(): + """Eagle payload includes recovery_activations/extend_activations (bfloat16, bit-cast to int64).""" + B, K, max_blocks = 2, 2, 4 + eagle_act_dim = 16 + draft_dtype = torch.bfloat16 + torch.manual_seed(1) + + cache_keys = torch.tensor([[0, 0, 77], [1, 1, 88]], dtype=torch.int64) + num_tokens = torch.tensor([10, 20], dtype=torch.int64) + block_tables = torch.tensor([[0, 1, 2, -1], [3, 4, -1, -1]], dtype=torch.int32) + temps = torch.tensor([0.25, 0.75], dtype=torch.float32) + + recovery_activations = torch.randn(B, eagle_act_dim, dtype=torch.float32).to(draft_dtype) + extend_counts = torch.tensor([1, 2], dtype=torch.int64) + extend_activations = torch.randn(B, K, eagle_act_dim, dtype=torch.float32).to(draft_dtype) + extend_token_ids = torch.tensor([[42, 43], [44, 45]], dtype=torch.int64) + + fused = _pack_spec_request( + cache_keys, num_tokens, block_tables, temps, + eagle_bits=(recovery_activations, extend_counts, extend_activations, extend_token_ids), + ) + got = _parse_spec_request(fused, B, K, max_blocks, eagle_act_dim, draft_dtype) + + assert got["consumed"] == fused.numel() + assert torch.equal(got["cache_keys"], cache_keys) + assert torch.equal(got["num_tokens"], num_tokens) + assert torch.equal(got["block_tables"], block_tables) + assert torch.equal(got["temps"], temps) + assert torch.equal(got["recovery_activations"], recovery_activations) + assert torch.equal(got["extend_counts"], extend_counts) + assert torch.equal(got["extend_activations"], extend_activations) + assert torch.equal(got["extend_token_ids"], extend_token_ids) + + +def test_fused_payload_total_size_matches_formula(): + """Independent check: the fused-payload size formula used on the receive side + must equal the pack-side total for eagle=True. + """ + B, K, max_blocks, eagle_act_dim = 3, 4, 6, 32 + draft_dtype = torch.bfloat16 + _dsz = torch.finfo(draft_dtype).bits // 8 # = 2 for bf16 + + cache_keys = torch.zeros(B, 3, dtype=torch.int64) + num_tokens = torch.zeros(B, dtype=torch.int64) + block_tables = torch.zeros(B, max_blocks, dtype=torch.int32) + temps = torch.zeros(B, dtype=torch.float32) + recovery_activations = torch.zeros(B, eagle_act_dim, dtype=draft_dtype) + extend_counts = torch.zeros(B, dtype=torch.int64) + extend_activations = torch.zeros(B, K, eagle_act_dim, dtype=draft_dtype) + extend_token_ids = torch.zeros(B, K, dtype=torch.int64) + + fused = _pack_spec_request( + cache_keys, num_tokens, block_tables, temps, + eagle_bits=(recovery_activations, extend_counts, extend_activations, extend_token_ids), + ) + expected = ( + (3 * B) + B + (B * max_blocks) + B + + (B * eagle_act_dim * _dsz // 8) + + B + + (B * K * eagle_act_dim * _dsz // 8) + + (B * K) + ) + assert fused.numel() == expected, f"fused {fused.numel()} != expected {expected}" diff --git a/tests/unit/test_mask_helpers.py b/tests/unit/test_mask_helpers.py new file mode 100644 index 000000000..9e05a0660 --- /dev/null +++ b/tests/unit/test_mask_helpers.py @@ -0,0 +1,228 @@ +"""Tier 0 / I9: mask helpers equivalence and structure. + +The engine picks a different code path based on batch size: +- B <= 8: get_custom_mask_cached (precomputes components into a global cache) +- B > 8: get_custom_mask_vectorized (ragged concat; avoids per-batch loop) + +For every combination of (K, F, fan_out_list, fan_out_list_miss, cache_hits, +context_lens, step), both paths must produce the same flat bool tensor. These +tests also validate the structural contract (shape, causal layout). +""" +from __future__ import annotations + +from types import SimpleNamespace + +import pytest +import torch + +from ssd.engine.helpers import mask_helpers +from ssd.engine.helpers.mask_helpers import ( + get_custom_mask_cached, + get_custom_mask_vectorized, + get_mask_iter_i, +) + +pytestmark = pytest.mark.tier0 + + +def _cfg(fan_out_list, fan_out_list_miss, max_model_len=4096): + return SimpleNamespace( + fan_out_list=fan_out_list, + fan_out_list_miss=fan_out_list_miss, + max_model_len=max_model_len, + ) + + +def _reset_caches(): + """Mask helpers use module-level global caches — reset between tests to avoid cross-test contamination.""" + mask_helpers._mask_cache = { + "glue_and_rec_mask": None, + "diag_components": None, + "ones_tensor": None, + "cached_params": None, + } + mask_helpers._vec_cache = {} + + +# --------------------------------------------------------------------------- +# Cached vs vectorized equivalence +# --------------------------------------------------------------------------- +CONFIGS = [ + # (K, F, fan_out_list, fan_out_list_miss) + (2, 3, [1, 3, 3], [1, 3, 3]), + (2, 3, [1, 3, 3], [7, 0, 0]), + (3, 2, [2, 2, 2, 2], [8, 0, 0, 0]), + (1, 4, [1, 4], [1, 4]), +] + + +@pytest.mark.parametrize("K,F,fan_out_list,fan_out_list_miss", CONFIGS) +@pytest.mark.parametrize("B", [1, 3, 8, 9, 16]) +@pytest.mark.parametrize("step", [0, 1]) +def test_cached_equals_vectorized(K, F, fan_out_list, fan_out_list_miss, B, step): + _reset_caches() + device = torch.device("cpu") + MQ_LEN = sum(fan_out_list) + glue_added = K + 1 + tree_decode_added = (step + 1) * MQ_LEN + ttl_added = glue_added + tree_decode_added + # Context lens must satisfy prefix_len = context_len - ttl_added >= 0. + torch.manual_seed(B * 10 + step) + context_lens_cpu = torch.tensor( + [ttl_added + 3 + i * 2 for i in range(B)], dtype=torch.int64, device=device, + ) + cache_hits = torch.tensor([i % 2 for i in range(B)], dtype=torch.int64, device=device) + + cfg = _cfg(fan_out_list, fan_out_list_miss) + + mask_cached = get_custom_mask_cached( + cfg, context_lens_cpu, step, K, F, B, device, + fan_out_list=fan_out_list, fan_out_list_miss=fan_out_list_miss, cache_hits=cache_hits, + ) + mask_vec = get_custom_mask_vectorized( + cfg, context_lens_cpu, step, K, B, device, cache_hits, + ) + assert mask_cached.shape == mask_vec.shape, f"shapes differ: {mask_cached.shape} vs {mask_vec.shape}" + assert mask_cached.dtype == torch.bool + assert mask_vec.dtype == torch.bool + # Flat content must match bit-for-bit. + assert torch.equal(mask_cached, mask_vec), ( + f"cached and vectorized masks differ for K={K},F={F},B={B},step={step}," + f" fan_out_list={fan_out_list}, fan_out_list_miss={fan_out_list_miss}" + ) + + +# --------------------------------------------------------------------------- +# Structural contract: shape +# --------------------------------------------------------------------------- +@pytest.mark.parametrize("K,F,fan_out_list,fan_out_list_miss", CONFIGS) +@pytest.mark.parametrize("B", [1, 4, 12]) +def test_mask_total_length_matches_expected(K, F, fan_out_list, fan_out_list_miss, B): + _reset_caches() + device = torch.device("cpu") + MQ_LEN = sum(fan_out_list) + step = 0 + ttl_added = (step + 1) * MQ_LEN + (K + 1) + torch.manual_seed(42) + context_lens = torch.tensor( + [ttl_added + 5 + i for i in range(B)], dtype=torch.int64, device=device, + ) + cache_hits = torch.zeros(B, dtype=torch.int64, device=device) + cfg = _cfg(fan_out_list, fan_out_list_miss) + + mask = get_custom_mask_cached( + cfg, context_lens, step, K, F, B, device, + fan_out_list=fan_out_list, fan_out_list_miss=fan_out_list_miss, cache_hits=cache_hits, + ) + # Expected length: sum_b MQ_LEN * context_len[b] + expected_len = int((MQ_LEN * context_lens).sum().item()) + assert mask.numel() == expected_len, ( + f"mask length {mask.numel()} != expected {expected_len}" + ) + + +# --------------------------------------------------------------------------- +# Structural contract: cache-hit rows use fan_out_list, cache-miss rows use fan_out_list_miss +# --------------------------------------------------------------------------- +def test_hit_vs_miss_row_uses_correct_glue(): + """When fan_out_list != fan_out_list_miss, the glue block must differ by row.""" + _reset_caches() + device = torch.device("cpu") + K = 2 + F = 3 + fan_out_list = [1, 3, 3] # hit-path fan-out + fan_out_list_miss = [7, 0, 0] # miss-path fan-out + MQ_LEN = sum(fan_out_list) + assert MQ_LEN == sum(fan_out_list_miss) + step = 0 + ttl_added = (step + 1) * MQ_LEN + (K + 1) + B = 2 # one hit, one miss + context_lens = torch.tensor([ttl_added, ttl_added], dtype=torch.int64, device=device) + cache_hits = torch.tensor([1, 0], dtype=torch.int64, device=device) + cfg = _cfg(fan_out_list, fan_out_list_miss) + + mask = get_custom_mask_cached( + cfg, context_lens, step, K, F, B, device, + fan_out_list=fan_out_list, fan_out_list_miss=fan_out_list_miss, cache_hits=cache_hits, + ) + # prefix_len = 0 here, so the only content is [glue | diag]. + # glue block for a row has shape (MQ_LEN, K+1). + per_row_cols = K + 1 + (step + 1) * MQ_LEN + mask2d_hit = mask[:MQ_LEN * per_row_cols].view(MQ_LEN, per_row_cols) + mask2d_miss = mask[MQ_LEN * per_row_cols:].view(MQ_LEN, per_row_cols) + + glue_hit = mask2d_hit[:, :K + 1] + glue_miss = mask2d_miss[:, :K + 1] + # The two glue blocks must NOT be equal because fan_out_list differs from miss. + assert not torch.equal(glue_hit, glue_miss), ( + "glue blocks for hit and miss rows unexpectedly equal" + ) + + +# --------------------------------------------------------------------------- +# Reference check: with uniform fan_out_list and step=0, the mask layout must +# match a hand-built reference via get_mask_iter_i. +# --------------------------------------------------------------------------- +def test_mask_matches_reference_iter_i(): + """For uniform fan_out_list=[F]*(K+1), step=0, the per-row mask equals the + output of get_mask_iter_i(i=0, prefix_len, K, F) followed by flatten.""" + _reset_caches() + device = torch.device("cpu") + K, F = 2, 3 + fan_out_list = [F] * (K + 1) # uniform + cfg = _cfg(fan_out_list, fan_out_list) + MQ_LEN = F * (K + 1) + step = 0 + ttl_added = (step + 1) * MQ_LEN + (K + 1) + B = 2 + context_lens = torch.tensor([ttl_added + 5, ttl_added + 5], dtype=torch.int64, device=device) + cache_hits = torch.ones(B, dtype=torch.int64, device=device) + + mask_flat = get_custom_mask_cached( + cfg, context_lens, step, K, F, B, device, + fan_out_list=fan_out_list, fan_out_list_miss=fan_out_list, cache_hits=cache_hits, + ) + + # Reference: get_mask_iter_i returns [MQ_LEN, prefix_len + K+1 + (i+1)*MQ_LEN] + # (uniform F), matches our per-row layout exactly. + cols_per_row = int(context_lens[0].item()) + prefix_len = cols_per_row - ttl_added + ref_row = get_mask_iter_i(i=0, prefix_len=prefix_len, K=K, F=F).to(torch.bool) + assert ref_row.shape == (MQ_LEN, cols_per_row) + + got = mask_flat.view(B, MQ_LEN, cols_per_row) + for b in range(B): + assert torch.equal(got[b], ref_row), f"row {b} does not match reference" + + +# --------------------------------------------------------------------------- +# Structural contract: prefix is all-ones, diagonal section is identity-stacked +# --------------------------------------------------------------------------- +def test_prefix_is_all_ones_and_diag_is_identity(): + _reset_caches() + device = torch.device("cpu") + K, F = 1, 4 + fan_out_list = [1, 4] + cfg = _cfg(fan_out_list, fan_out_list) + MQ_LEN = sum(fan_out_list) # 5 + step = 2 + prefix_len = 6 + ttl_added = (step + 1) * MQ_LEN + (K + 1) + context_len = prefix_len + ttl_added + B = 1 + context_lens = torch.tensor([context_len], dtype=torch.int64, device=device) + cache_hits = torch.ones(B, dtype=torch.int64, device=device) + + flat = get_custom_mask_cached( + cfg, context_lens, step, K, F, B, device, + fan_out_list=fan_out_list, fan_out_list_miss=fan_out_list, cache_hits=cache_hits, + ) + m = flat.view(MQ_LEN, context_len) + # Prefix region is all True + assert torch.all(m[:, :prefix_len]) + # Each diagonal sub-block is an identity + diag_start = prefix_len + (K + 1) + eye = torch.eye(MQ_LEN, dtype=torch.bool) + for s in range(step + 1): + sub = m[:, diag_start + s * MQ_LEN: diag_start + (s + 1) * MQ_LEN] + assert torch.equal(sub, eye), f"diagonal sub-block at step {s} not identity" diff --git a/tests/unit/test_tree_cache_semantics.py b/tests/unit/test_tree_cache_semantics.py new file mode 100644 index 000000000..298e5f7d2 --- /dev/null +++ b/tests/unit/test_tree_cache_semantics.py @@ -0,0 +1,139 @@ +"""Tier 0 / I7: draft-side tree-cache lookup semantics. + +The draft runner stores a tensor of keys `[T, 3]` (seq_id, keep_idx, recovery_token) +and matches incoming `[B, 3]` request keys via broadcast-equality + all-rows. +On hit, it indexes into stored tokens/logits/activations. + +This test models that lookup in pure Python (replicating the logic from +`draft_runner.hit_cache`, lines ~242–246 on the cc/sglang-fa4 branch) and +verifies: +- all-match key → hit, index points at the first matching entry +- partial match (only seq_id agrees) → miss +- empty cache → miss for every request +- different recovery_token or keep_idx → miss + +Note: this intentionally does NOT import DraftRunner, because constructing one +requires a GPU, model weights, and an initialized process group. The matching +logic is simple and regressions in it would be equally captured by the small +model here. +""" +from __future__ import annotations + +import pytest +import torch + +pytestmark = pytest.mark.tier0 + + +def _lookup(request_keys: torch.Tensor, cache_keys: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Replicates the matcher in draft_runner.hit_cache. + + request_keys: [B, 3] int64 + cache_keys: [T, 3] int64 + Returns: + hits: [B] bool + idx: [B] int — index of first match per row, 0 when no match (mirrors torch.max on a zero mask) + """ + if cache_keys.numel() == 0: + return torch.zeros(request_keys.shape[0], dtype=torch.bool), torch.zeros( + request_keys.shape[0], dtype=torch.int64, + ) + eq = request_keys.unsqueeze(1) == cache_keys.unsqueeze(0) # [B, T, 3] + match = torch.all(eq, dim=2) # [B, T] + hits, idx = match.max(dim=1) + return hits, idx + + +class TestCacheLookup: + def test_empty_cache_is_all_miss(self): + cache = torch.empty(0, 3, dtype=torch.int64) + req = torch.tensor([[1, 0, 42], [2, 1, 7]], dtype=torch.int64) + hits, idx = _lookup(req, cache) + assert hits.tolist() == [False, False] + + def test_exact_match_hits(self): + cache = torch.tensor([ + [1, 0, 42], + [2, 1, 7], + [3, 2, 99], + ], dtype=torch.int64) + req = torch.tensor([[2, 1, 7]], dtype=torch.int64) + hits, idx = _lookup(req, cache) + assert hits.tolist() == [True] + assert idx.tolist() == [1] + + def test_different_recovery_token_misses(self): + cache = torch.tensor([[1, 0, 42]], dtype=torch.int64) + req = torch.tensor([[1, 0, 43]], dtype=torch.int64) # different rec token + hits, _idx = _lookup(req, cache) + assert hits.tolist() == [False] + + def test_different_keep_idx_misses(self): + cache = torch.tensor([[1, 0, 42]], dtype=torch.int64) + req = torch.tensor([[1, 1, 42]], dtype=torch.int64) # different keep_idx + hits, _idx = _lookup(req, cache) + assert hits.tolist() == [False] + + def test_different_seq_id_misses(self): + cache = torch.tensor([[1, 0, 42]], dtype=torch.int64) + req = torch.tensor([[2, 0, 42]], dtype=torch.int64) # different seq_id + hits, _idx = _lookup(req, cache) + assert hits.tolist() == [False] + + def test_first_match_wins_on_duplicates(self): + cache = torch.tensor([ + [1, 0, 42], + [1, 0, 42], # duplicate + ], dtype=torch.int64) + req = torch.tensor([[1, 0, 42]], dtype=torch.int64) + hits, idx = _lookup(req, cache) + assert hits.tolist() == [True] + assert idx.tolist() == [0] # first match + + def test_mixed_hit_miss_in_batch(self): + cache = torch.tensor([ + [1, 0, 42], + [2, 1, 7], + ], dtype=torch.int64) + req = torch.tensor([ + [1, 0, 42], # hit + [99, 99, 99], # miss + [2, 1, 7], # hit + ], dtype=torch.int64) + hits, idx = _lookup(req, cache) + assert hits.tolist() == [True, False, True] + assert idx.tolist()[0] == 0 + assert idx.tolist()[2] == 1 + + +class TestRollbackInvalidation: + """After a sequence rolls back, old cache entries for that seq_id+keep_idx+rec + combination should not be reachable from the new key. We model that by + evolving the state of a sequence across two steps and showing that the cache + entry from step 1 does not service step 2's key (because at least one of the + three components always changes across a real rollback). + """ + + def test_key_changes_after_rollback(self): + # Step 1: seq 7 has accepted_len=3, rec=111. Cache entry written with this key. + cache = torch.tensor([[7, 2, 111]], dtype=torch.int64) # keep_idx = accepted_len - 1 + + # Step 2 (the verifier rolled back to accepted_len=2 because only 1 token accepted + # after sampling rec=111): new accepted_len=2 -> keep_idx=1, new rec is resampled. + new_req = torch.tensor([[7, 1, 222]], dtype=torch.int64) + hits, _idx = _lookup(new_req, cache) + assert hits.tolist() == [False], "rollback should invalidate the prior cache key" + + +class TestCollisionSemantics: + """Different sequences writing keys that share components should not collide unless all three match.""" + + def test_same_rec_and_keep_different_seq_no_collision(self): + cache = torch.tensor([ + [1, 0, 42], + [2, 0, 42], + ], dtype=torch.int64) + req = torch.tensor([[1, 0, 42]], dtype=torch.int64) + hits, idx = _lookup(req, cache) + assert hits.tolist() == [True] + assert idx.tolist() == [0] diff --git a/tests/unit/test_verify.py b/tests/unit/test_verify.py new file mode 100644 index 000000000..3d3e62bab --- /dev/null +++ b/tests/unit/test_verify.py @@ -0,0 +1,282 @@ +"""Tier 0 / I8: correctness of ssd.utils.verify.verify across branches. + +Branches exercised: +- greedy only (temps_t=0, temps_q=0) +- target-sampled, draft-greedy (temp_t>0, temp_q=0) — goes through sampling branch +- both sampled, cache hit (ratio acceptance) +- both sampled, cache miss (falls back to greedy when jit_speculate=False) +- jit_speculate=True uses ratio acceptance regardless of cache_hits + +verify() lives in /work/avner/git/ssd-phnx/ssd/utils/verify.py and is pure +(tensors in, tensors out), so no GPU / no model weights are needed. +""" +from __future__ import annotations + +import pytest +import torch + +from ssd.utils.verify import verify + +pytestmark = pytest.mark.tier0 + + +# --------------------------------------------------------------------------- +# Oracle: pure-python re-implementation of the greedy-only branch. +# --------------------------------------------------------------------------- +def _greedy_oracle( + logits_p: torch.Tensor, + speculations: torch.Tensor, +) -> tuple[list[list[int]], list[int]]: + """Pure-python greedy verify, ignoring logits_q. + + accepted_suffix[b] = [starts[b]] + draft_tokens[b, :accept_count[b]] + accept_count is the number of leading draft tokens equal to the target's argmax. + recovery token is target argmax at position accept_count. + """ + B, Kp1, _V = logits_p.shape + K = Kp1 - 1 + starts = speculations[:, 0].tolist() + draft = speculations[:, 1:] + preds_p = logits_p.argmax(dim=-1) # [B, K+1] + + accepted_suffixes: list[list[int]] = [] + recovery: list[int] = [] + for b in range(B): + n = 0 + for j in range(K): + if int(draft[b, j].item()) == int(preds_p[b, j].item()): + n += 1 + else: + break + suffix = [starts[b]] + draft[b, :n].tolist() + accepted_suffixes.append(suffix) + recovery.append(int(preds_p[b, n].item())) + return accepted_suffixes, recovery + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- +def _peaked_logits(B: int, Kp1: int, V: int, token_ids: torch.Tensor, peak: float = 50.0) -> torch.Tensor: + """Build logits where token_ids[b, i] is the clear argmax on row (b, i).""" + assert token_ids.shape == (B, Kp1) + logits = torch.randn(B, Kp1, V) * 0.01 + logits.scatter_(2, token_ids.unsqueeze(-1), peak) + return logits + + +# --------------------------------------------------------------------------- +# Greedy tests +# --------------------------------------------------------------------------- +class TestGreedy: + """temp_t == 0, temp_q == 0: pure argmax compare.""" + + @pytest.mark.parametrize("K", [1, 3, 6]) + def test_all_accept(self, K): + """Draft matches target's argmax at every position → accept all K.""" + torch.manual_seed(0) + B, V = 4, 64 + # Target's argmax on each (b, i) — pick any legal vocab ids + target_argmax = torch.randint(0, V, (B, K + 1)) + logits_p = _peaked_logits(B, K + 1, V, target_argmax) + # Draft proposes exactly the same tokens as target argmax (offset by 1 — starts token takes index 0) + starts = torch.randint(0, V, (B,)) + speculations = torch.empty(B, K + 1, dtype=torch.int64) + speculations[:, 0] = starts + speculations[:, 1:] = target_argmax[:, :K] + + logits_q = torch.randn(B, K, V) # unused in greedy + temps_t = torch.zeros(B) + temps_q = torch.zeros(B) + + got = verify(logits_p, logits_q, speculations, temps_t, temps_q) + expect = _greedy_oracle(logits_p, speculations) + assert got == expect + # Each suffix is len K+1 (starts + K accepted) + for s in got[0]: + assert len(s) == K + 1 + + def test_first_mismatch_rejects_rest(self): + """If the draft mismatches at position j, we accept j and recovery = target argmax at j.""" + B, K, V = 2, 4, 32 + torch.manual_seed(1) + target_argmax = torch.tensor([ + [10, 11, 12, 13, 14], + [20, 21, 22, 23, 24], + ], dtype=torch.int64) + logits_p = _peaked_logits(B, K + 1, V, target_argmax) + + # Draft matches at j=0 and j=1 for seq 0 (so accept 2, recovery = 12), + # and matches at j=0 only for seq 1 (accept 1, recovery = 21). + speculations = torch.tensor([ + [99, 10, 11, 0, 0], # mismatch at j=2 (draft=0, target=12) + [88, 20, 999, 0, 0], # mismatch at j=1 (draft=999, target=21) + ], dtype=torch.int64) + + logits_q = torch.randn(B, K, V) + suffixes, recovery = verify(logits_p, logits_q, speculations, torch.zeros(B), torch.zeros(B)) + + assert suffixes[0] == [99, 10, 11] + assert suffixes[1] == [88, 20] + assert recovery[0] == 12 + assert recovery[1] == 21 + + def test_no_accepts(self): + """First draft token mismatches — accept 0, recovery = target argmax at 0.""" + B, K, V = 2, 3, 32 + target_argmax = torch.tensor([ + [5, 6, 7, 8], + [15, 16, 17, 18], + ], dtype=torch.int64) + logits_p = _peaked_logits(B, K + 1, V, target_argmax) + speculations = torch.tensor([ + [100, 999, 999, 999], + [200, 999, 999, 999], + ], dtype=torch.int64) + logits_q = torch.randn(B, K, V) + suffixes, recovery = verify(logits_p, logits_q, speculations, torch.zeros(B), torch.zeros(B)) + assert suffixes[0] == [100] # just the starts token + assert suffixes[1] == [200] + assert recovery == [5, 15] + + +# --------------------------------------------------------------------------- +# Sampled tests — target-sampled, draft-greedy (no ratio branch) +# --------------------------------------------------------------------------- +class TestTargetSampled: + """temp_t > 0, temp_q == 0, cache_hits=0, jit_speculate=False. + + Acceptance stays greedy (no ratio branch) because cache_hits are all 0 + and jit_speculate=False. But recovery is sampled from p. + """ + + def test_accept_decision_is_greedy_on_miss(self): + B, K, V = 3, 2, 16 + torch.manual_seed(42) + target_argmax = torch.tensor([ + [0, 1, 2], + [5, 6, 7], + [10, 11, 12], + ], dtype=torch.int64) + logits_p = _peaked_logits(B, K + 1, V, target_argmax) + # All matches → full accept regardless of sampling + speculations = torch.stack([ + torch.tensor([99, 0, 1]), + torch.tensor([99, 5, 6]), + torch.tensor([99, 10, 11]), + ]).to(torch.int64) + + logits_q = torch.randn(B, K, V) + temps_t = torch.tensor([1.0, 1.0, 0.0]) + temps_q = torch.zeros(B) + cache_hits = torch.zeros(B, dtype=torch.int64) # all misses + + # Run verify three times with different seeds; accept counts must be deterministic. + for seed in [0, 1, 2]: + torch.manual_seed(seed) + suffixes, _recovery = verify( + logits_p, logits_q, speculations, temps_t, temps_q, + cache_hits=cache_hits, jit_speculate=False, + ) + assert [len(s) for s in suffixes] == [K + 1, K + 1, K + 1] + + +# --------------------------------------------------------------------------- +# jit_speculate=True: ratio acceptance even when cache_hits are zero +# --------------------------------------------------------------------------- +class TestJitSpeculate: + """jit_speculate=True ignores cache_hits and takes the ratio path when any temp > 0.""" + + def test_ratio_branch_is_taken(self): + """With jit_speculate=True and temps>0 we exercise ratio acceptance code (probabilistic).""" + B, K, V = 2, 2, 8 + torch.manual_seed(7) + target_argmax = torch.tensor([ + [0, 1, 2], + [3, 4, 5], + ], dtype=torch.int64) + logits_p = _peaked_logits(B, K + 1, V, target_argmax, peak=5.0) # less peaked: some prob mass elsewhere + logits_q = _peaked_logits(B, K, V, target_argmax[:, :K], peak=5.0) + + speculations = torch.stack([ + torch.tensor([99, 0, 1]), + torch.tensor([99, 3, 4]), + ]).to(torch.int64) + + temps_t = torch.tensor([1.0, 1.0]) + temps_q = torch.tensor([1.0, 1.0]) + # Key: cache_hits=None + jit_speculate=True → ratio path is active. + torch.manual_seed(0) + suffixes, recovery = verify( + logits_p, logits_q, speculations, temps_t, temps_q, + cache_hits=None, jit_speculate=True, + ) + # Sanity: outputs have the right shapes and types (we don't assert exact equality + # since ratio acceptance samples). + assert len(suffixes) == B + assert len(recovery) == B + for s in suffixes: + assert 1 <= len(s) <= K + 1 + + +# --------------------------------------------------------------------------- +# Cache-hit gating: jit_speculate=False, some rows hit, some miss +# --------------------------------------------------------------------------- +class TestCacheHitGating: + """Mixed cache_hits with temps>0 and jit_speculate=False. + + Rows with hit=1 may go through ratio acceptance; rows with hit=0 stay greedy. + We test this by setting logits such that the greedy decision is a full accept + for miss rows, and verifying that miss rows always accept fully (irrespective + of RNG state), while hit rows' accept counts are equal to greedy in the + specific case where p and q agree (accept prob = 1). + """ + + def test_miss_rows_are_greedy_always(self): + B, K, V = 4, 3, 16 + torch.manual_seed(11) + # Target argmax per row + target_argmax = torch.tensor([ + [0, 1, 2, 3], + [4, 5, 6, 7], + [8, 9, 10, 11], + [12, 13, 14, 15], + ], dtype=torch.int64) + logits_p = _peaked_logits(B, K + 1, V, target_argmax, peak=50.0) + # q distribution identical to p for the first K positions → ratio=1 on hit rows + logits_q = _peaked_logits(B, K, V, target_argmax[:, :K], peak=50.0) + + speculations = torch.empty(B, K + 1, dtype=torch.int64) + speculations[:, 0] = torch.tensor([100, 200, 300, 400]) + speculations[:, 1:] = target_argmax[:, :K] # all proposals match argmax + + temps_t = torch.ones(B) + temps_q = torch.ones(B) + cache_hits = torch.tensor([1, 0, 1, 0], dtype=torch.int64) + + # With extremely peaked p and q matching p, ratio≈1 always and greedy-on-miss + # also accepts fully. So all four rows accept K. + for seed in [0, 1, 2, 3, 4]: + torch.manual_seed(seed) + suffixes, _rec = verify( + logits_p, logits_q, speculations, temps_t, temps_q, + cache_hits=cache_hits, jit_speculate=False, + ) + accept_counts = [len(s) - 1 for s in suffixes] + assert accept_counts == [K, K, K, K] + + +# --------------------------------------------------------------------------- +# Structural sanity: output shapes/types +# --------------------------------------------------------------------------- +def test_output_shapes_and_types(): + B, K, V = 2, 4, 32 + torch.manual_seed(0) + logits_p = torch.randn(B, K + 1, V) + logits_q = torch.randn(B, K, V) + speculations = torch.randint(0, V, (B, K + 1), dtype=torch.int64) + suffixes, recovery = verify(logits_p, logits_q, speculations, torch.zeros(B), torch.zeros(B)) + assert isinstance(suffixes, list) and len(suffixes) == B + assert all(isinstance(s, list) and len(s) >= 1 for s in suffixes) + assert isinstance(recovery, list) and len(recovery) == B + assert all(isinstance(r, int) for r in recovery) diff --git a/uv.lock b/uv.lock deleted file mode 100644 index 096d3a138..000000000 --- a/uv.lock +++ /dev/null @@ -1,1571 +0,0 @@ -version = 1 -revision = 3 -requires-python = ">=3.11, <3.13" -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform == 'linux'", - "python_full_version >= '3.12' and sys_platform == 'win32'", - "python_full_version >= '3.12' and sys_platform == 'emscripten'", - "python_full_version >= '3.12' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'", - "python_full_version < '3.12' and sys_platform == 'linux'", - "python_full_version < '3.12' and sys_platform == 'win32'", - "python_full_version < '3.12' and sys_platform == 'emscripten'", - "python_full_version < '3.12' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32'", -] - -[[package]] -name = "aiohappyeyeballs" -version = "2.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, -] - -[[package]] -name = "aiohttp" -version = "3.13.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohappyeyeballs" }, - { name = "aiosignal" }, - { name = "attrs" }, - { name = "frozenlist" }, - { name = "multidict" }, - { name = "propcache" }, - { name = "yarl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size = 7844556, upload-time = "2026-01-03T17:33:05.204Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/4c/a164164834f03924d9a29dc3acd9e7ee58f95857e0b467f6d04298594ebb/aiohttp-3.13.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b", size = 746051, upload-time = "2026-01-03T17:29:43.287Z" }, - { url = "https://files.pythonhosted.org/packages/82/71/d5c31390d18d4f58115037c432b7e0348c60f6f53b727cad33172144a112/aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64", size = 499234, upload-time = "2026-01-03T17:29:44.822Z" }, - { url = "https://files.pythonhosted.org/packages/0e/c9/741f8ac91e14b1d2e7100690425a5b2b919a87a5075406582991fb7de920/aiohttp-3.13.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea", size = 494979, upload-time = "2026-01-03T17:29:46.405Z" }, - { url = "https://files.pythonhosted.org/packages/75/b5/31d4d2e802dfd59f74ed47eba48869c1c21552c586d5e81a9d0d5c2ad640/aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a", size = 1748297, upload-time = "2026-01-03T17:29:48.083Z" }, - { url = "https://files.pythonhosted.org/packages/1a/3e/eefad0ad42959f226bb79664826883f2687d602a9ae2941a18e0484a74d3/aiohttp-3.13.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540", size = 1707172, upload-time = "2026-01-03T17:29:49.648Z" }, - { url = "https://files.pythonhosted.org/packages/c5/3a/54a64299fac2891c346cdcf2aa6803f994a2e4beeaf2e5a09dcc54acc842/aiohttp-3.13.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b", size = 1805405, upload-time = "2026-01-03T17:29:51.244Z" }, - { url = "https://files.pythonhosted.org/packages/6c/70/ddc1b7169cf64075e864f64595a14b147a895a868394a48f6a8031979038/aiohttp-3.13.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3", size = 1899449, upload-time = "2026-01-03T17:29:53.938Z" }, - { url = "https://files.pythonhosted.org/packages/a1/7e/6815aab7d3a56610891c76ef79095677b8b5be6646aaf00f69b221765021/aiohttp-3.13.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1", size = 1748444, upload-time = "2026-01-03T17:29:55.484Z" }, - { url = "https://files.pythonhosted.org/packages/6b/f2/073b145c4100da5511f457dc0f7558e99b2987cf72600d42b559db856fbc/aiohttp-3.13.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3", size = 1606038, upload-time = "2026-01-03T17:29:57.179Z" }, - { url = "https://files.pythonhosted.org/packages/0a/c1/778d011920cae03ae01424ec202c513dc69243cf2db303965615b81deeea/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440", size = 1724156, upload-time = "2026-01-03T17:29:58.914Z" }, - { url = "https://files.pythonhosted.org/packages/0e/cb/3419eabf4ec1e9ec6f242c32b689248365a1cf621891f6f0386632525494/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7", size = 1722340, upload-time = "2026-01-03T17:30:01.962Z" }, - { url = "https://files.pythonhosted.org/packages/7a/e5/76cf77bdbc435bf233c1f114edad39ed4177ccbfab7c329482b179cff4f4/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c", size = 1783041, upload-time = "2026-01-03T17:30:03.609Z" }, - { url = "https://files.pythonhosted.org/packages/9d/d4/dd1ca234c794fd29c057ce8c0566b8ef7fd6a51069de5f06fa84b9a1971c/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51", size = 1596024, upload-time = "2026-01-03T17:30:05.132Z" }, - { url = "https://files.pythonhosted.org/packages/55/58/4345b5f26661a6180afa686c473620c30a66afdf120ed3dd545bbc809e85/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4", size = 1804590, upload-time = "2026-01-03T17:30:07.135Z" }, - { url = "https://files.pythonhosted.org/packages/7b/06/05950619af6c2df7e0a431d889ba2813c9f0129cec76f663e547a5ad56f2/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29", size = 1740355, upload-time = "2026-01-03T17:30:09.083Z" }, - { url = "https://files.pythonhosted.org/packages/3e/80/958f16de79ba0422d7c1e284b2abd0c84bc03394fbe631d0a39ffa10e1eb/aiohttp-3.13.3-cp311-cp311-win32.whl", hash = "sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239", size = 433701, upload-time = "2026-01-03T17:30:10.869Z" }, - { url = "https://files.pythonhosted.org/packages/dc/f2/27cdf04c9851712d6c1b99df6821a6623c3c9e55956d4b1e318c337b5a48/aiohttp-3.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f", size = 457678, upload-time = "2026-01-03T17:30:12.719Z" }, - { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" }, - { url = "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" }, - { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" }, - { url = "https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" }, - { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" }, - { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 1871020, upload-time = "2026-01-03T17:30:26Z" }, - { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" }, - { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" }, - { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" }, - { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" }, - { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" }, - { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = "2026-01-03T17:30:36.864Z" }, - { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" }, - { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = "2026-01-03T17:30:41.081Z" }, - { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" }, - { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" }, -] - -[[package]] -name = "aiosignal" -version = "1.4.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "frozenlist" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, -] - -[[package]] -name = "annotated-types" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, -] - -[[package]] -name = "anyio" -version = "4.12.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "idna" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/96/f0/5eb65b2bb0d09ac6776f2eb54adee6abe8228ea05b20a5ad0e4945de8aac/anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703", size = 228685, upload-time = "2026-01-06T11:45:21.246Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, -] - -[[package]] -name = "apache-tvm-ffi" -version = "0.1.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/6f/60/1e787a0b5ebf318483235be2a689ee367173983067e441b8379564f667c0/apache_tvm_ffi-0.1.9.tar.gz", hash = "sha256:d2d402587e8906de0a07f4746aa78f3d452c7efe3625d4bb39ac2ad693bce530", size = 2513731, upload-time = "2026-02-27T19:28:06.602Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/44/130571cede8704b1412e48b3dd78de41b4d31b68241f954743d1a9925bd9/apache_tvm_ffi-0.1.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:932d94e29595a47109f0ef6e0b4209a934451582954ea8b426e758d6b3e307e3", size = 2070368, upload-time = "2026-02-27T19:27:13.779Z" }, - { url = "https://files.pythonhosted.org/packages/42/b1/9f2cfd6d49b03c5d4ec5c12548d911e2e01265be783f343103b4df716765/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c0449fc3802987c3652bea266ffda2934a6f69c80bba791a3f55b91040656a18", size = 2231154, upload-time = "2026-02-27T19:27:15.691Z" }, - { url = "https://files.pythonhosted.org/packages/55/43/63faedea83494e99122466a993bcdccd31cf93c7e8a0d56731120e82e2b9/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6f16d73a82a9e68a439b7d233d48b1b929be17fe92df4bbf1ee2274e573144a3", size = 2323130, upload-time = "2026-02-27T19:27:17.259Z" }, - { url = "https://files.pythonhosted.org/packages/27/96/d735bc4c528efaf0a8a954076963c727aad2dde8577641aa9025ec4f2d52/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01ebb1308b2666c206aa9a4015eb48f03a5d98ea2e9cfb002bd5e2ca0b9c7ef3", size = 2159854, upload-time = "2026-02-27T19:27:18.789Z" }, - { url = "https://files.pythonhosted.org/packages/e4/3b/6cfc82a3ab5d9e501bbcee5df36eebe09da1c384461d7a55e2a17776d117/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21365abd2a2a1a6d3b4e6e4f048309651125becfa795440c3607f3cc27d30ac7", size = 2307140, upload-time = "2026-02-27T19:27:20.222Z" }, - { url = "https://files.pythonhosted.org/packages/5f/61/3ffe1fe3190e12807a12b72ed0d291c7f66569c2e7c3571fde18175f19e1/apache_tvm_ffi-0.1.9-cp311-cp311-win_amd64.whl", hash = "sha256:9ee710a9fba3d9ff9747870bbd7e2175eb8d5b9c791f17fd645f35f6dab3f8aa", size = 1993218, upload-time = "2026-02-27T19:27:22.043Z" }, - { url = "https://files.pythonhosted.org/packages/df/f2/b8c4b151169f6d7ba8773c8af68b2e0c1013d7fb3f1bdf87573f47157ce9/apache_tvm_ffi-0.1.9-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:49e52350b0470654847de752e65603b604a4d3323e7e9f5e8a982f44acc4c143", size = 2041756, upload-time = "2026-02-27T19:27:23.931Z" }, - { url = "https://files.pythonhosted.org/packages/a7/c0/6d3d54f50012255b41bc3e24944c086f63c4707c8686c7c6780e9283eb96/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d503029e66c43b1a1cb1a42a1e9bb428c8a28dcbdec31c28e705472ca648a3a", size = 2203712, upload-time = "2026-02-27T19:27:25.867Z" }, - { url = "https://files.pythonhosted.org/packages/c6/dd/2bab4c6cd86257dbf99e93452a1af833113f8dc3e25a25579f6e4e4c8a94/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28241371934ea8af10d5067087ba1229ebddded7b2c02d33a258ec2a96df8c46", size = 2299704, upload-time = "2026-02-27T19:27:27.477Z" }, - { url = "https://files.pythonhosted.org/packages/7a/4a/b469bcb2e1014cb84d336d2a59f42958a058251c577a4c2680cacad346e2/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:87cacce81df55685fc6a76e1e3c5db1200e85e87bf5974b692c59d131b7bc622", size = 2130865, upload-time = "2026-02-27T19:27:29.092Z" }, - { url = "https://files.pythonhosted.org/packages/70/ef/5402da5d37f5270fd88ea0348acca78dba9be8bdbf6c2bcae0935eb03ef1/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f45eb43499acac45ff6c93564f0ff2d3ca27b69656d540fd56ce59d51c0b4c65", size = 2278991, upload-time = "2026-02-27T19:27:30.729Z" }, - { url = "https://files.pythonhosted.org/packages/b5/23/1b7dc5f0807f83098183a57db6ee85b2c93b646d74a6e03781c9208aaeb0/apache_tvm_ffi-0.1.9-cp312-abi3-win_amd64.whl", hash = "sha256:d1dcf4c041d5ec05e3da1d545800c33cdbb95c113baa7705085ff79fa262752b", size = 1973200, upload-time = "2026-02-27T19:27:32.367Z" }, -] - -[[package]] -name = "attrs" -version = "25.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, -] - -[[package]] -name = "certifi" -version = "2026.2.25" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/13/69/33ddede1939fdd074bce5434295f38fae7136463422fe4fd3e0e89b98062/charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a", size = 129418, upload-time = "2025-10-14T04:42:32.879Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ed/27/c6491ff4954e58a10f69ad90aca8a1b6fe9c5d3c6f380907af3c37435b59/charset_normalizer-3.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e1fcf0720908f200cd21aa4e6750a48ff6ce4afe7ff5a79a90d5ed8a08296f8", size = 206988, upload-time = "2025-10-14T04:40:33.79Z" }, - { url = "https://files.pythonhosted.org/packages/94/59/2e87300fe67ab820b5428580a53cad894272dbb97f38a7a814a2a1ac1011/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f819d5fe9234f9f82d75bdfa9aef3a3d72c4d24a6e57aeaebba32a704553aa0", size = 147324, upload-time = "2025-10-14T04:40:34.961Z" }, - { url = "https://files.pythonhosted.org/packages/07/fb/0cf61dc84b2b088391830f6274cb57c82e4da8bbc2efeac8c025edb88772/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a59cb51917aa591b1c4e6a43c132f0cdc3c76dbad6155df4e28ee626cc77a0a3", size = 142742, upload-time = "2025-10-14T04:40:36.105Z" }, - { url = "https://files.pythonhosted.org/packages/62/8b/171935adf2312cd745d290ed93cf16cf0dfe320863ab7cbeeae1dcd6535f/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8ef3c867360f88ac904fd3f5e1f902f13307af9052646963ee08ff4f131adafc", size = 160863, upload-time = "2025-10-14T04:40:37.188Z" }, - { url = "https://files.pythonhosted.org/packages/09/73/ad875b192bda14f2173bfc1bc9a55e009808484a4b256748d931b6948442/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d9e45d7faa48ee908174d8fe84854479ef838fc6a705c9315372eacbc2f02897", size = 157837, upload-time = "2025-10-14T04:40:38.435Z" }, - { url = "https://files.pythonhosted.org/packages/6d/fc/de9cce525b2c5b94b47c70a4b4fb19f871b24995c728e957ee68ab1671ea/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:840c25fb618a231545cbab0564a799f101b63b9901f2569faecd6b222ac72381", size = 151550, upload-time = "2025-10-14T04:40:40.053Z" }, - { url = "https://files.pythonhosted.org/packages/55/c2/43edd615fdfba8c6f2dfbd459b25a6b3b551f24ea21981e23fb768503ce1/charset_normalizer-3.4.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca5862d5b3928c4940729dacc329aa9102900382fea192fc5e52eb69d6093815", size = 149162, upload-time = "2025-10-14T04:40:41.163Z" }, - { url = "https://files.pythonhosted.org/packages/03/86/bde4ad8b4d0e9429a4e82c1e8f5c659993a9a863ad62c7df05cf7b678d75/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9c7f57c3d666a53421049053eaacdd14bbd0a528e2186fcb2e672effd053bb0", size = 150019, upload-time = "2025-10-14T04:40:42.276Z" }, - { url = "https://files.pythonhosted.org/packages/1f/86/a151eb2af293a7e7bac3a739b81072585ce36ccfb4493039f49f1d3cae8c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:277e970e750505ed74c832b4bf75dac7476262ee2a013f5574dd49075879e161", size = 143310, upload-time = "2025-10-14T04:40:43.439Z" }, - { url = "https://files.pythonhosted.org/packages/b5/fe/43dae6144a7e07b87478fdfc4dbe9efd5defb0e7ec29f5f58a55aeef7bf7/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:31fd66405eaf47bb62e8cd575dc621c56c668f27d46a61d975a249930dd5e2a4", size = 162022, upload-time = "2025-10-14T04:40:44.547Z" }, - { url = "https://files.pythonhosted.org/packages/80/e6/7aab83774f5d2bca81f42ac58d04caf44f0cc2b65fc6db2b3b2e8a05f3b3/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:0d3d8f15c07f86e9ff82319b3d9ef6f4bf907608f53fe9d92b28ea9ae3d1fd89", size = 149383, upload-time = "2025-10-14T04:40:46.018Z" }, - { url = "https://files.pythonhosted.org/packages/4f/e8/b289173b4edae05c0dde07f69f8db476a0b511eac556dfe0d6bda3c43384/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:9f7fcd74d410a36883701fafa2482a6af2ff5ba96b9a620e9e0721e28ead5569", size = 159098, upload-time = "2025-10-14T04:40:47.081Z" }, - { url = "https://files.pythonhosted.org/packages/d8/df/fe699727754cae3f8478493c7f45f777b17c3ef0600e28abfec8619eb49c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ebf3e58c7ec8a8bed6d66a75d7fb37b55e5015b03ceae72a8e7c74495551e224", size = 152991, upload-time = "2025-10-14T04:40:48.246Z" }, - { url = "https://files.pythonhosted.org/packages/1a/86/584869fe4ddb6ffa3bd9f491b87a01568797fb9bd8933f557dba9771beaf/charset_normalizer-3.4.4-cp311-cp311-win32.whl", hash = "sha256:eecbc200c7fd5ddb9a7f16c7decb07b566c29fa2161a16cf67b8d068bd21690a", size = 99456, upload-time = "2025-10-14T04:40:49.376Z" }, - { url = "https://files.pythonhosted.org/packages/65/f6/62fdd5feb60530f50f7e38b4f6a1d5203f4d16ff4f9f0952962c044e919a/charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:5ae497466c7901d54b639cf42d5b8c1b6a4fead55215500d2f486d34db48d016", size = 106978, upload-time = "2025-10-14T04:40:50.844Z" }, - { url = "https://files.pythonhosted.org/packages/7a/9d/0710916e6c82948b3be62d9d398cb4fcf4e97b56d6a6aeccd66c4b2f2bd5/charset_normalizer-3.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:65e2befcd84bc6f37095f5961e68a6f077bf44946771354a28ad434c2cce0ae1", size = 99969, upload-time = "2025-10-14T04:40:52.272Z" }, - { url = "https://files.pythonhosted.org/packages/f3/85/1637cd4af66fa687396e757dec650f28025f2a2f5a5531a3208dc0ec43f2/charset_normalizer-3.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0a98e6759f854bd25a58a73fa88833fba3b7c491169f86ce1180c948ab3fd394", size = 208425, upload-time = "2025-10-14T04:40:53.353Z" }, - { url = "https://files.pythonhosted.org/packages/9d/6a/04130023fef2a0d9c62d0bae2649b69f7b7d8d24ea5536feef50551029df/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b290ccc2a263e8d185130284f8501e3e36c5e02750fc6b6bdeb2e9e96f1e25", size = 148162, upload-time = "2025-10-14T04:40:54.558Z" }, - { url = "https://files.pythonhosted.org/packages/78/29/62328d79aa60da22c9e0b9a66539feae06ca0f5a4171ac4f7dc285b83688/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74bb723680f9f7a6234dcf67aea57e708ec1fbdf5699fb91dfd6f511b0a320ef", size = 144558, upload-time = "2025-10-14T04:40:55.677Z" }, - { url = "https://files.pythonhosted.org/packages/86/bb/b32194a4bf15b88403537c2e120b817c61cd4ecffa9b6876e941c3ee38fe/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f1e34719c6ed0b92f418c7c780480b26b5d9c50349e9a9af7d76bf757530350d", size = 161497, upload-time = "2025-10-14T04:40:57.217Z" }, - { url = "https://files.pythonhosted.org/packages/19/89/a54c82b253d5b9b111dc74aca196ba5ccfcca8242d0fb64146d4d3183ff1/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2437418e20515acec67d86e12bf70056a33abdacb5cb1655042f6538d6b085a8", size = 159240, upload-time = "2025-10-14T04:40:58.358Z" }, - { url = "https://files.pythonhosted.org/packages/c0/10/d20b513afe03acc89ec33948320a5544d31f21b05368436d580dec4e234d/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11d694519d7f29d6cd09f6ac70028dba10f92f6cdd059096db198c283794ac86", size = 153471, upload-time = "2025-10-14T04:40:59.468Z" }, - { url = "https://files.pythonhosted.org/packages/61/fa/fbf177b55bdd727010f9c0a3c49eefa1d10f960e5f09d1d887bf93c2e698/charset_normalizer-3.4.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ac1c4a689edcc530fc9d9aa11f5774b9e2f33f9a0c6a57864e90908f5208d30a", size = 150864, upload-time = "2025-10-14T04:41:00.623Z" }, - { url = "https://files.pythonhosted.org/packages/05/12/9fbc6a4d39c0198adeebbde20b619790e9236557ca59fc40e0e3cebe6f40/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:21d142cc6c0ec30d2efee5068ca36c128a30b0f2c53c1c07bd78cb6bc1d3be5f", size = 150647, upload-time = "2025-10-14T04:41:01.754Z" }, - { url = "https://files.pythonhosted.org/packages/ad/1f/6a9a593d52e3e8c5d2b167daf8c6b968808efb57ef4c210acb907c365bc4/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5dbe56a36425d26d6cfb40ce79c314a2e4dd6211d51d6d2191c00bed34f354cc", size = 145110, upload-time = "2025-10-14T04:41:03.231Z" }, - { url = "https://files.pythonhosted.org/packages/30/42/9a52c609e72471b0fc54386dc63c3781a387bb4fe61c20231a4ebcd58bdd/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5bfbb1b9acf3334612667b61bd3002196fe2a1eb4dd74d247e0f2a4d50ec9bbf", size = 162839, upload-time = "2025-10-14T04:41:04.715Z" }, - { url = "https://files.pythonhosted.org/packages/c4/5b/c0682bbf9f11597073052628ddd38344a3d673fda35a36773f7d19344b23/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:d055ec1e26e441f6187acf818b73564e6e6282709e9bcb5b63f5b23068356a15", size = 150667, upload-time = "2025-10-14T04:41:05.827Z" }, - { url = "https://files.pythonhosted.org/packages/e4/24/a41afeab6f990cf2daf6cb8c67419b63b48cf518e4f56022230840c9bfb2/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:af2d8c67d8e573d6de5bc30cdb27e9b95e49115cd9baad5ddbd1a6207aaa82a9", size = 160535, upload-time = "2025-10-14T04:41:06.938Z" }, - { url = "https://files.pythonhosted.org/packages/2a/e5/6a4ce77ed243c4a50a1fecca6aaaab419628c818a49434be428fe24c9957/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:780236ac706e66881f3b7f2f32dfe90507a09e67d1d454c762cf642e6e1586e0", size = 154816, upload-time = "2025-10-14T04:41:08.101Z" }, - { url = "https://files.pythonhosted.org/packages/a8/ef/89297262b8092b312d29cdb2517cb1237e51db8ecef2e9af5edbe7b683b1/charset_normalizer-3.4.4-cp312-cp312-win32.whl", hash = "sha256:5833d2c39d8896e4e19b689ffc198f08ea58116bee26dea51e362ecc7cd3ed26", size = 99694, upload-time = "2025-10-14T04:41:09.23Z" }, - { url = "https://files.pythonhosted.org/packages/3d/2d/1e5ed9dd3b3803994c155cd9aacb60c82c331bad84daf75bcb9c91b3295e/charset_normalizer-3.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:a79cfe37875f822425b89a82333404539ae63dbdddf97f84dcbc3d339aae9525", size = 107131, upload-time = "2025-10-14T04:41:10.467Z" }, - { url = "https://files.pythonhosted.org/packages/d0/d9/0ed4c7098a861482a7b6a95603edce4c0d9db2311af23da1fb2b75ec26fc/charset_normalizer-3.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:376bec83a63b8021bb5c8ea75e21c4ccb86e7e45ca4eb81146091b56599b80c3", size = 100390, upload-time = "2025-10-14T04:41:11.915Z" }, - { url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" }, -] - -[[package]] -name = "click" -version = "8.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "cuda-bindings" -version = "13.1.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cuda-pathfinder" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/58/b8d4c7c5fb29ba46088a7e78d1065484219f8fe41a08adc4a85b1ee56149/cuda_bindings-13.1.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a5f5a6ade0ad45096568bc4dd1eb3377b65884d29124338fe9a4353130ef6631", size = 15771605, upload-time = "2025-12-09T22:05:48.266Z" }, - { url = "https://files.pythonhosted.org/packages/17/af/710403f76f2d608d483d87089465e1f666351641dbd73d19bd025e652bad/cuda_bindings-13.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9348f69b03b257f07159dd4c869615e139722c2bd81e96c66f6b8f77615efd82", size = 16338970, upload-time = "2025-12-09T22:05:50.598Z" }, - { url = "https://files.pythonhosted.org/packages/64/1c/e7ea27d4cb7d07331c88e3bbed3cacc947d2237471801086c7447b3e195d/cuda_bindings-13.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:ec33b84f4bd65a86a734427f2b9cb8f221bedab2c4cfb681488cabc82f1d64ab", size = 15210672, upload-time = "2025-12-09T22:05:53.369Z" }, - { url = "https://files.pythonhosted.org/packages/53/3d/c8ed9d169843091f3f0d6b8218e826fd59520a37e0434c204feada597988/cuda_bindings-13.1.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e75ad0cb863330df784236d289612d71ca855c013d19ae00e5693574abd6915", size = 15530160, upload-time = "2025-12-09T22:05:55.386Z" }, - { url = "https://files.pythonhosted.org/packages/4a/8e/368295623ee43fba622909d780fbb6863efc1638dff55f67a0f04eac6470/cuda_bindings-13.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:25785d1a3cdcd98f151240fd5efd025609319a6720a217dee2a929241749d488", size = 16110386, upload-time = "2025-12-09T22:05:57.71Z" }, - { url = "https://files.pythonhosted.org/packages/60/1f/ecc4701ade3e85f091c625a920574527b9daf7fb354189fbfbc5516af6cd/cuda_bindings-13.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:ccde9c95c0e953b31fe7731bb08da9d0a34b1770498df9a3c156fdfdbe3951ad", size = 15250028, upload-time = "2025-12-09T22:06:00.346Z" }, -] - -[[package]] -name = "cuda-pathfinder" -version = "1.4.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ff/60/d8f1dbfb7f06b94c662e98c95189e6f39b817da638bc8fcea0d003f89e5d/cuda_pathfinder-1.4.0-py3-none-any.whl", hash = "sha256:437079ca59e7b61ae439ecc501d69ed87b3accc34d58153ef1e54815e2c2e118", size = 38406, upload-time = "2026-02-25T22:13:00.807Z" }, -] - -[[package]] -name = "cuda-python" -version = "13.1.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cuda-bindings" }, - { name = "cuda-pathfinder" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/cd/08/b5e3b9822662d72d540d830531e3ab6a7cabbda3dd56175696aabccfeb76/cuda_python-13.1.1-py3-none-any.whl", hash = "sha256:944cc4fe6482673d28dd545797a28840945a1668739328fa2ad1e9be4f7050d9", size = 8038, upload-time = "2025-12-09T22:13:10.719Z" }, -] - -[[package]] -name = "datasets" -version = "4.6.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "dill" }, - { name = "filelock" }, - { name = "fsspec", extra = ["http"] }, - { name = "httpx" }, - { name = "huggingface-hub" }, - { name = "multiprocess" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pandas" }, - { name = "pyarrow" }, - { name = "pyyaml" }, - { name = "requests" }, - { name = "tqdm" }, - { name = "xxhash" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d7/94/eb81c6fe32e9b6ef92223141b5a553aeff2e9456968424a8533cbe88f476/datasets-4.6.1.tar.gz", hash = "sha256:140ce500bc41939ff6ce995702d66b1f4b2ee7f117bb9b07512fab6804d4070a", size = 593865, upload-time = "2026-02-27T23:26:49.482Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/37/f0/99fe6eb530c7ee9ee1faee48059eb8a6437f80c893a496b98a78864e0fc6/datasets-4.6.1-py3-none-any.whl", hash = "sha256:f53228e6dadc9f837037b1bf3051d7d8c054abbb3eb29f1f022926e08090e0da", size = 520667, upload-time = "2026-02-27T23:26:46.855Z" }, -] - -[[package]] -name = "dill" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976, upload-time = "2025-04-16T00:41:48.867Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" }, -] - -[[package]] -name = "einops" -version = "0.8.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, -] - -[[package]] -name = "filelock" -version = "3.25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, -] - -[[package]] -name = "flashinfer-python" -version = "0.5.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "apache-tvm-ffi" }, - { name = "click" }, - { name = "einops" }, - { name = "ninja" }, - { name = "numpy" }, - { name = "nvidia-cudnn-frontend" }, - { name = "nvidia-cutlass-dsl" }, - { name = "nvidia-ml-py" }, - { name = "packaging" }, - { name = "requests" }, - { name = "tabulate" }, - { name = "torch" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d8/04/e357eaa50238e12c49e66fcf47f83e066e741ef19a117c136782b32eafbb/flashinfer_python-0.5.2.tar.gz", hash = "sha256:99d097a28be1e98c7f85e4a767e9e9a4794374f9318c27db14d21e367149063f", size = 4632657, upload-time = "2025-11-07T02:53:27.261Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/0c/4a8ffbbc0d85e314f534cf5c32711f2af5d5e6e49225a5a414400a67b684/flashinfer_python-0.5.2-py3-none-any.whl", hash = "sha256:739c27d86d5ff4e3ad1ea41dcb90bda08e44c332549bf696f9c9c5c57f608e63", size = 6936306, upload-time = "2025-11-07T02:53:25.515Z" }, -] - -[[package]] -name = "frozenlist" -version = "1.8.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bc/03/077f869d540370db12165c0aa51640a873fb661d8b315d1d4d67b284d7ac/frozenlist-1.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84", size = 86912, upload-time = "2025-10-06T05:35:45.98Z" }, - { url = "https://files.pythonhosted.org/packages/df/b5/7610b6bd13e4ae77b96ba85abea1c8cb249683217ef09ac9e0ae93f25a91/frozenlist-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9", size = 50046, upload-time = "2025-10-06T05:35:47.009Z" }, - { url = "https://files.pythonhosted.org/packages/6e/ef/0e8f1fe32f8a53dd26bdd1f9347efe0778b0fddf62789ea683f4cc7d787d/frozenlist-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93", size = 50119, upload-time = "2025-10-06T05:35:48.38Z" }, - { url = "https://files.pythonhosted.org/packages/11/b1/71a477adc7c36e5fb628245dfbdea2166feae310757dea848d02bd0689fd/frozenlist-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f", size = 231067, upload-time = "2025-10-06T05:35:49.97Z" }, - { url = "https://files.pythonhosted.org/packages/45/7e/afe40eca3a2dc19b9904c0f5d7edfe82b5304cb831391edec0ac04af94c2/frozenlist-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695", size = 233160, upload-time = "2025-10-06T05:35:51.729Z" }, - { url = "https://files.pythonhosted.org/packages/a6/aa/7416eac95603ce428679d273255ffc7c998d4132cfae200103f164b108aa/frozenlist-1.8.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52", size = 228544, upload-time = "2025-10-06T05:35:53.246Z" }, - { url = "https://files.pythonhosted.org/packages/8b/3d/2a2d1f683d55ac7e3875e4263d28410063e738384d3adc294f5ff3d7105e/frozenlist-1.8.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581", size = 243797, upload-time = "2025-10-06T05:35:54.497Z" }, - { url = "https://files.pythonhosted.org/packages/78/1e/2d5565b589e580c296d3bb54da08d206e797d941a83a6fdea42af23be79c/frozenlist-1.8.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567", size = 247923, upload-time = "2025-10-06T05:35:55.861Z" }, - { url = "https://files.pythonhosted.org/packages/aa/c3/65872fcf1d326a7f101ad4d86285c403c87be7d832b7470b77f6d2ed5ddc/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b", size = 230886, upload-time = "2025-10-06T05:35:57.399Z" }, - { url = "https://files.pythonhosted.org/packages/a0/76/ac9ced601d62f6956f03cc794f9e04c81719509f85255abf96e2510f4265/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92", size = 245731, upload-time = "2025-10-06T05:35:58.563Z" }, - { url = "https://files.pythonhosted.org/packages/b9/49/ecccb5f2598daf0b4a1415497eba4c33c1e8ce07495eb07d2860c731b8d5/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d", size = 241544, upload-time = "2025-10-06T05:35:59.719Z" }, - { url = "https://files.pythonhosted.org/packages/53/4b/ddf24113323c0bbcc54cb38c8b8916f1da7165e07b8e24a717b4a12cbf10/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd", size = 241806, upload-time = "2025-10-06T05:36:00.959Z" }, - { url = "https://files.pythonhosted.org/packages/a7/fb/9b9a084d73c67175484ba2789a59f8eebebd0827d186a8102005ce41e1ba/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967", size = 229382, upload-time = "2025-10-06T05:36:02.22Z" }, - { url = "https://files.pythonhosted.org/packages/95/a3/c8fb25aac55bf5e12dae5c5aa6a98f85d436c1dc658f21c3ac73f9fa95e5/frozenlist-1.8.0-cp311-cp311-win32.whl", hash = "sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25", size = 39647, upload-time = "2025-10-06T05:36:03.409Z" }, - { url = "https://files.pythonhosted.org/packages/0a/f5/603d0d6a02cfd4c8f2a095a54672b3cf967ad688a60fb9faf04fc4887f65/frozenlist-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b", size = 44064, upload-time = "2025-10-06T05:36:04.368Z" }, - { url = "https://files.pythonhosted.org/packages/5d/16/c2c9ab44e181f043a86f9a8f84d5124b62dbcb3a02c0977ec72b9ac1d3e0/frozenlist-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a", size = 39937, upload-time = "2025-10-06T05:36:05.669Z" }, - { url = "https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" }, - { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" }, - { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time = "2025-10-06T05:36:08.78Z" }, - { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411, upload-time = "2025-10-06T05:36:09.801Z" }, - { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014, upload-time = "2025-10-06T05:36:11.394Z" }, - { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909, upload-time = "2025-10-06T05:36:12.598Z" }, - { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049, upload-time = "2025-10-06T05:36:14.065Z" }, - { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485, upload-time = "2025-10-06T05:36:15.39Z" }, - { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619, upload-time = "2025-10-06T05:36:16.558Z" }, - { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320, upload-time = "2025-10-06T05:36:17.821Z" }, - { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820, upload-time = "2025-10-06T05:36:19.046Z" }, - { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518, upload-time = "2025-10-06T05:36:20.763Z" }, - { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096, upload-time = "2025-10-06T05:36:22.129Z" }, - { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985, upload-time = "2025-10-06T05:36:23.661Z" }, - { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591, upload-time = "2025-10-06T05:36:24.958Z" }, - { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102, upload-time = "2025-10-06T05:36:26.333Z" }, - { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" }, -] - -[[package]] -name = "fsspec" -version = "2026.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, -] - -[package.optional-dependencies] -http = [ - { name = "aiohttp" }, -] - -[[package]] -name = "gitdb" -version = "4.0.12" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "smmap" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" }, -] - -[[package]] -name = "gitpython" -version = "3.1.46" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "gitdb" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/df/b5/59d16470a1f0dfe8c793f9ef56fd3826093fc52b3bd96d6b9d6c26c7e27b/gitpython-3.1.46.tar.gz", hash = "sha256:400124c7d0ef4ea03f7310ac2fbf7151e09ff97f2a3288d64a440c584a29c37f", size = 215371, upload-time = "2026-01-01T15:37:32.073Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/09/e21df6aef1e1ffc0c816f0522ddc3f6dcded766c3261813131c78a704470/gitpython-3.1.46-py3-none-any.whl", hash = "sha256:79812ed143d9d25b6d176a10bb511de0f9c67b1fa641d82097b0ab90398a2058", size = 208620, upload-time = "2026-01-01T15:37:30.574Z" }, -] - -[[package]] -name = "h11" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, -] - -[[package]] -name = "hf-transfer" -version = "0.1.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1a/eb/8fc64f40388c29ce8ce3b2b180a089d4d6b25b1d0d232d016704cb852104/hf_transfer-0.1.9.tar.gz", hash = "sha256:035572865dab29d17e783fbf1e84cf1cb24f3fcf8f1b17db1cfc7fdf139f02bf", size = 25201, upload-time = "2025-01-07T10:05:12.947Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/f5/461d2e5f307e5048289b1168d5c642ae3bb2504e88dff1a38b92ed990a21/hf_transfer-0.1.9-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e66acf91df4a8b72f60223059df3003062a5ae111757187ed1a06750a30e911b", size = 1393046, upload-time = "2025-01-07T10:04:51.003Z" }, - { url = "https://files.pythonhosted.org/packages/41/ba/8d9fd9f1083525edfcb389c93738c802f3559cb749324090d7109c8bf4c2/hf_transfer-0.1.9-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:8669dbcc7a3e2e8d61d42cd24da9c50d57770bd74b445c65123291ca842a7e7a", size = 1348126, upload-time = "2025-01-07T10:04:45.712Z" }, - { url = "https://files.pythonhosted.org/packages/8e/a2/cd7885bc9959421065a6fae0fe67b6c55becdeda4e69b873e52976f9a9f0/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8fd0167c4407a3bc4cdd0307e65ada2294ec04f1813d8a69a5243e379b22e9d8", size = 3728604, upload-time = "2025-01-07T10:04:14.173Z" }, - { url = "https://files.pythonhosted.org/packages/f6/2e/a072cf196edfeda3310c9a5ade0a0fdd785e6154b3ce24fc738c818da2a7/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f", size = 3064995, upload-time = "2025-01-07T10:04:18.663Z" }, - { url = "https://files.pythonhosted.org/packages/c2/84/aec9ef4c0fab93c1ea2b1badff38c78b4b2f86f0555b26d2051dbc920cde/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5828057e313de59300dd1abb489444bc452efe3f479d3c55b31a8f680936ba42", size = 3580908, upload-time = "2025-01-07T10:04:32.834Z" }, - { url = "https://files.pythonhosted.org/packages/29/63/b560d39651a56603d64f1a0212d0472a44cbd965db2fa62b99d99cb981bf/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d", size = 3400839, upload-time = "2025-01-07T10:04:26.122Z" }, - { url = "https://files.pythonhosted.org/packages/d6/d8/f87ea6f42456254b48915970ed98e993110521e9263472840174d32c880d/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdca9bfb89e6f8f281890cc61a8aff2d3cecaff7e1a4d275574d96ca70098557", size = 3552664, upload-time = "2025-01-07T10:04:40.123Z" }, - { url = "https://files.pythonhosted.org/packages/d6/56/1267c39b65fc8f4e2113b36297320f102718bf5799b544a6cbe22013aa1d/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:89a23f58b7b7effbc047b8ca286f131b17728c99a9f972723323003ffd1bb916", size = 4073732, upload-time = "2025-01-07T10:04:55.624Z" }, - { url = "https://files.pythonhosted.org/packages/82/1a/9c748befbe3decf7cb415e34f8a0c3789a0a9c55910dea73d581e48c0ce5/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:dc7fff1345980d6c0ebb92c811d24afa4b98b3e07ed070c8e38cc91fd80478c5", size = 3390096, upload-time = "2025-01-07T10:04:59.98Z" }, - { url = "https://files.pythonhosted.org/packages/72/85/4c03da147b6b4b7cb12e074d3d44eee28604a387ed0eaf7eaaead5069c57/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1a6bd16c667ebe89a069ca163060127a794fa3a3525292c900b8c8cc47985b0d", size = 3664743, upload-time = "2025-01-07T10:05:05.416Z" }, - { url = "https://files.pythonhosted.org/packages/e7/6e/e597b04f753f1b09e6893075d53a82a30c13855cbaa791402695b01e369f/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2fde99d502093ade3ab1b53f80da18480e9902aa960dab7f74fb1b9e5bc5746", size = 3695243, upload-time = "2025-01-07T10:05:11.411Z" }, - { url = "https://files.pythonhosted.org/packages/09/89/d4e234727a26b2546c8fb70a276cd924260d60135f2165bf8b9ed67bb9a4/hf_transfer-0.1.9-cp38-abi3-win32.whl", hash = "sha256:435cc3cdc8524ce57b074032b8fd76eed70a4224d2091232fa6a8cef8fd6803e", size = 1086605, upload-time = "2025-01-07T10:05:18.873Z" }, - { url = "https://files.pythonhosted.org/packages/a1/14/f1e15b851d1c2af5b0b1a82bf8eb10bda2da62d98180220ba6fd8879bb5b/hf_transfer-0.1.9-cp38-abi3-win_amd64.whl", hash = "sha256:16f208fc678911c37e11aa7b586bc66a37d02e636208f18b6bc53d29b5df40ad", size = 1160240, upload-time = "2025-01-07T10:05:14.324Z" }, -] - -[[package]] -name = "hf-xet" -version = "1.3.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8b/cb/9bb543bd987ffa1ee48202cc96a756951b734b79a542335c566148ade36c/hf_xet-1.3.2.tar.gz", hash = "sha256:e130ee08984783d12717444e538587fa2119385e5bd8fc2bb9f930419b73a7af", size = 643646, upload-time = "2026-02-27T17:26:08.051Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/28/dbb024e2e3907f6f3052847ca7d1a2f7a3972fafcd53ff79018977fcb3e4/hf_xet-1.3.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f93b7595f1d8fefddfede775c18b5c9256757824f7f6832930b49858483cd56f", size = 3763961, upload-time = "2026-02-27T17:25:52.537Z" }, - { url = "https://files.pythonhosted.org/packages/e4/71/b99aed3823c9d1795e4865cf437d651097356a3f38c7d5877e4ac544b8e4/hf_xet-1.3.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:a85d3d43743174393afe27835bde0cd146e652b5fcfdbcd624602daef2ef3259", size = 3526171, upload-time = "2026-02-27T17:25:50.968Z" }, - { url = "https://files.pythonhosted.org/packages/9d/ca/907890ce6ef5598b5920514f255ed0a65f558f820515b18db75a51b2f878/hf_xet-1.3.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7c2a054a97c44e136b1f7f5a78f12b3efffdf2eed3abc6746fc5ea4b39511633", size = 4180750, upload-time = "2026-02-27T17:25:43.125Z" }, - { url = "https://files.pythonhosted.org/packages/8c/ad/bc7f41f87173d51d0bce497b171c4ee0cbde1eed2d7b4216db5d0ada9f50/hf_xet-1.3.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:06b724a361f670ae557836e57801b82c75b534812e351a87a2c739f77d1e0635", size = 3961035, upload-time = "2026-02-27T17:25:41.837Z" }, - { url = "https://files.pythonhosted.org/packages/73/38/600f4dda40c4a33133404d9fe644f1d35ff2d9babb4d0435c646c63dd107/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:305f5489d7241a47e0458ef49334be02411d1d0f480846363c1c8084ed9916f7", size = 4161378, upload-time = "2026-02-27T17:26:00.365Z" }, - { url = "https://files.pythonhosted.org/packages/00/b3/7bc1ff91d1ac18420b7ad1e169b618b27c00001b96310a89f8a9294fe509/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:06cdbde243c85f39a63b28e9034321399c507bcd5e7befdd17ed2ccc06dfe14e", size = 4398020, upload-time = "2026-02-27T17:26:03.977Z" }, - { url = "https://files.pythonhosted.org/packages/2b/0b/99bfd948a3ed3620ab709276df3ad3710dcea61976918cce8706502927af/hf_xet-1.3.2-cp37-abi3-win_amd64.whl", hash = "sha256:9298b47cce6037b7045ae41482e703c471ce36b52e73e49f71226d2e8e5685a1", size = 3641624, upload-time = "2026-02-27T17:26:13.542Z" }, - { url = "https://files.pythonhosted.org/packages/cc/02/9a6e4ca1f3f73a164c0cd48e41b3cc56585dcc37e809250de443d673266f/hf_xet-1.3.2-cp37-abi3-win_arm64.whl", hash = "sha256:83d8ec273136171431833a6957e8f3af496bee227a0fe47c7b8b39c106d1749a", size = 3503976, upload-time = "2026-02-27T17:26:12.123Z" }, -] - -[[package]] -name = "httpcore" -version = "1.0.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, -] - -[[package]] -name = "httpx" -version = "0.28.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "certifi" }, - { name = "httpcore" }, - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, -] - -[[package]] -name = "huggingface-hub" -version = "0.36.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, - { name = "packaging" }, - { name = "pyyaml" }, - { name = "requests" }, - { name = "tqdm" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, -] - -[[package]] -name = "idna" -version = "3.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, -] - -[[package]] -name = "jinja2" -version = "3.1.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markupsafe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, -] - -[[package]] -name = "markupsafe" -version = "3.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" }, - { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" }, - { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, - { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, - { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, - { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, - { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, - { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, - { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" }, - { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" }, - { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" }, - { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" }, - { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" }, - { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, - { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, - { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" }, - { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, - { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, - { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, - { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" }, - { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" }, - { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, -] - -[[package]] -name = "mpmath" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, -] - -[[package]] -name = "multidict" -version = "6.7.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" }, - { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" }, - { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" }, - { url = "https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" }, - { url = "https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" }, - { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" }, - { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" }, - { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" }, - { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" }, - { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" }, - { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" }, - { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" }, - { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" }, - { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" }, - { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = "2026-01-26T02:43:47.054Z" }, - { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" }, - { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" }, - { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" }, - { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" }, - { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" }, - { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" }, - { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" }, - { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" }, - { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" }, - { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" }, - { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" }, - { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" }, - { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" }, - { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" }, - { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" }, - { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" }, - { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" }, - { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = "2026-01-26T02:44:12.042Z" }, - { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" }, - { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" }, - { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" }, - { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, -] - -[[package]] -name = "multiprocess" -version = "0.70.18" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "dill" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/72/fd/2ae3826f5be24c6ed87266bc4e59c46ea5b059a103f3d7e7eb76a52aeecb/multiprocess-0.70.18.tar.gz", hash = "sha256:f9597128e6b3e67b23956da07cf3d2e5cba79e2f4e0fba8d7903636663ec6d0d", size = 1798503, upload-time = "2025-04-17T03:11:27.742Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/55/4d/9af0d1279c84618bcd35bf5fd7e371657358c7b0a523e54a9cffb87461f8/multiprocess-0.70.18-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b8940ae30139e04b076da6c5b83e9398585ebdf0f2ad3250673fef5b2ff06d6", size = 144695, upload-time = "2025-04-17T03:11:09.161Z" }, - { url = "https://files.pythonhosted.org/packages/17/bf/87323e79dd0562474fad3373c21c66bc6c3c9963b68eb2a209deb4c8575e/multiprocess-0.70.18-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0929ba95831adb938edbd5fb801ac45e705ecad9d100b3e653946b7716cb6bd3", size = 144742, upload-time = "2025-04-17T03:11:10.072Z" }, - { url = "https://files.pythonhosted.org/packages/dd/74/cb8c831e58dc6d5cf450b17c7db87f14294a1df52eb391da948b5e0a0b94/multiprocess-0.70.18-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d77f8e4bfe6c6e2e661925bbf9aed4d5ade9a1c6502d5dfc10129b9d1141797", size = 144745, upload-time = "2025-04-17T03:11:11.453Z" }, - { url = "https://files.pythonhosted.org/packages/ba/d8/0cba6cf51a1a31f20471fbc823a716170c73012ddc4fb85d706630ed6e8f/multiprocess-0.70.18-py310-none-any.whl", hash = "sha256:60c194974c31784019c1f459d984e8f33ee48f10fcf42c309ba97b30d9bd53ea", size = 134948, upload-time = "2025-04-17T03:11:20.223Z" }, - { url = "https://files.pythonhosted.org/packages/4b/88/9039f2fed1012ef584751d4ceff9ab4a51e5ae264898f0b7cbf44340a859/multiprocess-0.70.18-py311-none-any.whl", hash = "sha256:5aa6eef98e691281b3ad923be2832bf1c55dd2c859acd73e5ec53a66aae06a1d", size = 144462, upload-time = "2025-04-17T03:11:21.657Z" }, - { url = "https://files.pythonhosted.org/packages/bf/b6/5f922792be93b82ec6b5f270bbb1ef031fd0622847070bbcf9da816502cc/multiprocess-0.70.18-py312-none-any.whl", hash = "sha256:9b78f8e5024b573730bfb654783a13800c2c0f2dfc0c25e70b40d184d64adaa2", size = 150287, upload-time = "2025-04-17T03:11:22.69Z" }, - { url = "https://files.pythonhosted.org/packages/3b/c3/ca84c19bd14cdfc21c388fdcebf08b86a7a470ebc9f5c3c084fc2dbc50f7/multiprocess-0.70.18-py38-none-any.whl", hash = "sha256:dbf705e52a154fe5e90fb17b38f02556169557c2dd8bb084f2e06c2784d8279b", size = 132636, upload-time = "2025-04-17T03:11:24.936Z" }, - { url = "https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478, upload-time = "2025-04-17T03:11:26.253Z" }, -] - -[[package]] -name = "networkx" -version = "3.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, -] - -[[package]] -name = "ninja" -version = "1.13.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978", size = 242558, upload-time = "2025-08-11T15:10:19.421Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = "sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1", size = 310125, upload-time = "2025-08-11T15:09:50.971Z" }, - { url = "https://files.pythonhosted.org/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630", size = 177467, upload-time = "2025-08-11T15:09:52.767Z" }, - { url = "https://files.pythonhosted.org/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c", size = 187834, upload-time = "2025-08-11T15:09:54.115Z" }, - { url = "https://files.pythonhosted.org/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e", size = 202736, upload-time = "2025-08-11T15:09:55.745Z" }, - { url = "https://files.pythonhosted.org/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988", size = 179034, upload-time = "2025-08-11T15:09:57.394Z" }, - { url = "https://files.pythonhosted.org/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa", size = 180716, upload-time = "2025-08-11T15:09:58.696Z" }, - { url = "https://files.pythonhosted.org/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1", size = 146843, upload-time = "2025-08-11T15:10:00.046Z" }, - { url = "https://files.pythonhosted.org/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2", size = 154402, upload-time = "2025-08-11T15:10:01.657Z" }, - { url = "https://files.pythonhosted.org/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f", size = 552388, upload-time = "2025-08-11T15:10:03.349Z" }, - { url = "https://files.pythonhosted.org/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714", size = 472501, upload-time = "2025-08-11T15:10:04.735Z" }, - { url = "https://files.pythonhosted.org/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72", size = 638280, upload-time = "2025-08-11T15:10:06.512Z" }, - { url = "https://files.pythonhosted.org/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db", size = 642420, upload-time = "2025-08-11T15:10:08.35Z" }, - { url = "https://files.pythonhosted.org/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5", size = 585106, upload-time = "2025-08-11T15:10:09.818Z" }, - { url = "https://files.pythonhosted.org/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96", size = 707138, upload-time = "2025-08-11T15:10:11.366Z" }, - { url = "https://files.pythonhosted.org/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200", size = 581758, upload-time = "2025-08-11T15:10:13.295Z" }, - { url = "https://files.pythonhosted.org/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9", size = 267201, upload-time = "2025-08-11T15:10:15.158Z" }, - { url = "https://files.pythonhosted.org/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = "sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e", size = 309975, upload-time = "2025-08-11T15:10:16.697Z" }, - { url = "https://files.pythonhosted.org/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9", size = 290806, upload-time = "2025-08-11T15:10:18.018Z" }, -] - -[[package]] -name = "numpy" -version = "2.3.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d0/19/95b3d357407220ed24c139018d2518fab0a61a948e68286a25f1a4d049ff/numpy-2.3.3.tar.gz", hash = "sha256:ddc7c39727ba62b80dfdbedf400d1c10ddfa8eefbd7ec8dcb118be8b56d31029", size = 20576648, upload-time = "2025-09-09T16:54:12.543Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/45/e80d203ef6b267aa29b22714fb558930b27960a0c5ce3c19c999232bb3eb/numpy-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ffc4f5caba7dfcbe944ed674b7eef683c7e94874046454bb79ed7ee0236f59d", size = 21259253, upload-time = "2025-09-09T15:56:02.094Z" }, - { url = "https://files.pythonhosted.org/packages/52/18/cf2c648fccf339e59302e00e5f2bc87725a3ce1992f30f3f78c9044d7c43/numpy-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7e946c7170858a0295f79a60214424caac2ffdb0063d4d79cb681f9aa0aa569", size = 14450980, upload-time = "2025-09-09T15:56:05.926Z" }, - { url = "https://files.pythonhosted.org/packages/93/fb/9af1082bec870188c42a1c239839915b74a5099c392389ff04215dcee812/numpy-2.3.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cd4260f64bc794c3390a63bf0728220dd1a68170c169088a1e0dfa2fde1be12f", size = 5379709, upload-time = "2025-09-09T15:56:07.95Z" }, - { url = "https://files.pythonhosted.org/packages/75/0f/bfd7abca52bcbf9a4a65abc83fe18ef01ccdeb37bfb28bbd6ad613447c79/numpy-2.3.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:f0ddb4b96a87b6728df9362135e764eac3cfa674499943ebc44ce96c478ab125", size = 6913923, upload-time = "2025-09-09T15:56:09.443Z" }, - { url = "https://files.pythonhosted.org/packages/79/55/d69adad255e87ab7afda1caf93ca997859092afeb697703e2f010f7c2e55/numpy-2.3.3-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:afd07d377f478344ec6ca2b8d4ca08ae8bd44706763d1efb56397de606393f48", size = 14589591, upload-time = "2025-09-09T15:56:11.234Z" }, - { url = "https://files.pythonhosted.org/packages/10/a2/010b0e27ddeacab7839957d7a8f00e91206e0c2c47abbb5f35a2630e5387/numpy-2.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc92a5dedcc53857249ca51ef29f5e5f2f8c513e22cfb90faeb20343b8c6f7a6", size = 16938714, upload-time = "2025-09-09T15:56:14.637Z" }, - { url = "https://files.pythonhosted.org/packages/1c/6b/12ce8ede632c7126eb2762b9e15e18e204b81725b81f35176eac14dc5b82/numpy-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7af05ed4dc19f308e1d9fc759f36f21921eb7bbfc82843eeec6b2a2863a0aefa", size = 16370592, upload-time = "2025-09-09T15:56:17.285Z" }, - { url = "https://files.pythonhosted.org/packages/b4/35/aba8568b2593067bb6a8fe4c52babb23b4c3b9c80e1b49dff03a09925e4a/numpy-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:433bf137e338677cebdd5beac0199ac84712ad9d630b74eceeb759eaa45ddf30", size = 18884474, upload-time = "2025-09-09T15:56:20.943Z" }, - { url = "https://files.pythonhosted.org/packages/45/fa/7f43ba10c77575e8be7b0138d107e4f44ca4a1ef322cd16980ea3e8b8222/numpy-2.3.3-cp311-cp311-win32.whl", hash = "sha256:eb63d443d7b4ffd1e873f8155260d7f58e7e4b095961b01c91062935c2491e57", size = 6599794, upload-time = "2025-09-09T15:56:23.258Z" }, - { url = "https://files.pythonhosted.org/packages/0a/a2/a4f78cb2241fe5664a22a10332f2be886dcdea8784c9f6a01c272da9b426/numpy-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:ec9d249840f6a565f58d8f913bccac2444235025bbb13e9a4681783572ee3caa", size = 13088104, upload-time = "2025-09-09T15:56:25.476Z" }, - { url = "https://files.pythonhosted.org/packages/79/64/e424e975adbd38282ebcd4891661965b78783de893b381cbc4832fb9beb2/numpy-2.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:74c2a948d02f88c11a3c075d9733f1ae67d97c6bdb97f2bb542f980458b257e7", size = 10460772, upload-time = "2025-09-09T15:56:27.679Z" }, - { url = "https://files.pythonhosted.org/packages/51/5d/bb7fc075b762c96329147799e1bcc9176ab07ca6375ea976c475482ad5b3/numpy-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cfdd09f9c84a1a934cde1eec2267f0a43a7cd44b2cca4ff95b7c0d14d144b0bf", size = 20957014, upload-time = "2025-09-09T15:56:29.966Z" }, - { url = "https://files.pythonhosted.org/packages/6b/0e/c6211bb92af26517acd52125a237a92afe9c3124c6a68d3b9f81b62a0568/numpy-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cb32e3cf0f762aee47ad1ddc6672988f7f27045b0783c887190545baba73aa25", size = 14185220, upload-time = "2025-09-09T15:56:32.175Z" }, - { url = "https://files.pythonhosted.org/packages/22/f2/07bb754eb2ede9073f4054f7c0286b0d9d2e23982e090a80d478b26d35ca/numpy-2.3.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396b254daeb0a57b1fe0ecb5e3cff6fa79a380fa97c8f7781a6d08cd429418fe", size = 5113918, upload-time = "2025-09-09T15:56:34.175Z" }, - { url = "https://files.pythonhosted.org/packages/81/0a/afa51697e9fb74642f231ea36aca80fa17c8fb89f7a82abd5174023c3960/numpy-2.3.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:067e3d7159a5d8f8a0b46ee11148fc35ca9b21f61e3c49fbd0a027450e65a33b", size = 6647922, upload-time = "2025-09-09T15:56:36.149Z" }, - { url = "https://files.pythonhosted.org/packages/5d/f5/122d9cdb3f51c520d150fef6e87df9279e33d19a9611a87c0d2cf78a89f4/numpy-2.3.3-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1c02d0629d25d426585fb2e45a66154081b9fa677bc92a881ff1d216bc9919a8", size = 14281991, upload-time = "2025-09-09T15:56:40.548Z" }, - { url = "https://files.pythonhosted.org/packages/51/64/7de3c91e821a2debf77c92962ea3fe6ac2bc45d0778c1cbe15d4fce2fd94/numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9192da52b9745f7f0766531dcfa978b7763916f158bb63bdb8a1eca0068ab20", size = 16641643, upload-time = "2025-09-09T15:56:43.343Z" }, - { url = "https://files.pythonhosted.org/packages/30/e4/961a5fa681502cd0d68907818b69f67542695b74e3ceaa513918103b7e80/numpy-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cd7de500a5b66319db419dc3c345244404a164beae0d0937283b907d8152e6ea", size = 16056787, upload-time = "2025-09-09T15:56:46.141Z" }, - { url = "https://files.pythonhosted.org/packages/99/26/92c912b966e47fbbdf2ad556cb17e3a3088e2e1292b9833be1dfa5361a1a/numpy-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:93d4962d8f82af58f0b2eb85daaf1b3ca23fe0a85d0be8f1f2b7bb46034e56d7", size = 18579598, upload-time = "2025-09-09T15:56:49.844Z" }, - { url = "https://files.pythonhosted.org/packages/17/b6/fc8f82cb3520768718834f310c37d96380d9dc61bfdaf05fe5c0b7653e01/numpy-2.3.3-cp312-cp312-win32.whl", hash = "sha256:5534ed6b92f9b7dca6c0a19d6df12d41c68b991cef051d108f6dbff3babc4ebf", size = 6320800, upload-time = "2025-09-09T15:56:52.499Z" }, - { url = "https://files.pythonhosted.org/packages/32/ee/de999f2625b80d043d6d2d628c07d0d5555a677a3cf78fdf868d409b8766/numpy-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:497d7cad08e7092dba36e3d296fe4c97708c93daf26643a1ae4b03f6294d30eb", size = 12786615, upload-time = "2025-09-09T15:56:54.422Z" }, - { url = "https://files.pythonhosted.org/packages/49/6e/b479032f8a43559c383acb20816644f5f91c88f633d9271ee84f3b3a996c/numpy-2.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:ca0309a18d4dfea6fc6262a66d06c26cfe4640c3926ceec90e57791a82b6eee5", size = 10195936, upload-time = "2025-09-09T15:56:56.541Z" }, - { url = "https://files.pythonhosted.org/packages/b8/f2/7e0a37cfced2644c9563c529f29fa28acbd0960dde32ece683aafa6f4949/numpy-2.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1e02c7159791cd481e1e6d5ddd766b62a4d5acf8df4d4d1afe35ee9c5c33a41e", size = 21131019, upload-time = "2025-09-09T15:58:42.838Z" }, - { url = "https://files.pythonhosted.org/packages/1a/7e/3291f505297ed63831135a6cc0f474da0c868a1f31b0dd9a9f03a7a0d2ed/numpy-2.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:dca2d0fc80b3893ae72197b39f69d55a3cd8b17ea1b50aa4c62de82419936150", size = 14376288, upload-time = "2025-09-09T15:58:45.425Z" }, - { url = "https://files.pythonhosted.org/packages/bf/4b/ae02e985bdeee73d7b5abdefeb98aef1207e96d4c0621ee0cf228ddfac3c/numpy-2.3.3-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:99683cbe0658f8271b333a1b1b4bb3173750ad59c0c61f5bbdc5b318918fffe3", size = 5305425, upload-time = "2025-09-09T15:58:48.6Z" }, - { url = "https://files.pythonhosted.org/packages/8b/eb/9df215d6d7250db32007941500dc51c48190be25f2401d5b2b564e467247/numpy-2.3.3-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:d9d537a39cc9de668e5cd0e25affb17aec17b577c6b3ae8a3d866b479fbe88d0", size = 6819053, upload-time = "2025-09-09T15:58:50.401Z" }, - { url = "https://files.pythonhosted.org/packages/57/62/208293d7d6b2a8998a4a1f23ac758648c3c32182d4ce4346062018362e29/numpy-2.3.3-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8596ba2f8af5f93b01d97563832686d20206d303024777f6dfc2e7c7c3f1850e", size = 14420354, upload-time = "2025-09-09T15:58:52.704Z" }, - { url = "https://files.pythonhosted.org/packages/ed/0c/8e86e0ff7072e14a71b4c6af63175e40d1e7e933ce9b9e9f765a95b4e0c3/numpy-2.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1ec5615b05369925bd1125f27df33f3b6c8bc10d788d5999ecd8769a1fa04db", size = 16760413, upload-time = "2025-09-09T15:58:55.027Z" }, - { url = "https://files.pythonhosted.org/packages/af/11/0cc63f9f321ccf63886ac203336777140011fb669e739da36d8db3c53b98/numpy-2.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:2e267c7da5bf7309670523896df97f93f6e469fb931161f483cd6882b3b1a5dc", size = 12971844, upload-time = "2025-09-09T15:58:57.359Z" }, -] - -[[package]] -name = "nvidia-cublas-cu12" -version = "12.8.4.1" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, -] - -[[package]] -name = "nvidia-cuda-cupti-cu12" -version = "12.8.90" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, -] - -[[package]] -name = "nvidia-cuda-nvrtc-cu12" -version = "12.8.93" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, -] - -[[package]] -name = "nvidia-cuda-runtime-cu12" -version = "12.8.90" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, -] - -[[package]] -name = "nvidia-cudnn-cu12" -version = "9.10.2.21" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-cublas-cu12" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, -] - -[[package]] -name = "nvidia-cudnn-frontend" -version = "1.18.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/9a/83d3d080118de4a7810fa019349edec634b8b37b9cafaacd05719de62dd6/nvidia_cudnn_frontend-1.18.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6d4d0b88d617b233a503c84980b54d840b60b2734497d1a7a071ec5293daec2", size = 2023709, upload-time = "2026-01-27T23:32:10.912Z" }, - { url = "https://files.pythonhosted.org/packages/13/c7/c3624b3ed77b102618f26295e816b27f1c3ebb1143730237a9f51d403c3f/nvidia_cudnn_frontend-1.18.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:382ea063b92cbfd5b442cb75ff8422932d78276aecf139e46713ed1ad3d07af4", size = 2155568, upload-time = "2026-01-27T23:07:13.277Z" }, - { url = "https://files.pythonhosted.org/packages/52/dd/8613dfd029d076b86a8a87efe3f4bb4ab73cec15fa8fc27e665098f4d167/nvidia_cudnn_frontend-1.18.0-cp311-cp311-win_amd64.whl", hash = "sha256:baa509effc4d299d3f04e549d4188f88bca8a8b527f483cbd2f66bc18f13a8b1", size = 1591244, upload-time = "2026-01-27T23:08:44.691Z" }, - { url = "https://files.pythonhosted.org/packages/e3/b4/604e230378680ee117849a4e1045baca092f93161a829291a84d5acce70c/nvidia_cudnn_frontend-1.18.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:310b417f2848a83d1437203fcaeea320a74fb7f28af20bf42bf5afc9c01f1c12", size = 2027408, upload-time = "2026-01-27T23:32:46.576Z" }, - { url = "https://files.pythonhosted.org/packages/c6/52/08f98262e77b1cbcc834cc1a5db494d0661ea1dbdea58c2e2d51a57fdaca/nvidia_cudnn_frontend-1.18.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c023539ca6de99234cf5102c3ec0d6af817f5396fc93028a22ba5b834a35b8a", size = 2159245, upload-time = "2026-01-27T23:07:32.664Z" }, - { url = "https://files.pythonhosted.org/packages/aa/1f/751a5a8cfdc95fb4dc556192d37369ae488c30c473fe9a3ec720b23d07ea/nvidia_cudnn_frontend-1.18.0-cp312-cp312-win_amd64.whl", hash = "sha256:e13f7dd46cdb4762dde87f181f06d1c5e15e9478bbdd547bfa74d9b11f415aae", size = 1591041, upload-time = "2026-01-27T23:09:04.118Z" }, -] - -[[package]] -name = "nvidia-cufft-cu12" -version = "11.3.3.83" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, -] - -[[package]] -name = "nvidia-cufile-cu12" -version = "1.13.1.3" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, -] - -[[package]] -name = "nvidia-curand-cu12" -version = "10.3.9.90" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, -] - -[[package]] -name = "nvidia-cusolver-cu12" -version = "11.7.3.90" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, -] - -[[package]] -name = "nvidia-cusparse-cu12" -version = "12.5.8.93" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, -] - -[[package]] -name = "nvidia-cusparselt-cu12" -version = "0.7.1" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, -] - -[[package]] -name = "nvidia-cutlass-dsl" -version = "4.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cuda-python" }, - { name = "numpy" }, - { name = "typing-extensions" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/2a/e65312728338e5bb00b592ce0be12b51e7594a3ef288cd8c99bc1c456968/nvidia_cutlass_dsl-4.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:04e605417773957405cad0ac6c2d46139a88aca07a783b4f66e1363f3a91a835", size = 58540069, upload-time = "2025-09-23T14:38:56.002Z" }, - { url = "https://files.pythonhosted.org/packages/be/f3/20eacdf9876abd892668c191003edc5d7100e45fabfa027d9f3f99d21871/nvidia_cutlass_dsl-4.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:874aa3620b3d3dc6598af2226fa3b78f2e7998b8656929b492259e0c9f778786", size = 62233009, upload-time = "2025-09-23T14:39:23.308Z" }, - { url = "https://files.pythonhosted.org/packages/1e/1d/f168a3dbd8570e5dbbe0deca217d7b374c977b4a4970ebadf3b6d0f1174f/nvidia_cutlass_dsl-4.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:10ace6e2005cb0bc04d158c7660f8ec104ab29aeffb26f1ed3bb0b5a577ccc34", size = 58535504, upload-time = "2025-09-23T14:38:29.028Z" }, - { url = "https://files.pythonhosted.org/packages/02/ab/5bcc0c8c620af5d4acbc71abce10e3eb3023e50342e6bc29b6461f72530e/nvidia_cutlass_dsl-4.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d7ddc9c1f5bb803718d736c907fac857fc606f1fce630c0b1d741935a72723b9", size = 62230361, upload-time = "2025-09-23T14:40:18.156Z" }, -] - -[[package]] -name = "nvidia-ml-py" -version = "13.590.48" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/af/a0/f4fc18cf72f06821a9a665085435b901449986855519d5b3843532db35e9/nvidia_ml_py-13.590.48.tar.gz", hash = "sha256:8184d1be52914ac7f0991cd1c0d946c65dc88a840c754cd12c274b77b88760dd", size = 49732, upload-time = "2026-01-22T01:14:56.456Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/72/fb2af0d259a651affdce65fd6a495f0e07a685a0136baf585c5065204ee7/nvidia_ml_py-13.590.48-py3-none-any.whl", hash = "sha256:fd43d30ee9cd0b7940f5f9f9220b68d42722975e3992b6c21d14144c48760e43", size = 50680, upload-time = "2026-01-22T01:14:55.281Z" }, -] - -[[package]] -name = "nvidia-nccl-cu12" -version = "2.27.3" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/5b/4e4fff7bad39adf89f735f2bc87248c81db71205b62bcc0d5ca5b606b3c3/nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adf27ccf4238253e0b826bce3ff5fa532d65fc42322c8bfdfaf28024c0fbe039", size = 322364134, upload-time = "2025-06-03T21:58:04.013Z" }, -] - -[[package]] -name = "nvidia-nvjitlink-cu12" -version = "12.8.93" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, -] - -[[package]] -name = "nvidia-nvtx-cu12" -version = "12.8.90" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, -] - -[[package]] -name = "packaging" -version = "26.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, -] - -[[package]] -name = "pandas" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "python-dateutil" }, - { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2e/0c/b28ed414f080ee0ad153f848586d61d1878f91689950f037f976ce15f6c8/pandas-3.0.1.tar.gz", hash = "sha256:4186a699674af418f655dbd420ed87f50d56b4cd6603784279d9eef6627823c8", size = 4641901, upload-time = "2026-02-17T22:20:16.434Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ff/07/c7087e003ceee9b9a82539b40414ec557aa795b584a1a346e89180853d79/pandas-3.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de09668c1bf3b925c07e5762291602f0d789eca1b3a781f99c1c78f6cac0e7ea", size = 10323380, upload-time = "2026-02-17T22:18:16.133Z" }, - { url = "https://files.pythonhosted.org/packages/c1/27/90683c7122febeefe84a56f2cde86a9f05f68d53885cebcc473298dfc33e/pandas-3.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:24ba315ba3d6e5806063ac6eb717504e499ce30bd8c236d8693a5fd3f084c796", size = 9923455, upload-time = "2026-02-17T22:18:19.13Z" }, - { url = "https://files.pythonhosted.org/packages/0e/f1/ed17d927f9950643bc7631aa4c99ff0cc83a37864470bc419345b656a41f/pandas-3.0.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:406ce835c55bac912f2a0dcfaf27c06d73c6b04a5dde45f1fd3169ce31337389", size = 10753464, upload-time = "2026-02-17T22:18:21.134Z" }, - { url = "https://files.pythonhosted.org/packages/2e/7c/870c7e7daec2a6c7ff2ac9e33b23317230d4e4e954b35112759ea4a924a7/pandas-3.0.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:830994d7e1f31dd7e790045235605ab61cff6c94defc774547e8b7fdfbff3dc7", size = 11255234, upload-time = "2026-02-17T22:18:24.175Z" }, - { url = "https://files.pythonhosted.org/packages/5c/39/3653fe59af68606282b989c23d1a543ceba6e8099cbcc5f1d506a7bae2aa/pandas-3.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a64ce8b0f2de1d2efd2ae40b0abe7f8ae6b29fbfb3812098ed5a6f8e235ad9bf", size = 11767299, upload-time = "2026-02-17T22:18:26.824Z" }, - { url = "https://files.pythonhosted.org/packages/9b/31/1daf3c0c94a849c7a8dab8a69697b36d313b229918002ba3e409265c7888/pandas-3.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9832c2c69da24b602c32e0c7b1b508a03949c18ba08d4d9f1c1033426685b447", size = 12333292, upload-time = "2026-02-17T22:18:28.996Z" }, - { url = "https://files.pythonhosted.org/packages/1f/67/af63f83cd6ca603a00fe8530c10a60f0879265b8be00b5930e8e78c5b30b/pandas-3.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:84f0904a69e7365f79a0c77d3cdfccbfb05bf87847e3a51a41e1426b0edb9c79", size = 9892176, upload-time = "2026-02-17T22:18:31.79Z" }, - { url = "https://files.pythonhosted.org/packages/79/ab/9c776b14ac4b7b4140788eca18468ea39894bc7340a408f1d1e379856a6b/pandas-3.0.1-cp311-cp311-win_arm64.whl", hash = "sha256:4a68773d5a778afb31d12e34f7dd4612ab90de8c6fb1d8ffe5d4a03b955082a1", size = 9151328, upload-time = "2026-02-17T22:18:35.721Z" }, - { url = "https://files.pythonhosted.org/packages/37/51/b467209c08dae2c624873d7491ea47d2b47336e5403309d433ea79c38571/pandas-3.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:476f84f8c20c9f5bc47252b66b4bb25e1a9fc2fa98cead96744d8116cb85771d", size = 10344357, upload-time = "2026-02-17T22:18:38.262Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f1/e2567ffc8951ab371db2e40b2fe068e36b81d8cf3260f06ae508700e5504/pandas-3.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0ab749dfba921edf641d4036c4c21c0b3ea70fea478165cb98a998fb2a261955", size = 9884543, upload-time = "2026-02-17T22:18:41.476Z" }, - { url = "https://files.pythonhosted.org/packages/d7/39/327802e0b6d693182403c144edacbc27eb82907b57062f23ef5a4c4a5ea7/pandas-3.0.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b8e36891080b87823aff3640c78649b91b8ff6eea3c0d70aeabd72ea43ab069b", size = 10396030, upload-time = "2026-02-17T22:18:43.822Z" }, - { url = "https://files.pythonhosted.org/packages/3d/fe/89d77e424365280b79d99b3e1e7d606f5165af2f2ecfaf0c6d24c799d607/pandas-3.0.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:532527a701281b9dd371e2f582ed9094f4c12dd9ffb82c0c54ee28d8ac9520c4", size = 10876435, upload-time = "2026-02-17T22:18:45.954Z" }, - { url = "https://files.pythonhosted.org/packages/b5/a6/2a75320849dd154a793f69c951db759aedb8d1dd3939eeacda9bdcfa1629/pandas-3.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:356e5c055ed9b0da1580d465657bc7d00635af4fd47f30afb23025352ba764d1", size = 11405133, upload-time = "2026-02-17T22:18:48.533Z" }, - { url = "https://files.pythonhosted.org/packages/58/53/1d68fafb2e02d7881df66aa53be4cd748d25cbe311f3b3c85c93ea5d30ca/pandas-3.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9d810036895f9ad6345b8f2a338dd6998a74e8483847403582cab67745bff821", size = 11932065, upload-time = "2026-02-17T22:18:50.837Z" }, - { url = "https://files.pythonhosted.org/packages/75/08/67cc404b3a966b6df27b38370ddd96b3b023030b572283d035181854aac5/pandas-3.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:536232a5fe26dd989bd633e7a0c450705fdc86a207fec7254a55e9a22950fe43", size = 9741627, upload-time = "2026-02-17T22:18:53.905Z" }, - { url = "https://files.pythonhosted.org/packages/86/4f/caf9952948fb00d23795f09b893d11f1cacb384e666854d87249530f7cbe/pandas-3.0.1-cp312-cp312-win_arm64.whl", hash = "sha256:0f463ebfd8de7f326d38037c7363c6dacb857c5881ab8961fb387804d6daf2f7", size = 9052483, upload-time = "2026-02-17T22:18:57.31Z" }, -] - -[[package]] -name = "platformdirs" -version = "4.9.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1b/04/fea538adf7dbbd6d186f551d595961e564a3b6715bdf276b477460858672/platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291", size = 28394, upload-time = "2026-02-16T03:56:10.574Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/31/05e764397056194206169869b50cf2fee4dbbbc71b344705b9c0d878d4d8/platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd", size = 21168, upload-time = "2026-02-16T03:56:08.891Z" }, -] - -[[package]] -name = "propcache" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8c/d4/4e2c9aaf7ac2242b9358f98dccd8f90f2605402f5afeff6c578682c2c491/propcache-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf", size = 80208, upload-time = "2025-10-08T19:46:24.597Z" }, - { url = "https://files.pythonhosted.org/packages/c2/21/d7b68e911f9c8e18e4ae43bdbc1e1e9bbd971f8866eb81608947b6f585ff/propcache-0.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5", size = 45777, upload-time = "2025-10-08T19:46:25.733Z" }, - { url = "https://files.pythonhosted.org/packages/d3/1d/11605e99ac8ea9435651ee71ab4cb4bf03f0949586246476a25aadfec54a/propcache-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e", size = 47647, upload-time = "2025-10-08T19:46:27.304Z" }, - { url = "https://files.pythonhosted.org/packages/58/1a/3c62c127a8466c9c843bccb503d40a273e5cc69838805f322e2826509e0d/propcache-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566", size = 214929, upload-time = "2025-10-08T19:46:28.62Z" }, - { url = "https://files.pythonhosted.org/packages/56/b9/8fa98f850960b367c4b8fe0592e7fc341daa7a9462e925228f10a60cf74f/propcache-0.4.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165", size = 221778, upload-time = "2025-10-08T19:46:30.358Z" }, - { url = "https://files.pythonhosted.org/packages/46/a6/0ab4f660eb59649d14b3d3d65c439421cf2f87fe5dd68591cbe3c1e78a89/propcache-0.4.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc", size = 228144, upload-time = "2025-10-08T19:46:32.607Z" }, - { url = "https://files.pythonhosted.org/packages/52/6a/57f43e054fb3d3a56ac9fc532bc684fc6169a26c75c353e65425b3e56eef/propcache-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48", size = 210030, upload-time = "2025-10-08T19:46:33.969Z" }, - { url = "https://files.pythonhosted.org/packages/40/e2/27e6feebb5f6b8408fa29f5efbb765cd54c153ac77314d27e457a3e993b7/propcache-0.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570", size = 208252, upload-time = "2025-10-08T19:46:35.309Z" }, - { url = "https://files.pythonhosted.org/packages/9e/f8/91c27b22ccda1dbc7967f921c42825564fa5336a01ecd72eb78a9f4f53c2/propcache-0.4.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85", size = 202064, upload-time = "2025-10-08T19:46:36.993Z" }, - { url = "https://files.pythonhosted.org/packages/f2/26/7f00bd6bd1adba5aafe5f4a66390f243acab58eab24ff1a08bebb2ef9d40/propcache-0.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e", size = 212429, upload-time = "2025-10-08T19:46:38.398Z" }, - { url = "https://files.pythonhosted.org/packages/84/89/fd108ba7815c1117ddca79c228f3f8a15fc82a73bca8b142eb5de13b2785/propcache-0.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757", size = 216727, upload-time = "2025-10-08T19:46:39.732Z" }, - { url = "https://files.pythonhosted.org/packages/79/37/3ec3f7e3173e73f1d600495d8b545b53802cbf35506e5732dd8578db3724/propcache-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f", size = 205097, upload-time = "2025-10-08T19:46:41.025Z" }, - { url = "https://files.pythonhosted.org/packages/61/b0/b2631c19793f869d35f47d5a3a56fb19e9160d3c119f15ac7344fc3ccae7/propcache-0.4.1-cp311-cp311-win32.whl", hash = "sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1", size = 38084, upload-time = "2025-10-08T19:46:42.693Z" }, - { url = "https://files.pythonhosted.org/packages/f4/78/6cce448e2098e9f3bfc91bb877f06aa24b6ccace872e39c53b2f707c4648/propcache-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6", size = 41637, upload-time = "2025-10-08T19:46:43.778Z" }, - { url = "https://files.pythonhosted.org/packages/9c/e9/754f180cccd7f51a39913782c74717c581b9cc8177ad0e949f4d51812383/propcache-0.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239", size = 38064, upload-time = "2025-10-08T19:46:44.872Z" }, - { url = "https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" }, - { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" }, - { url = "https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" }, - { url = "https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72", size = 225505, upload-time = "2025-10-08T19:46:50.055Z" }, - { url = "https://files.pythonhosted.org/packages/01/5d/1c53f4563490b1d06a684742cc6076ef944bc6457df6051b7d1a877c057b/propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367", size = 230242, upload-time = "2025-10-08T19:46:51.815Z" }, - { url = "https://files.pythonhosted.org/packages/20/e1/ce4620633b0e2422207c3cb774a0ee61cac13abc6217763a7b9e2e3f4a12/propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4", size = 238474, upload-time = "2025-10-08T19:46:53.208Z" }, - { url = "https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf", size = 221575, upload-time = "2025-10-08T19:46:54.511Z" }, - { url = "https://files.pythonhosted.org/packages/6e/a5/8a5e8678bcc9d3a1a15b9a29165640d64762d424a16af543f00629c87338/propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3", size = 216736, upload-time = "2025-10-08T19:46:56.212Z" }, - { url = "https://files.pythonhosted.org/packages/f1/63/b7b215eddeac83ca1c6b934f89d09a625aa9ee4ba158338854c87210cc36/propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778", size = 213019, upload-time = "2025-10-08T19:46:57.595Z" }, - { url = "https://files.pythonhosted.org/packages/57/74/f580099a58c8af587cac7ba19ee7cb418506342fbbe2d4a4401661cca886/propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6", size = 220376, upload-time = "2025-10-08T19:46:59.067Z" }, - { url = "https://files.pythonhosted.org/packages/c4/ee/542f1313aff7eaf19c2bb758c5d0560d2683dac001a1c96d0774af799843/propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9", size = 226988, upload-time = "2025-10-08T19:47:00.544Z" }, - { url = "https://files.pythonhosted.org/packages/8f/18/9c6b015dd9c6930f6ce2229e1f02fb35298b847f2087ea2b436a5bfa7287/propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75", size = 215615, upload-time = "2025-10-08T19:47:01.968Z" }, - { url = "https://files.pythonhosted.org/packages/80/9e/e7b85720b98c45a45e1fca6a177024934dc9bc5f4d5dd04207f216fc33ed/propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8", size = 38066, upload-time = "2025-10-08T19:47:03.503Z" }, - { url = "https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db", size = 41655, upload-time = "2025-10-08T19:47:04.973Z" }, - { url = "https://files.pythonhosted.org/packages/68/ab/6b5c191bb5de08036a8c697b265d4ca76148efb10fa162f14af14fb5f076/propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1", size = 37789, upload-time = "2025-10-08T19:47:06.077Z" }, - { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" }, -] - -[[package]] -name = "protobuf" -version = "6.33.5" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/25/7c72c307aafc96fa87062aa6291d9f7c94836e43214d43722e86037aac02/protobuf-6.33.5.tar.gz", hash = "sha256:6ddcac2a081f8b7b9642c09406bc6a4290128fce5f471cddd165960bb9119e5c", size = 444465, upload-time = "2026-01-29T21:51:33.494Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/79/af92d0a8369732b027e6d6084251dd8e782c685c72da161bd4a2e00fbabb/protobuf-6.33.5-cp310-abi3-win32.whl", hash = "sha256:d71b040839446bac0f4d162e758bea99c8251161dae9d0983a3b88dee345153b", size = 425769, upload-time = "2026-01-29T21:51:21.751Z" }, - { url = "https://files.pythonhosted.org/packages/55/75/bb9bc917d10e9ee13dee8607eb9ab963b7cf8be607c46e7862c748aa2af7/protobuf-6.33.5-cp310-abi3-win_amd64.whl", hash = "sha256:3093804752167bcab3998bec9f1048baae6e29505adaf1afd14a37bddede533c", size = 437118, upload-time = "2026-01-29T21:51:24.022Z" }, - { url = "https://files.pythonhosted.org/packages/a2/6b/e48dfc1191bc5b52950246275bf4089773e91cb5ba3592621723cdddca62/protobuf-6.33.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a5cb85982d95d906df1e2210e58f8e4f1e3cdc088e52c921a041f9c9a0386de5", size = 427766, upload-time = "2026-01-29T21:51:25.413Z" }, - { url = "https://files.pythonhosted.org/packages/4e/b1/c79468184310de09d75095ed1314b839eb2f72df71097db9d1404a1b2717/protobuf-6.33.5-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:9b71e0281f36f179d00cbcb119cb19dec4d14a81393e5ea220f64b286173e190", size = 324638, upload-time = "2026-01-29T21:51:26.423Z" }, - { url = "https://files.pythonhosted.org/packages/c5/f5/65d838092fd01c44d16037953fd4c2cc851e783de9b8f02b27ec4ffd906f/protobuf-6.33.5-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:8afa18e1d6d20af15b417e728e9f60f3aa108ee76f23c3b2c07a2c3b546d3afd", size = 339411, upload-time = "2026-01-29T21:51:27.446Z" }, - { url = "https://files.pythonhosted.org/packages/9b/53/a9443aa3ca9ba8724fdfa02dd1887c1bcd8e89556b715cfbacca6b63dbec/protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:cbf16ba3350fb7b889fca858fb215967792dc125b35c7976ca4818bee3521cf0", size = 323465, upload-time = "2026-01-29T21:51:28.925Z" }, - { url = "https://files.pythonhosted.org/packages/57/bf/2086963c69bdac3d7cff1cc7ff79b8ce5ea0bec6797a017e1be338a46248/protobuf-6.33.5-py3-none-any.whl", hash = "sha256:69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02", size = 170687, upload-time = "2026-01-29T21:51:32.557Z" }, -] - -[[package]] -name = "pyarrow" -version = "23.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, - { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, - { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, - { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, - { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, - { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, - { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, - { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, - { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, - { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, - { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, - { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, - { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, - { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, -] - -[[package]] -name = "pydantic" -version = "2.12.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-types" }, - { name = "pydantic-core" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, -] - -[[package]] -name = "pydantic-core" -version = "2.41.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, - { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, - { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, - { url = "https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, - { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, - { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, - { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, - { url = "https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, - { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, - { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, - { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, - { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, - { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, - { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, - { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, - { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, - { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, - { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, - { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" }, - { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, - { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, - { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, - { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, - { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, - { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, - { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, - { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, - { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, - { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, - { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, - { url = "https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, - { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, - { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, - { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, -] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, -] - -[[package]] -name = "pyyaml" -version = "6.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, - { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, - { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, - { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" }, - { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, - { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, - { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, - { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, - { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, - { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, - { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, - { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, - { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, - { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, - { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, - { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, - { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, - { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, - { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, -] - -[[package]] -name = "regex" -version = "2026.2.28" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8b/71/41455aa99a5a5ac1eaf311f5d8efd9ce6433c03ac1e0962de163350d0d97/regex-2026.2.28.tar.gz", hash = "sha256:a729e47d418ea11d03469f321aaf67cdee8954cde3ff2cf8403ab87951ad10f2", size = 415184, upload-time = "2026-02-28T02:19:42.792Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/db/8cbfd0ba3f302f2d09dd0019a9fcab74b63fee77a76c937d0e33161fb8c1/regex-2026.2.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e621fb7c8dc147419b28e1702f58a0177ff8308a76fa295c71f3e7827849f5d9", size = 488462, upload-time = "2026-02-28T02:16:22.616Z" }, - { url = "https://files.pythonhosted.org/packages/5d/10/ccc22c52802223f2368731964ddd117799e1390ffc39dbb31634a83022ee/regex-2026.2.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0d5bef2031cbf38757a0b0bc4298bb4824b6332d28edc16b39247228fbdbad97", size = 290774, upload-time = "2026-02-28T02:16:23.993Z" }, - { url = "https://files.pythonhosted.org/packages/62/b9/6796b3bf3101e64117201aaa3a5a030ec677ecf34b3cd6141b5d5c6c67d5/regex-2026.2.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bcb399ed84eabf4282587ba151f2732ad8168e66f1d3f85b1d038868fe547703", size = 288724, upload-time = "2026-02-28T02:16:25.403Z" }, - { url = "https://files.pythonhosted.org/packages/9c/02/291c0ae3f3a10cea941d0f5366da1843d8d1fa8a25b0671e20a0e454bb38/regex-2026.2.28-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c1b34dfa72f826f535b20712afa9bb3ba580020e834f3c69866c5bddbf10098", size = 791924, upload-time = "2026-02-28T02:16:26.863Z" }, - { url = "https://files.pythonhosted.org/packages/0f/57/f0235cc520d9672742196c5c15098f8f703f2758d48d5a7465a56333e496/regex-2026.2.28-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:851fa70df44325e1e4cdb79c5e676e91a78147b1b543db2aec8734d2add30ec2", size = 860095, upload-time = "2026-02-28T02:16:28.772Z" }, - { url = "https://files.pythonhosted.org/packages/b3/7c/393c94cbedda79a0f5f2435ebd01644aba0b338d327eb24b4aa5b8d6c07f/regex-2026.2.28-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:516604edd17b1c2c3e579cf4e9b25a53bf8fa6e7cedddf1127804d3e0140ca64", size = 906583, upload-time = "2026-02-28T02:16:30.977Z" }, - { url = "https://files.pythonhosted.org/packages/2c/73/a72820f47ca5abf2b5d911d0407ba5178fc52cf9780191ed3a54f5f419a2/regex-2026.2.28-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e7ce83654d1ab701cb619285a18a8e5a889c1216d746ddc710c914ca5fd71022", size = 800234, upload-time = "2026-02-28T02:16:32.55Z" }, - { url = "https://files.pythonhosted.org/packages/34/b3/6e6a4b7b31fa998c4cf159a12cbeaf356386fbd1a8be743b1e80a3da51e4/regex-2026.2.28-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2791948f7c70bb9335a9102df45e93d428f4b8128020d85920223925d73b9e1", size = 772803, upload-time = "2026-02-28T02:16:34.029Z" }, - { url = "https://files.pythonhosted.org/packages/10/e7/5da0280c765d5a92af5e1cd324b3fe8464303189cbaa449de9a71910e273/regex-2026.2.28-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:03a83cc26aa2acda6b8b9dfe748cf9e84cbd390c424a1de34fdcef58961a297a", size = 781117, upload-time = "2026-02-28T02:16:36.253Z" }, - { url = "https://files.pythonhosted.org/packages/76/39/0b8d7efb256ae34e1b8157acc1afd8758048a1cf0196e1aec2e71fd99f4b/regex-2026.2.28-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ec6f5674c5dc836994f50f1186dd1fafde4be0666aae201ae2fcc3d29d8adf27", size = 854224, upload-time = "2026-02-28T02:16:38.119Z" }, - { url = "https://files.pythonhosted.org/packages/21/ff/a96d483ebe8fe6d1c67907729202313895d8de8495569ec319c6f29d0438/regex-2026.2.28-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:50c2fc924749543e0eacc93ada6aeeb3ea5f6715825624baa0dccaec771668ae", size = 761898, upload-time = "2026-02-28T02:16:40.333Z" }, - { url = "https://files.pythonhosted.org/packages/89/bd/d4f2e75cb4a54b484e796017e37c0d09d8a0a837de43d17e238adf163f4e/regex-2026.2.28-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ba55c50f408fb5c346a3a02d2ce0ebc839784e24f7c9684fde328ff063c3cdea", size = 844832, upload-time = "2026-02-28T02:16:41.875Z" }, - { url = "https://files.pythonhosted.org/packages/8a/a7/428a135cf5e15e4e11d1e696eb2bf968362f8ea8a5f237122e96bc2ae950/regex-2026.2.28-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:edb1b1b3a5576c56f08ac46f108c40333f222ebfd5cf63afdfa3aab0791ebe5b", size = 788347, upload-time = "2026-02-28T02:16:43.472Z" }, - { url = "https://files.pythonhosted.org/packages/a9/59/68691428851cf9c9c3707217ab1d9b47cfeec9d153a49919e6c368b9e926/regex-2026.2.28-cp311-cp311-win32.whl", hash = "sha256:948c12ef30ecedb128903c2c2678b339746eb7c689c5c21957c4a23950c96d15", size = 266033, upload-time = "2026-02-28T02:16:45.094Z" }, - { url = "https://files.pythonhosted.org/packages/42/8b/1483de1c57024e89296cbcceb9cccb3f625d416ddb46e570be185c9b05a9/regex-2026.2.28-cp311-cp311-win_amd64.whl", hash = "sha256:fd63453f10d29097cc3dc62d070746523973fb5aa1c66d25f8558bebd47fed61", size = 277978, upload-time = "2026-02-28T02:16:46.75Z" }, - { url = "https://files.pythonhosted.org/packages/a4/36/abec45dc6e7252e3dbc797120496e43bb5730a7abf0d9cb69340696a2f2d/regex-2026.2.28-cp311-cp311-win_arm64.whl", hash = "sha256:00f2b8d9615aa165fdff0a13f1a92049bfad555ee91e20d246a51aa0b556c60a", size = 270340, upload-time = "2026-02-28T02:16:48.626Z" }, - { url = "https://files.pythonhosted.org/packages/07/42/9061b03cf0fc4b5fa2c3984cbbaed54324377e440a5c5a29d29a72518d62/regex-2026.2.28-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fcf26c3c6d0da98fada8ae4ef0aa1c3405a431c0a77eb17306d38a89b02adcd7", size = 489574, upload-time = "2026-02-28T02:16:50.455Z" }, - { url = "https://files.pythonhosted.org/packages/77/83/0c8a5623a233015595e3da499c5a1c13720ac63c107897a6037bb97af248/regex-2026.2.28-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02473c954af35dd2defeb07e44182f5705b30ea3f351a7cbffa9177beb14da5d", size = 291426, upload-time = "2026-02-28T02:16:52.52Z" }, - { url = "https://files.pythonhosted.org/packages/9e/06/3ef1ac6910dc3295ebd71b1f9bfa737e82cfead211a18b319d45f85ddd09/regex-2026.2.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b65d33a17101569f86d9c5966a8b1d7fbf8afdda5a8aa219301b0a80f58cf7d", size = 289200, upload-time = "2026-02-28T02:16:54.08Z" }, - { url = "https://files.pythonhosted.org/packages/dd/c9/8cc8d850b35ab5650ff6756a1cb85286e2000b66c97520b29c1587455344/regex-2026.2.28-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e71dcecaa113eebcc96622c17692672c2d104b1d71ddf7adeda90da7ddeb26fc", size = 796765, upload-time = "2026-02-28T02:16:55.905Z" }, - { url = "https://files.pythonhosted.org/packages/e9/5d/57702597627fc23278ebf36fbb497ac91c0ce7fec89ac6c81e420ca3e38c/regex-2026.2.28-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:481df4623fa4969c8b11f3433ed7d5e3dc9cec0f008356c3212b3933fb77e3d8", size = 863093, upload-time = "2026-02-28T02:16:58.094Z" }, - { url = "https://files.pythonhosted.org/packages/02/6d/f3ecad537ca2811b4d26b54ca848cf70e04fcfc138667c146a9f3157779c/regex-2026.2.28-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64e7c6ad614573e0640f271e811a408d79a9e1fe62a46adb602f598df42a818d", size = 909455, upload-time = "2026-02-28T02:17:00.918Z" }, - { url = "https://files.pythonhosted.org/packages/9e/40/bb226f203caa22c1043c1ca79b36340156eca0f6a6742b46c3bb222a3a57/regex-2026.2.28-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6b08a06976ff4fb0d83077022fde3eca06c55432bb997d8c0495b9a4e9872f4", size = 802037, upload-time = "2026-02-28T02:17:02.842Z" }, - { url = "https://files.pythonhosted.org/packages/44/7c/c6d91d8911ac6803b45ca968e8e500c46934e58c0903cbc6d760ee817a0a/regex-2026.2.28-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:864cdd1a2ef5716b0ab468af40139e62ede1b3a53386b375ec0786bb6783fc05", size = 775113, upload-time = "2026-02-28T02:17:04.506Z" }, - { url = "https://files.pythonhosted.org/packages/dc/8d/4a9368d168d47abd4158580b8c848709667b1cd293ff0c0c277279543bd0/regex-2026.2.28-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:511f7419f7afab475fd4d639d4aedfc54205bcb0800066753ef68a59f0f330b5", size = 784194, upload-time = "2026-02-28T02:17:06.888Z" }, - { url = "https://files.pythonhosted.org/packages/cc/bf/2c72ab5d8b7be462cb1651b5cc333da1d0068740342f350fcca3bca31947/regex-2026.2.28-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:b42f7466e32bf15a961cf09f35fa6323cc72e64d3d2c990b10de1274a5da0a59", size = 856846, upload-time = "2026-02-28T02:17:09.11Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f4/6b65c979bb6d09f51bb2d2a7bc85de73c01ec73335d7ddd202dcb8cd1c8f/regex-2026.2.28-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8710d61737b0c0ce6836b1da7109f20d495e49b3809f30e27e9560be67a257bf", size = 763516, upload-time = "2026-02-28T02:17:11.004Z" }, - { url = "https://files.pythonhosted.org/packages/8e/32/29ea5e27400ee86d2cc2b4e80aa059df04eaf78b4f0c18576ae077aeff68/regex-2026.2.28-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4390c365fd2d45278f45afd4673cb90f7285f5701607e3ad4274df08e36140ae", size = 849278, upload-time = "2026-02-28T02:17:12.693Z" }, - { url = "https://files.pythonhosted.org/packages/1d/91/3233d03b5f865111cd517e1c95ee8b43e8b428d61fa73764a80c9bb6f537/regex-2026.2.28-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cb3b1db8ff6c7b8bf838ab05583ea15230cb2f678e569ab0e3a24d1e8320940b", size = 790068, upload-time = "2026-02-28T02:17:14.9Z" }, - { url = "https://files.pythonhosted.org/packages/76/92/abc706c1fb03b4580a09645b206a3fc032f5a9f457bc1a8038ac555658ab/regex-2026.2.28-cp312-cp312-win32.whl", hash = "sha256:f8ed9a5d4612df9d4de15878f0bc6aa7a268afbe5af21a3fdd97fa19516e978c", size = 266416, upload-time = "2026-02-28T02:17:17.15Z" }, - { url = "https://files.pythonhosted.org/packages/fa/06/2a6f7dff190e5fa9df9fb4acf2fdf17a1aa0f7f54596cba8de608db56b3a/regex-2026.2.28-cp312-cp312-win_amd64.whl", hash = "sha256:01d65fd24206c8e1e97e2e31b286c59009636c022eb5d003f52760b0f42155d4", size = 277297, upload-time = "2026-02-28T02:17:18.723Z" }, - { url = "https://files.pythonhosted.org/packages/b7/f0/58a2484851fadf284458fdbd728f580d55c1abac059ae9f048c63b92f427/regex-2026.2.28-cp312-cp312-win_arm64.whl", hash = "sha256:c0b5ccbb8ffb433939d248707d4a8b31993cb76ab1a0187ca886bf50e96df952", size = 270408, upload-time = "2026-02-28T02:17:20.328Z" }, -] - -[[package]] -name = "requests" -version = "2.32.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "idna" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, -] - -[[package]] -name = "safetensors" -version = "0.6.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, - { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, - { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, - { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, - { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, - { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, - { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, - { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, - { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, - { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, - { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, - { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, - { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, - { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, -] - -[[package]] -name = "sentry-sdk" -version = "2.54.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c8/e9/2e3a46c304e7fa21eaa70612f60354e32699c7102eb961f67448e222ad7c/sentry_sdk-2.54.0.tar.gz", hash = "sha256:2620c2575128d009b11b20f7feb81e4e4e8ae08ec1d36cbc845705060b45cc1b", size = 413813, upload-time = "2026-03-02T15:12:41.355Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/53/39/be412cc86bc6247b8f69e9383d7950711bd86f8d0a4a4b0fe8fad685bc21/sentry_sdk-2.54.0-py2.py3-none-any.whl", hash = "sha256:fd74e0e281dcda63afff095d23ebcd6e97006102cdc8e78a29f19ecdf796a0de", size = 439198, upload-time = "2026-03-02T15:12:39.546Z" }, -] - -[[package]] -name = "setuptools" -version = "82.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/82/f3/748f4d6f65d1756b9ae577f329c951cda23fb900e4de9f70900ced962085/setuptools-82.0.0.tar.gz", hash = "sha256:22e0a2d69474c6ae4feb01951cb69d515ed23728cf96d05513d36e42b62b37cb", size = 1144893, upload-time = "2026-02-08T15:08:40.206Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" }, -] - -[[package]] -name = "sgl-kernel" -version = "0.3.17.post1" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/57/a2/d2b36e0b8a7b5d88117d8d96c4eb612fe3677069316d444479ff78c73547/sgl_kernel-0.3.17.post1-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:330057ad2d239e9363ee9abd85ed445ee1795161c60b7357f9792103121039cc", size = 341776329, upload-time = "2025-11-15T15:39:54.528Z" }, - { url = "https://files.pythonhosted.org/packages/10/8f/6286c74887c42ee4e888a6c36170ff394185e581fbecce2f1bf5c174b96e/sgl_kernel-0.3.17.post1-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:c864e6d6eebcd91e59a71ba781739761a21774f0cb862578381f54f504f93b4a", size = 511995347, upload-time = "2025-11-15T15:41:45.029Z" }, -] - -[[package]] -name = "six" -version = "1.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, -] - -[[package]] -name = "smmap" -version = "5.0.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, -] - -[[package]] -name = "ssd" -version = "0.2.0" -source = { editable = "." } -dependencies = [ - { name = "flashinfer-python" }, - { name = "hf-transfer" }, - { name = "numpy" }, - { name = "nvidia-cutlass-dsl" }, - { name = "safetensors" }, - { name = "sgl-kernel" }, - { name = "tiktoken" }, - { name = "torch" }, - { name = "tqdm" }, - { name = "transformers" }, - { name = "triton" }, - { name = "wandb" }, - { name = "xxhash" }, -] - -[package.optional-dependencies] -scripts = [ - { name = "datasets" }, - { name = "huggingface-hub" }, -] - -[package.metadata] -requires-dist = [ - { name = "datasets", marker = "extra == 'scripts'" }, - { name = "flashinfer-python", specifier = "==0.5.2" }, - { name = "hf-transfer" }, - { name = "huggingface-hub", marker = "extra == 'scripts'" }, - { name = "numpy", specifier = "==2.3.3" }, - { name = "nvidia-cutlass-dsl", specifier = "==4.2.1" }, - { name = "safetensors", specifier = "==0.6.2" }, - { name = "sgl-kernel", specifier = "==0.3.17.post1" }, - { name = "tiktoken" }, - { name = "torch", specifier = "==2.8.0" }, - { name = "tqdm", specifier = "==4.67.1" }, - { name = "transformers", specifier = "==4.57.1" }, - { name = "triton", specifier = "==3.4.0" }, - { name = "wandb", specifier = "==0.22.0" }, - { name = "xxhash", specifier = "==3.5.0" }, -] -provides-extras = ["scripts"] - -[[package]] -name = "sympy" -version = "1.14.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mpmath" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, -] - -[[package]] -name = "tabulate" -version = "0.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, -] - -[[package]] -name = "tiktoken" -version = "0.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "regex" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/de/46/21ea696b21f1d6d1efec8639c204bdf20fde8bafb351e1355c72c5d7de52/tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb", size = 1051565, upload-time = "2025-10-06T20:21:44.566Z" }, - { url = "https://files.pythonhosted.org/packages/c9/d9/35c5d2d9e22bb2a5f74ba48266fb56c63d76ae6f66e02feb628671c0283e/tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa", size = 995284, upload-time = "2025-10-06T20:21:45.622Z" }, - { url = "https://files.pythonhosted.org/packages/01/84/961106c37b8e49b9fdcf33fe007bb3a8fdcc380c528b20cc7fbba80578b8/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc", size = 1129201, upload-time = "2025-10-06T20:21:47.074Z" }, - { url = "https://files.pythonhosted.org/packages/6a/d0/3d9275198e067f8b65076a68894bb52fd253875f3644f0a321a720277b8a/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded", size = 1152444, upload-time = "2025-10-06T20:21:48.139Z" }, - { url = "https://files.pythonhosted.org/packages/78/db/a58e09687c1698a7c592e1038e01c206569b86a0377828d51635561f8ebf/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd", size = 1195080, upload-time = "2025-10-06T20:21:49.246Z" }, - { url = "https://files.pythonhosted.org/packages/9e/1b/a9e4d2bf91d515c0f74afc526fd773a812232dd6cda33ebea7f531202325/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967", size = 1255240, upload-time = "2025-10-06T20:21:50.274Z" }, - { url = "https://files.pythonhosted.org/packages/9d/15/963819345f1b1fb0809070a79e9dd96938d4ca41297367d471733e79c76c/tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def", size = 879422, upload-time = "2025-10-06T20:21:51.734Z" }, - { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" }, - { url = "https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" }, - { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" }, - { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665, upload-time = "2025-10-06T20:21:56.129Z" }, - { url = "https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230, upload-time = "2025-10-06T20:21:57.546Z" }, - { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688, upload-time = "2025-10-06T20:21:58.619Z" }, - { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694, upload-time = "2025-10-06T20:21:59.876Z" }, -] - -[[package]] -name = "tokenizers" -version = "0.22.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/92/97/5dbfabf04c7e348e655e907ed27913e03db0923abb5dfdd120d7b25630e1/tokenizers-0.22.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c", size = 3100275, upload-time = "2026-01-05T10:41:02.158Z" }, - { url = "https://files.pythonhosted.org/packages/2e/47/174dca0502ef88b28f1c9e06b73ce33500eedfac7a7692108aec220464e7/tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001", size = 2981472, upload-time = "2026-01-05T10:41:00.276Z" }, - { url = "https://files.pythonhosted.org/packages/d6/84/7990e799f1309a8b87af6b948f31edaa12a3ed22d11b352eaf4f4b2e5753/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7", size = 3290736, upload-time = "2026-01-05T10:40:32.165Z" }, - { url = "https://files.pythonhosted.org/packages/78/59/09d0d9ba94dcd5f4f1368d4858d24546b4bdc0231c2354aa31d6199f0399/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd", size = 3168835, upload-time = "2026-01-05T10:40:38.847Z" }, - { url = "https://files.pythonhosted.org/packages/47/50/b3ebb4243e7160bda8d34b731e54dd8ab8b133e50775872e7a434e524c28/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5", size = 3521673, upload-time = "2026-01-05T10:40:56.614Z" }, - { url = "https://files.pythonhosted.org/packages/e0/fa/89f4cb9e08df770b57adb96f8cbb7e22695a4cb6c2bd5f0c4f0ebcf33b66/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e", size = 3724818, upload-time = "2026-01-05T10:40:44.507Z" }, - { url = "https://files.pythonhosted.org/packages/64/04/ca2363f0bfbe3b3d36e95bf67e56a4c88c8e3362b658e616d1ac185d47f2/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b", size = 3379195, upload-time = "2026-01-05T10:40:51.139Z" }, - { url = "https://files.pythonhosted.org/packages/2e/76/932be4b50ef6ccedf9d3c6639b056a967a86258c6d9200643f01269211ca/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67", size = 3274982, upload-time = "2026-01-05T10:40:58.331Z" }, - { url = "https://files.pythonhosted.org/packages/1d/28/5f9f5a4cc211b69e89420980e483831bcc29dade307955cc9dc858a40f01/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4", size = 9478245, upload-time = "2026-01-05T10:41:04.053Z" }, - { url = "https://files.pythonhosted.org/packages/6c/fb/66e2da4704d6aadebf8cb39f1d6d1957df667ab24cff2326b77cda0dcb85/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a", size = 9560069, upload-time = "2026-01-05T10:45:10.673Z" }, - { url = "https://files.pythonhosted.org/packages/16/04/fed398b05caa87ce9b1a1bb5166645e38196081b225059a6edaff6440fac/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a", size = 9899263, upload-time = "2026-01-05T10:45:12.559Z" }, - { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" }, - { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload-time = "2026-01-05T10:45:20.593Z" }, - { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload-time = "2026-01-05T10:45:18.411Z" }, - { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" }, -] - -[[package]] -name = "torch" -version = "2.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "jinja2" }, - { name = "networkx" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "python_full_version >= '3.12'" }, - { name = "sympy" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/c4/3e7a3887eba14e815e614db70b3b529112d1513d9dae6f4d43e373360b7f/torch-2.8.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:220a06fd7af8b653c35d359dfe1aaf32f65aa85befa342629f716acb134b9710", size = 102073391, upload-time = "2025-08-06T14:53:20.937Z" }, - { url = "https://files.pythonhosted.org/packages/5a/63/4fdc45a0304536e75a5e1b1bbfb1b56dd0e2743c48ee83ca729f7ce44162/torch-2.8.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c12fa219f51a933d5f80eeb3a7a5d0cbe9168c0a14bbb4055f1979431660879b", size = 888063640, upload-time = "2025-08-06T14:55:05.325Z" }, - { url = "https://files.pythonhosted.org/packages/84/57/2f64161769610cf6b1c5ed782bd8a780e18a3c9d48931319f2887fa9d0b1/torch-2.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:8c7ef765e27551b2fbfc0f41bcf270e1292d9bf79f8e0724848b1682be6e80aa", size = 241366752, upload-time = "2025-08-06T14:53:38.692Z" }, - { url = "https://files.pythonhosted.org/packages/a4/5e/05a5c46085d9b97e928f3f037081d3d2b87fb4b4195030fc099aaec5effc/torch-2.8.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:5ae0524688fb6707c57a530c2325e13bb0090b745ba7b4a2cd6a3ce262572916", size = 73621174, upload-time = "2025-08-06T14:53:25.44Z" }, - { url = "https://files.pythonhosted.org/packages/49/0c/2fd4df0d83a495bb5e54dca4474c4ec5f9c62db185421563deeb5dabf609/torch-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e2fab4153768d433f8ed9279c8133a114a034a61e77a3a104dcdf54388838705", size = 101906089, upload-time = "2025-08-06T14:53:52.631Z" }, - { url = "https://files.pythonhosted.org/packages/99/a8/6acf48d48838fb8fe480597d98a0668c2beb02ee4755cc136de92a0a956f/torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b2aca0939fb7e4d842561febbd4ffda67a8e958ff725c1c27e244e85e982173c", size = 887913624, upload-time = "2025-08-06T14:56:44.33Z" }, - { url = "https://files.pythonhosted.org/packages/af/8a/5c87f08e3abd825c7dfecef5a0f1d9aa5df5dd0e3fd1fa2f490a8e512402/torch-2.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:2f4ac52f0130275d7517b03a33d2493bab3693c83dcfadf4f81688ea82147d2e", size = 241326087, upload-time = "2025-08-06T14:53:46.503Z" }, - { url = "https://files.pythonhosted.org/packages/be/66/5c9a321b325aaecb92d4d1855421e3a055abd77903b7dab6575ca07796db/torch-2.8.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:619c2869db3ada2c0105487ba21b5008defcc472d23f8b80ed91ac4a380283b0", size = 73630478, upload-time = "2025-08-06T14:53:57.144Z" }, -] - -[[package]] -name = "tqdm" -version = "4.67.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, -] - -[[package]] -name = "transformers" -version = "4.57.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, - { name = "huggingface-hub" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pyyaml" }, - { name = "regex" }, - { name = "requests" }, - { name = "safetensors" }, - { name = "tokenizers" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, -] - -[[package]] -name = "triton" -version = "3.4.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "setuptools" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/7d/39/43325b3b651d50187e591eefa22e236b2981afcebaefd4f2fc0ea99df191/triton-3.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b70f5e6a41e52e48cfc087436c8a28c17ff98db369447bcaff3b887a3ab4467", size = 155531138, upload-time = "2025-07-30T19:58:29.908Z" }, - { url = "https://files.pythonhosted.org/packages/d0/66/b1eb52839f563623d185f0927eb3530ee4d5ffe9d377cdaf5346b306689e/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c1d84a5c0ec2c0f8e8a072d7fd150cab84a9c239eaddc6706c081bfae4eb04", size = 155560068, upload-time = "2025-07-30T19:58:37.081Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.15.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, -] - -[[package]] -name = "typing-inspection" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, -] - -[[package]] -name = "tzdata" -version = "2025.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, -] - -[[package]] -name = "urllib3" -version = "2.6.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, -] - -[[package]] -name = "wandb" -version = "0.22.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "gitpython" }, - { name = "packaging" }, - { name = "platformdirs" }, - { name = "protobuf" }, - { name = "pydantic" }, - { name = "pyyaml" }, - { name = "requests" }, - { name = "sentry-sdk" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/93/37/0d4194707ceaa3168fa9ce54c1332bf15958bdbf67837f39cfac2e3b98bb/wandb-0.22.0.tar.gz", hash = "sha256:717e3d085f8f57dbde745c9ec6d605e51b2da51e47a7d2a7bfa82c9c6e3d3f5a", size = 40241826, upload-time = "2025-09-18T19:13:22.256Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/19/7d/8841e39e4f97a8777babad57b13856b5e24d6efe35ad75649c8da28472d9/wandb-0.22.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:8650a14615c23dcfc8cf393f88d41a879d6bfffb3c290a556aeb6ee62986c359", size = 18343096, upload-time = "2025-09-18T19:12:58.473Z" }, - { url = "https://files.pythonhosted.org/packages/c1/6e/0416fea679527b80109c083782ae2696a6c37ac45e7f8901c27b665ea94b/wandb-0.22.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:94ec449b3ed9516cad7008ab37c55b299d0036cdadfa83688b7245bd6ba04dd3", size = 19373158, upload-time = "2025-09-18T19:13:02.441Z" }, - { url = "https://files.pythonhosted.org/packages/db/58/48499272541eb21c3db2e28a0dc128270e8acb533a358944306210b1cb9e/wandb-0.22.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b2fe78b5f2d1ec7396f7925c7ac33f04ea0a62f07779cb654c45633d17dfc45", size = 18149252, upload-time = "2025-09-18T19:13:05.344Z" }, - { url = "https://files.pythonhosted.org/packages/06/c7/93a70c6f31ea127fd1c89800e6e733e172d9eaba6a33c9e08348503df78b/wandb-0.22.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44da9a83301d89c008f608832b74237f9e0a0758b2bb6d69ba51652818fffb5e", size = 19564075, upload-time = "2025-09-18T19:13:07.882Z" }, - { url = "https://files.pythonhosted.org/packages/c0/d8/910e4dee2dc2010d688087244d0502621105d5f314088af9265081c73079/wandb-0.22.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:21f05cc609c62c8ccba7c3338f9288d723c64d16ffd4fa70c02d6db60b42abae", size = 18188310, upload-time = "2025-09-18T19:13:10.321Z" }, - { url = "https://files.pythonhosted.org/packages/97/ac/2c09e536aca56d01b50207acc25aadbe0ee6ae8b825ec0f30c5ea7c1cd2f/wandb-0.22.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:884d37fb8d4daeb4d1f68ad8b5ea2817cabecc715efaff2f89bf006f2e977e37", size = 19658593, upload-time = "2025-09-18T19:13:13.812Z" }, - { url = "https://files.pythonhosted.org/packages/29/cb/d5f832adfd68f3a4700928e0cbdac78acb0f3182983a57a020cd1c5bab26/wandb-0.22.0-py3-none-win32.whl", hash = "sha256:60776fae528c3f64caf47a94dec08899c308f96fe974e0a82cefddb9a65e223c", size = 18742395, upload-time = "2025-09-18T19:13:16.496Z" }, - { url = "https://files.pythonhosted.org/packages/3c/c9/d9f0c7b8a743af589e694ce8fec8e6cffa46873179912d4ed4f992d08381/wandb-0.22.0-py3-none-win_amd64.whl", hash = "sha256:53ba0fa048b766c1aa44592f1e530fb7eead7749089a66c3892b35f153a8d8bd", size = 18742399, upload-time = "2025-09-18T19:13:19.26Z" }, -] - -[[package]] -name = "xxhash" -version = "3.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241, upload-time = "2024-08-17T09:20:38.972Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969, upload-time = "2024-08-17T09:18:00.852Z" }, - { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800, upload-time = "2024-08-17T09:18:01.863Z" }, - { url = "https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566, upload-time = "2024-08-17T09:18:03.461Z" }, - { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214, upload-time = "2024-08-17T09:18:05.616Z" }, - { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433, upload-time = "2024-08-17T09:18:06.957Z" }, - { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822, upload-time = "2024-08-17T09:18:08.331Z" }, - { url = "https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538, upload-time = "2024-08-17T09:18:10.332Z" }, - { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953, upload-time = "2024-08-17T09:18:11.707Z" }, - { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594, upload-time = "2024-08-17T09:18:13.799Z" }, - { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971, upload-time = "2024-08-17T09:18:15.824Z" }, - { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050, upload-time = "2024-08-17T09:18:17.142Z" }, - { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216, upload-time = "2024-08-17T09:18:18.779Z" }, - { url = "https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120, upload-time = "2024-08-17T09:18:20.009Z" }, - { url = "https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003, upload-time = "2024-08-17T09:18:21.052Z" }, - { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777, upload-time = "2024-08-17T09:18:22.809Z" }, - { url = "https://files.pythonhosted.org/packages/07/0e/1bfce2502c57d7e2e787600b31c83535af83746885aa1a5f153d8c8059d6/xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00", size = 31969, upload-time = "2024-08-17T09:18:24.025Z" }, - { url = "https://files.pythonhosted.org/packages/3f/d6/8ca450d6fe5b71ce521b4e5db69622383d039e2b253e9b2f24f93265b52c/xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9", size = 30787, upload-time = "2024-08-17T09:18:25.318Z" }, - { url = "https://files.pythonhosted.org/packages/5b/84/de7c89bc6ef63d750159086a6ada6416cc4349eab23f76ab870407178b93/xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84", size = 220959, upload-time = "2024-08-17T09:18:26.518Z" }, - { url = "https://files.pythonhosted.org/packages/fe/86/51258d3e8a8545ff26468c977101964c14d56a8a37f5835bc0082426c672/xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793", size = 200006, upload-time = "2024-08-17T09:18:27.905Z" }, - { url = "https://files.pythonhosted.org/packages/02/0a/96973bd325412feccf23cf3680fd2246aebf4b789122f938d5557c54a6b2/xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be", size = 428326, upload-time = "2024-08-17T09:18:29.335Z" }, - { url = "https://files.pythonhosted.org/packages/11/a7/81dba5010f7e733de88af9555725146fc133be97ce36533867f4c7e75066/xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6", size = 194380, upload-time = "2024-08-17T09:18:30.706Z" }, - { url = "https://files.pythonhosted.org/packages/fb/7d/f29006ab398a173f4501c0e4977ba288f1c621d878ec217b4ff516810c04/xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90", size = 207934, upload-time = "2024-08-17T09:18:32.133Z" }, - { url = "https://files.pythonhosted.org/packages/8a/6e/6e88b8f24612510e73d4d70d9b0c7dff62a2e78451b9f0d042a5462c8d03/xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27", size = 216301, upload-time = "2024-08-17T09:18:33.474Z" }, - { url = "https://files.pythonhosted.org/packages/af/51/7862f4fa4b75a25c3b4163c8a873f070532fe5f2d3f9b3fc869c8337a398/xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2", size = 203351, upload-time = "2024-08-17T09:18:34.889Z" }, - { url = "https://files.pythonhosted.org/packages/22/61/8d6a40f288f791cf79ed5bb113159abf0c81d6efb86e734334f698eb4c59/xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d", size = 210294, upload-time = "2024-08-17T09:18:36.355Z" }, - { url = "https://files.pythonhosted.org/packages/17/02/215c4698955762d45a8158117190261b2dbefe9ae7e5b906768c09d8bc74/xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab", size = 414674, upload-time = "2024-08-17T09:18:38.536Z" }, - { url = "https://files.pythonhosted.org/packages/31/5c/b7a8db8a3237cff3d535261325d95de509f6a8ae439a5a7a4ffcff478189/xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e", size = 192022, upload-time = "2024-08-17T09:18:40.138Z" }, - { url = "https://files.pythonhosted.org/packages/78/e3/dd76659b2811b3fd06892a8beb850e1996b63e9235af5a86ea348f053e9e/xxhash-3.5.0-cp312-cp312-win32.whl", hash = "sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8", size = 30170, upload-time = "2024-08-17T09:18:42.163Z" }, - { url = "https://files.pythonhosted.org/packages/d9/6b/1c443fe6cfeb4ad1dcf231cdec96eb94fb43d6498b4469ed8b51f8b59a37/xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e", size = 30040, upload-time = "2024-08-17T09:18:43.699Z" }, - { url = "https://files.pythonhosted.org/packages/0f/eb/04405305f290173acc0350eba6d2f1a794b57925df0398861a20fbafa415/xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2", size = 26796, upload-time = "2024-08-17T09:18:45.29Z" }, -] - -[[package]] -name = "yarl" -version = "1.23.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "idna" }, - { name = "multidict" }, - { name = "propcache" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/aa/60da938b8f0997ba3a911263c40d82b6f645a67902a490b46f3355e10fae/yarl-1.23.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b35d13d549077713e4414f927cdc388d62e543987c572baee613bf82f11a4b99", size = 123641, upload-time = "2026-03-01T22:04:42.841Z" }, - { url = "https://files.pythonhosted.org/packages/24/84/e237607faf4e099dbb8a4f511cfd5efcb5f75918baad200ff7380635631b/yarl-1.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbb0fef01f0c6b38cb0f39b1f78fc90b807e0e3c86a7ff3ce74ad77ce5c7880c", size = 86248, upload-time = "2026-03-01T22:04:44.757Z" }, - { url = "https://files.pythonhosted.org/packages/b2/0d/71ceabc14c146ba8ee3804ca7b3d42b1664c8440439de5214d366fec7d3a/yarl-1.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc52310451fc7c629e13c4e061cbe2dd01684d91f2f8ee2821b083c58bd72432", size = 85988, upload-time = "2026-03-01T22:04:46.365Z" }, - { url = "https://files.pythonhosted.org/packages/8c/6c/4a90d59c572e46b270ca132aca66954f1175abd691f74c1ef4c6711828e2/yarl-1.23.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b2c6b50c7b0464165472b56b42d4c76a7b864597007d9c085e8b63e185cf4a7a", size = 100566, upload-time = "2026-03-01T22:04:47.639Z" }, - { url = "https://files.pythonhosted.org/packages/49/fb/c438fb5108047e629f6282a371e6e91cf3f97ee087c4fb748a1f32ceef55/yarl-1.23.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aafe5dcfda86c8af00386d7781d4c2181b5011b7be3f2add5e99899ea925df05", size = 92079, upload-time = "2026-03-01T22:04:48.925Z" }, - { url = "https://files.pythonhosted.org/packages/d9/13/d269aa1aed3e4f50a5a103f96327210cc5fa5dd2d50882778f13c7a14606/yarl-1.23.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9ee33b875f0b390564c1fb7bc528abf18c8ee6073b201c6ae8524aca778e2d83", size = 108741, upload-time = "2026-03-01T22:04:50.838Z" }, - { url = "https://files.pythonhosted.org/packages/85/fb/115b16f22c37ea4437d323e472945bea97301c8ec6089868fa560abab590/yarl-1.23.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c41e021bc6d7affb3364dc1e1e5fa9582b470f283748784bd6ea0558f87f42c", size = 108099, upload-time = "2026-03-01T22:04:52.499Z" }, - { url = "https://files.pythonhosted.org/packages/9a/64/c53487d9f4968045b8afa51aed7ca44f58b2589e772f32745f3744476c82/yarl-1.23.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99c8a9ed30f4164bc4c14b37a90208836cbf50d4ce2a57c71d0f52c7fb4f7598", size = 102678, upload-time = "2026-03-01T22:04:55.176Z" }, - { url = "https://files.pythonhosted.org/packages/85/59/cd98e556fbb2bf8fab29c1a722f67ad45c5f3447cac798ab85620d1e70af/yarl-1.23.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2af5c81a1f124609d5f33507082fc3f739959d4719b56877ab1ee7e7b3d602b", size = 100803, upload-time = "2026-03-01T22:04:56.588Z" }, - { url = "https://files.pythonhosted.org/packages/9e/c0/b39770b56d4a9f0bb5f77e2f1763cd2d75cc2f6c0131e3b4c360348fcd65/yarl-1.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6b41389c19b07c760c7e427a3462e8ab83c4bb087d127f0e854c706ce1b9215c", size = 100163, upload-time = "2026-03-01T22:04:58.492Z" }, - { url = "https://files.pythonhosted.org/packages/e7/64/6980f99ab00e1f0ff67cb84766c93d595b067eed07439cfccfc8fb28c1a6/yarl-1.23.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:1dc702e42d0684f42d6519c8d581e49c96cefaaab16691f03566d30658ee8788", size = 93859, upload-time = "2026-03-01T22:05:00.268Z" }, - { url = "https://files.pythonhosted.org/packages/38/69/912e6c5e146793e5d4b5fe39ff5b00f4d22463dfd5a162bec565ac757673/yarl-1.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0e40111274f340d32ebcc0a5668d54d2b552a6cca84c9475859d364b380e3222", size = 108202, upload-time = "2026-03-01T22:05:02.273Z" }, - { url = "https://files.pythonhosted.org/packages/59/97/35ca6767524687ad64e5f5c31ad54bc76d585585a9fcb40f649e7e82ffed/yarl-1.23.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:4764a6a7588561a9aef92f65bda2c4fb58fe7c675c0883862e6df97559de0bfb", size = 99866, upload-time = "2026-03-01T22:05:03.597Z" }, - { url = "https://files.pythonhosted.org/packages/d3/1c/1a3387ee6d73589f6f2a220ae06f2984f6c20b40c734989b0a44f5987308/yarl-1.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:03214408cfa590df47728b84c679ae4ef00be2428e11630277be0727eba2d7cc", size = 107852, upload-time = "2026-03-01T22:05:04.986Z" }, - { url = "https://files.pythonhosted.org/packages/a4/b8/35c0750fcd5a3f781058bfd954515dd4b1eab45e218cbb85cf11132215f1/yarl-1.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:170e26584b060879e29fac213e4228ef063f39128723807a312e5c7fec28eff2", size = 102919, upload-time = "2026-03-01T22:05:06.397Z" }, - { url = "https://files.pythonhosted.org/packages/e5/1c/9a1979aec4a81896d597bcb2177827f2dbee3f5b7cc48b2d0dadb644b41d/yarl-1.23.0-cp311-cp311-win32.whl", hash = "sha256:51430653db848d258336cfa0244427b17d12db63d42603a55f0d4546f50f25b5", size = 82602, upload-time = "2026-03-01T22:05:08.444Z" }, - { url = "https://files.pythonhosted.org/packages/93/22/b85eca6fa2ad9491af48c973e4c8cf6b103a73dbb271fe3346949449fca0/yarl-1.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:bf49a3ae946a87083ef3a34c8f677ae4243f5b824bfc4c69672e72b3d6719d46", size = 87461, upload-time = "2026-03-01T22:05:10.145Z" }, - { url = "https://files.pythonhosted.org/packages/93/95/07e3553fe6f113e6864a20bdc53a78113cda3b9ced8784ee52a52c9f80d8/yarl-1.23.0-cp311-cp311-win_arm64.whl", hash = "sha256:b39cb32a6582750b6cc77bfb3c49c0f8760dc18dc96ec9fb55fbb0f04e08b928", size = 82336, upload-time = "2026-03-01T22:05:11.554Z" }, - { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = "2026-03-01T22:05:12.897Z" }, - { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" }, - { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" }, - { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587, upload-time = "2026-03-01T22:05:17.384Z" }, - { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528, upload-time = "2026-03-01T22:05:18.804Z" }, - { url = "https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339, upload-time = "2026-03-01T22:05:20.235Z" }, - { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061, upload-time = "2026-03-01T22:05:22.268Z" }, - { url = "https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132, upload-time = "2026-03-01T22:05:23.638Z" }, - { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289, upload-time = "2026-03-01T22:05:25.749Z" }, - { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950, upload-time = "2026-03-01T22:05:27.318Z" }, - { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960, upload-time = "2026-03-01T22:05:28.738Z" }, - { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703, upload-time = "2026-03-01T22:05:30.438Z" }, - { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325, upload-time = "2026-03-01T22:05:31.835Z" }, - { url = "https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067, upload-time = "2026-03-01T22:05:33.358Z" }, - { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285, upload-time = "2026-03-01T22:05:35.4Z" }, - { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359, upload-time = "2026-03-01T22:05:36.811Z" }, - { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674, upload-time = "2026-03-01T22:05:38.171Z" }, - { url = "https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879, upload-time = "2026-03-01T22:05:40.006Z" }, - { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" }, -]