diff --git a/.gitignore b/.gitignore
index 0ae1fb6a..06784f4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@ build-*/
 benchmark_results.csv
 __pycache__/
 *.pyc
+
+# Tuner artifacts (run_tuner.py)
+fusilli_tuning_*/
diff --git a/README.md b/README.md
index d0e46383..131ae3c0 100644
--- a/README.md
+++ b/README.md
@@ -278,6 +278,53 @@ python benchmarks/run_benchmark.py \
   -f commands.txt -o results.csv
 ```
 
+### Tuner
+
+The Fusilli tuner (`benchmarks/run_tuner.py`) generates optimized IREE tuning
+specs for Fusilli operations. It wraps the
+[IREE tuner](https://github.com/nod-ai/amd-shark-ai/tree/main/amdsharktuner) to automatically generate,
+compile, and benchmark tuning candidates.
+
+**AMDGPU only.** The tuner targets ROCm dispatches and requires a build
+configured with `-DFUSILLI_SYSTEMS_AMDGPU=ON`, an AMD GPU at runtime, and
+amdsharktuner installed from source. The PyPI release lags and isn't compatible
+with the IREE RC pinned in `version.json`, so install from GitHub directly:
+
+```shell
+pip install --pre \
+  "amdsharktuner @ git+https://github.com/nod-ai/amd-shark-ai.git@main#subdirectory=amdsharktuner" \
+  --find-links https://iree.dev/pip-release-links.html
+```
+
+**Single operation:**
+```shell
+python benchmarks/run_tuner.py \
+  --devices hip://0 \
+  --num-candidates 30 \
+  --output-td-spec tuning_spec.mlir \
+  --fusilli-args "matmul -M 1024 -N 1024 -K 1024 --a_type bf16 --b_type bf16 --out_type bf16"
+```
+
+**Multiple operations from file:**
+```shell
+python benchmarks/run_tuner.py \
+  --devices hip://0 \
+  --num-candidates 30 \
+  --output-td-spec tuning_spec.mlir \
+  --commands-file commands.txt
+```
+
+When tuning multiple commands, the best spec from each command is automatically
+chained as the starting spec for the next command. To start from an existing
+spec, use `--starter-td-spec <path>`.
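+
+For example, a `commands.txt` lists one Fusilli operation per line; blank
+lines and `#` comments are skipped (shapes below are illustrative):
+
+```
+# Operations to tune, one per line.
+conv -F 1 --bf16
+matmul -M 1024 -N 1024 -K 1024 --a_type bf16 --b_type bf16 --out_type bf16
+```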
+
+The generated tuning spec can then be used with the benchmark driver:
+```shell
+FUSILLI_EXTRA_COMPILER_FLAGS="--iree-codegen-tuning-spec-path=tuning_spec.mlir" \
+  build/bin/benchmarks/fusilli_benchmark_driver --iter 100 \
+  matmul -M 1024 -N 1024 -K 1024 --a_type bf16 --b_type bf16 --out_type bf16
+```
+
 ### Sanitizers
 
 Fusilli supports building with the following sanitizers:
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index fafd7ce0..401b9b1c 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -42,6 +42,33 @@ if(FUSILLI_SYSTEMS_AMDGPU)
       ENVIRONMENT "${FUSILLI_SANITIZER_TEST_ENV_VARS}"
     )
   endif()
+
+  # Test tuner runner (GPU integration tests)
+  add_test(
+    NAME fusilli_tuner_runner_tests
+    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/test_tuner_runner.sh
+      ${CMAKE_CURRENT_SOURCE_DIR}/run_tuner.py
+      $<TARGET_FILE:fusilli_benchmark_driver>
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+  )
+
+  # Configure sanitizer options
+  if(FUSILLI_SANITIZER_TEST_ENV_VARS)
+    set_tests_properties(
+      fusilli_tuner_runner_tests PROPERTIES
+      ENVIRONMENT "${FUSILLI_SANITIZER_TEST_ENV_VARS}"
+    )
+  endif()
+endif()
+
+# Tuner cache extraction unit tests (CPU-only, no GPU or amdsharktuner needed)
+if(FUSILLI_BUILD_TESTS)
+  add_test(
+    NAME fusilli_tuner_cache_tests
+    COMMAND python3 -m unittest
+      ${CMAKE_CURRENT_SOURCE_DIR}/test_tuner_cache.py -v
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  )
 endif()
 
 # Add some benchmark configurations for CI coverage.
diff --git a/benchmarks/run_tuner.py b/benchmarks/run_tuner.py
new file mode 100644
index 00000000..874bd4ce
--- /dev/null
+++ b/benchmarks/run_tuner.py
@@ -0,0 +1,610 @@
+#!/usr/bin/env python3
+# Copyright 2026 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""Fusilli Tuner — tune IREE kernels generated by Fusilli operations.
+
+AMDGPU only. Wraps amdsharktuner.libtuner to generate, compile, and benchmark
+tuning candidates for Fusilli operations on ROCm. Produces an MLIR tuning spec
+that can be passed to Fusilli via FUSILLI_EXTRA_COMPILER_FLAGS or
+--Xiree-compile.
+
+Requires:
+  - Fusilli built with -DFUSILLI_SYSTEMS_AMDGPU=ON
+  - An AMD GPU + ROCm runtime
+  - amdsharktuner from source (PyPI lags; install from GitHub):
+      pip install --pre \\
+        "amdsharktuner @ git+https://github.com/nod-ai/amd-shark-ai.git@main#subdirectory=amdsharktuner" \\
+        --find-links https://iree.dev/pip-release-links.html
+
+Usage:
+  # Single command:
+  python benchmarks/run_tuner.py \\
+    --devices hip://0 --num-candidates 30 \\
+    --fusilli-args "matmul -M 1024 -N 1024 -K 1024 --a_type bf16 --b_type bf16 --out_type bf16"
+
+  # Multiple commands from file:
+  python benchmarks/run_tuner.py \\
+    --devices hip://0 --num-candidates 30 \\
+    --commands-file commands.txt --output-td-spec tuning_spec.mlir
+"""
+
+import argparse
+import logging
+import os
+import shlex
+import shutil
+import subprocess
+import sys
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Iterator, Optional
+
+_HAS_LIBTUNER = True
+_LIBTUNER_IMPORT_ERROR: Optional[Exception] = None
+try:
+    from amdsharktuner import common, libtuner
+    from typing_extensions import override
+except Exception as exc:  # noqa: BLE001 - amdsharktuner can raise RuntimeError on version mismatch
+    _HAS_LIBTUNER = False
+    _LIBTUNER_IMPORT_ERROR = exc
+
+
+def _require_libtuner():
+    """Exit with a helpful message if amdsharktuner is not installed."""
+    if not _HAS_LIBTUNER:
+        print(
+            "ERROR: amdsharktuner is required but not installed.\n"
+            f"Import failed with: {_LIBTUNER_IMPORT_ERROR}\n"
+            "Install from GitHub (PyPI release lags):\n"
+            "  pip install --pre \\\n"
+            '    "amdsharktuner @ git+https://github.com/nod-ai/'
+            'amd-shark-ai.git@main#subdirectory=amdsharktuner" \\\n'
+            "    --find-links https://iree.dev/pip-release-links.html\n"
+            "See https://github.com/nod-ai/amd-shark-ai/tree/main/amdsharktuner for details.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+
+# ===----------------------------------------------------------------------=== #
+# Tuner classes (only defined when amdsharktuner is available)
+# ===----------------------------------------------------------------------=== #
+
+if _HAS_LIBTUNER:
+
+    class FusilliPathConfig(libtuner.PathConfig):
+        """Path configuration with timestamped Fusilli tuning directories."""
+
+        def _name_base_dir(self) -> Path:
+            timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M")
+            return Path(f"./fusilli_tuning_{timestamp}")
+
+        def create_benchmark_path_config(
+            self, benchmark_name: str
+        ) -> libtuner.PathConfig:
+            """Create a PathConfig for a specific benchmark under the main directory."""
+            base_dir = self.base_dir
+
+            class BenchmarkPathConfig(libtuner.PathConfig):
+                def _name_base_dir(self) -> Path:
+                    return base_dir / benchmark_name
+
+            return BenchmarkPathConfig()
+
+    class FusilliTuner(libtuner.TuningClient):
+        """Tuning client for IREE kernels generated by Fusilli."""
+
+        def __init__(self, tuner_context: common.TunerContext):
+            super().__init__(tuner_context)
+            self.compile_flags: list[str] = []
+            self.benchmark_flags: list[str] = []
+            # Per-candidate compile budget; covers the bulk of dispatch
+            # compiles while terminating runaway pathological candidates.
+            self.compile_timeout: Optional[float] = 16
+            self.benchmark_timeout: Optional[float] = None
+            self.auto_benchmark_timeout: bool = True
+
+        @override
+        def get_iree_compile_flags(self) -> list[str]:
+            return self.compile_flags
+
+        @override
+        def get_iree_compile_timeout_s(self) -> Optional[float]:
+            return self.compile_timeout
+
+        @override
+        def get_iree_benchmark_module_flags(self) -> list[str]:
+            return self.benchmark_flags
+
+        @override
+        def get_iree_benchmark_timeout_s(self) -> Optional[float]:
+            return self.benchmark_timeout
+
+        @override
+        def is_auto_iree_benchmark_timeout(self) -> bool:
+            return self.auto_benchmark_timeout
+
+        @override
+        def should_prune_slower_candidates(self) -> bool:
+            return True
+
+
+# ===----------------------------------------------------------------------=== #
+# Cache and command utilities
+# ===----------------------------------------------------------------------=== #
+
+
+def find_cached_artifacts(base_dir: Path) -> tuple[Path, Path]:
+    """Find source MLIR and compile command from Fusilli cache.
+
+    Fusilli cache structure (controlled by FUSILLI_CACHE_DIR):
+        base_dir/.cache/fusilli/<graph_hash>/*.mlir
+        base_dir/.cache/fusilli/<graph_hash>/*.txt
+
+    Returns:
+        Tuple of (source_mlir_path, compile_command_path).
+
+    Raises:
+        FileNotFoundError: If cache structure is missing or unexpected.
+    """
+    fusilli_cache = base_dir / ".cache" / "fusilli"
+
+    if not fusilli_cache.exists():
+        raise FileNotFoundError(f"Fusilli cache not found at {fusilli_cache}")
+
+    graph_dirs = list(fusilli_cache.iterdir())
+    if len(graph_dirs) != 1:
+        raise FileNotFoundError(
+            f"Expected exactly one graph directory in {fusilli_cache}, "
+            f"found {len(graph_dirs)}"
+        )
+
+    graph_dir = graph_dirs[0]
+
+    mlir_files = list(graph_dir.glob("*.mlir"))
+    txt_files = list(graph_dir.glob("*.txt"))
+
+    if len(mlir_files) != 1:
+        raise FileNotFoundError(
+            f"Expected exactly one .mlir file in {graph_dir}, "
+            f"found {len(mlir_files)}"
+        )
+    if len(txt_files) != 1:
+        raise FileNotFoundError(
+            f"Expected exactly one .txt file in {graph_dir}, found {len(txt_files)}"
+        )
+
+    source_mlir = mlir_files[0]
+    compile_cmd = txt_files[0]
+
+    # Symlink-escape guard: relative_to() raises ValueError if either
+    # resolved path lies outside base_dir.
+    source_mlir.resolve().relative_to(base_dir.resolve())
+    compile_cmd.resolve().relative_to(base_dir.resolve())
+
+    return source_mlir, compile_cmd
+
+
+def build_compile_args(compile_command: str, benchmarks_dir: Path) -> list[str]:
+    """Transform Fusilli's compile command into tuner-compatible iree-compile args.
+
+    Strips output flags and scheduling statistics flags, then appends
+    tuner-specific flags for dumping executable benchmarks.
+    """
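+    # For example (flags illustrative), a cached command such as
+    #   iree-compile graph.mlir --iree-hal-target-backends=rocm -o graph.vmfb
+    # becomes
+    #   iree-compile graph.mlir --iree-hal-target-backends=rocm
+    #     --iree-config-add-tuner-attributes
+    #     --iree-hal-dump-executable-benchmarks-to <benchmarks_dir> -o <devnull>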
+    tokens = shlex.split(compile_command)
+
+    compile_args: list[str] = ["iree-compile"]
+    args_iter: Iterator[str] = iter(tokens[1:])
+    for arg in args_iter:
+        # Skip "-o <path>" (Fusilli generates it as a separate arg + path).
+        if arg == "-o":
+            next(args_iter, None)
+            continue
+        # Skip scheduling statistics flags (Fusilli uses "=" syntax).
+        if arg.startswith(
+            (
+                "--iree-scheduling-dump-statistics-format=",
+                "--iree-scheduling-dump-statistics-file=",
+            )
+        ):
+            continue
+        compile_args.append(arg)
+
+    compile_args += [
+        "--iree-config-add-tuner-attributes",
+        "--iree-hal-dump-executable-benchmarks-to",
+        str(benchmarks_dir),
+        "-o",
+        os.devnull,
+    ]
+
+    return compile_args
+
+
+def load_commands(
+    commands_file: Optional[str], fusilli_op_args: list[str]
+) -> list[list[str]]:
+    """Load Fusilli commands from file or CLI args.
+
+    Caller must ensure exactly one of commands_file / fusilli_op_args is set;
+    main() validates this before invoking.
+
+    Returns a list of commands, each as a list of string tokens.
+    """
+    if not commands_file:
+        return [fusilli_op_args]
+
+    with open(commands_file) as f:
+        return [
+            shlex.split(line)
+            for line in f
+            if line.strip() and not line.strip().startswith("#")
+        ]
+ """ + args.input_file = benchmark_path + + if starter_td_spec and starter_td_spec.exists(): + args.starter_td_spec = starter_td_spec + else: + args.starter_td_spec = None + + logging.info("Generating candidate tuning specs...") + with common.TunerContext(logger=root_logger) as tuner_context: + tuner_context.logger.addHandler(summary_handler) + tuner = FusilliTuner(tuner_context) + + candidates = libtuner.generate_candidate_specs(args, path_config, tuner) + logging.info(f"Stored candidate specs in {path_config.specs_dir}") + + logging.info("Compiling dispatch candidates...") + tuner.compile_flags = ["--compile-from=executable-sources"] + compiled = libtuner.compile(args, path_config, candidates, tuner) + + logging.info("Benchmarking compiled dispatch candidates...") + tuner.benchmark_flags = ["--input=1", "--benchmark_repetitions=3"] + top_candidates = libtuner.benchmark( + args, + path_config, + compiled, + tuner, + args.fusilli_num_dispatch_candidates, + args.fusilli_dispatch_benchmark_timeout_mins, + ) + + if not top_candidates: + logging.warning("No candidates performed better than baseline.") + return None + + logging.info(f"Top dispatch candidates: {top_candidates}") + for cid in top_candidates: + logging.info(f" {tuner.candidate_trackers[cid].spec_path.resolve()}") + + best_id = top_candidates[0] + best_spec = tuner.candidate_trackers[best_id].spec_path + shutil.copy(best_spec, args.output_td_spec) + logging.info(f"Saved best tuning spec to: {args.output_td_spec}") + print(f"Saved best tuning spec to: {args.output_td_spec}") + + return args.output_td_spec + + +def process_fusilli_command( + cli_args: list[str], + args: argparse.Namespace, + fusilli_path_config: "FusilliPathConfig", + root_logger: logging.Logger, + starter_td_spec: Optional[Path], + command_idx: int, +) -> Optional[Path]: + """Process a single Fusilli command: dump MLIR, extract benchmarks, tune. + + Returns: + Path to the best tuning spec if tuning succeeded, None otherwise. + """ + # Create isolated temp directory for this command's cache. + if args.tmp_dir: + base_tmp = Path(args.tmp_dir) + base_tmp.mkdir(parents=True, exist_ok=True) + tmp_dir = Path(tempfile.mkdtemp(dir=base_tmp, prefix="fusilli_cache_")) + else: + Path("fusilli_tuner").mkdir(exist_ok=True) + tmp_dir = Path(tempfile.mkdtemp(dir="fusilli_tuner", prefix="fusilli_cache_")) + logging.info(f"Using temporary directory: {tmp_dir}") + + # Step 1: Generate MLIR artifacts via --dump. + run_fusilli_benchmark_driver(args.fusilli_driver, cli_args, tmp_dir) + + # Step 2: Extract cached artifacts. + source_mlir, compile_cmd_path = find_cached_artifacts(tmp_dir) + logging.debug(f"source_mlir: {source_mlir}") + + compile_command = compile_cmd_path.read_text().strip() + + # Step 3: Compile with tuner flags to extract executable benchmarks. 
+
+    # Step 3: Compile with tuner flags to extract executable benchmarks.
+    benchmarks_dir = tmp_dir / "benchmarks"
+    compile_args = build_compile_args(compile_command, benchmarks_dir)
+
+    logging.info(f"> {shlex.join(compile_args)}")
+    compile_result = subprocess.run(compile_args, capture_output=True, text=True)
+
+    if compile_result.returncode != 0:
+        logging.error(f"iree-compile failed with code {compile_result.returncode}")
+        if compile_result.stdout:
+            logging.error(f"stdout: {compile_result.stdout}")
+        if compile_result.stderr:
+            logging.error(f"stderr: {compile_result.stderr}")
+        raise RuntimeError(f"iree-compile failed with code {compile_result.returncode}")
+
+    if not benchmarks_dir.exists():
+        logging.warning(f"No benchmarks directory found at {benchmarks_dir}")
+        return None
+
+    # Step 4: Tune each generated benchmark dispatch.
+    # Sort for deterministic spec-chaining order across runs (os.listdir
+    # is filesystem-defined).
+    best_spec_path: Optional[Path] = None
+    dispatch_starter_td_spec = starter_td_spec
+    benchmark_files = sorted(os.listdir(benchmarks_dir))
+
+    for benchmark_file in benchmark_files:
+        benchmark_path = benchmarks_dir / benchmark_file
+        logging.info(f"Tuning benchmark: {benchmark_path}")
+
+        benchmark_name = benchmark_file.replace("_benchmark.mlir", "")
+        op_type = cli_args[0] if cli_args else "unknown"
+        unique_name = f"{benchmark_name}_{op_type}_{command_idx}"
+
+        benchmark_path_config = fusilli_path_config.create_benchmark_path_config(
+            unique_name
+        )
+        benchmark_path_config.base_dir.mkdir(parents=True, exist_ok=True)
+
+        summary_log = benchmark_path_config.base_dir / "summary.log"
+        summary_handler = logging.FileHandler(summary_log)
+        summary_handler.setLevel(logging.INFO)
+        summary_handler.setFormatter(
+            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+        )
+
+        try:
+            result = tune_fusilli_dispatch(
+                benchmark_path,
+                args,
+                benchmark_path_config,
+                root_logger,
+                summary_handler,
+                dispatch_starter_td_spec,
+            )
+            if result:
+                best_spec_path = result
+                dispatch_starter_td_spec = result
+
+            if benchmark_path_config.run_log is not None:
+                print(f"\nDetailed logs: {benchmark_path_config.run_log.resolve()}")
+            print(f"Summary: {summary_log.resolve()}")
+        except Exception:
+            logging.exception(f"Error tuning benchmark {benchmark_path}")
+            raise
+        finally:
+            root_logger.removeHandler(summary_handler)
+            summary_handler.close()
+
+    return args.output_td_spec if best_spec_path else None
+
+
+# ===----------------------------------------------------------------------=== #
+# CLI and entry point
+# ===----------------------------------------------------------------------=== #
+
+
+def insert_placeholder_input_file(argv: list[str]) -> list[str]:
+    """Insert a placeholder to satisfy libtuner's required input_file argument.
+
+    Fusilli generates files internally rather than from a pre-existing input
+    file. This placeholder will be overridden per-dispatch in tune_fusilli_dispatch.
+    """
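+    # For example (argv illustrative):
+    #   ["run_tuner.py", "--devices", "hip://0"]
+    #     -> ["run_tuner.py", "fusilli.mlir", "--devices", "hip://0"]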
+ """ + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + group = parser.add_argument_group("Fusilli Tuner Options") + group.add_argument( + "--fusilli-args", + type=str, + help='Fusilli operation command, e.g.: --fusilli-args="conv -F 1 --bf16 ..."', + ) + group.add_argument( + "--commands-file", + type=str, + help="File with Fusilli commands (one per line).", + ) + group.add_argument( + "--output-td-spec", + type=Path, + default=Path("tuning-spec.mlir"), + help="Output tuning spec file (default: tuning-spec.mlir).", + ) + group.add_argument( + "--tmp-dir", + type=str, + default="", + help="Temp directory for Fusilli cache. Auto-created if not specified.", + ) + + script_dir = Path(__file__).parent.absolute() + default_driver = ( + script_dir.parent / "build" / "bin" / "benchmarks" / "fusilli_benchmark_driver" + ) + group.add_argument( + "--fusilli-driver", + type=str, + default=str(default_driver), + help=f"Path to fusilli_benchmark_driver (default: {default_driver}).", + ) + group.add_argument( + "--fusilli-num-dispatch-candidates", + type=int, + default=None, + help="Limit top dispatch candidates to benchmark.", + ) + group.add_argument( + "--fusilli-dispatch-benchmark-timeout-mins", + type=float, + default=None, + help="Time budget in minutes per dispatch for benchmarking.", + ) + + if _HAS_LIBTUNER: + # Insert placeholder for libtuner's required input_file positional arg. + argv_with_placeholder = insert_placeholder_input_file(argv) + + original_argv = sys.argv + sys.argv = argv_with_placeholder + try: + args = libtuner.parse_arguments(parser) + finally: + sys.argv = original_argv + + if "--codegen-pipeline" not in argv_with_placeholder: + args.codegen_pipeline = libtuner.CodegenPipelines.llvmgpu_tile_and_fuse + else: + # Fallback: parse only fusilli-specific args (enough for --help). + args, _ = parser.parse_known_args(argv[1:]) + + fusilli_op_args = shlex.split(args.fusilli_args) if args.fusilli_args else [] + + return args, fusilli_op_args + + +def main() -> int: + """Main entry point for the Fusilli tuner.""" + args, fusilli_op_args = parse_args(sys.argv) + _require_libtuner() + + if args.commands_file and fusilli_op_args: + print( + "ERROR: Cannot specify both --commands-file and --fusilli-args", + file=sys.stderr, + ) + return 1 + if not args.commands_file and not fusilli_op_args: + print( + "ERROR: Must specify either --commands-file or --fusilli-args", + file=sys.stderr, + ) + return 1 + + fusilli_path_config = FusilliPathConfig() + fusilli_path_config.base_dir.mkdir(parents=True, exist_ok=True) + + root_logger = libtuner.setup_logging(args, fusilli_path_config) + print(fusilli_path_config.run_log) + + logging.warning("Fusilli Tuner is still experimental") + + if not args.dry_run: + logging.info("Validating devices") + libtuner.validate_devices(args.devices) + logging.info("Validation successful!") + + commands = load_commands(args.commands_file, fusilli_op_args) + + starter_td_spec: Optional[Path] = args.starter_td_spec + for idx, cli_args in enumerate(commands): + msg = f">>> ({idx + 1}/{len(commands)}) {shlex.join(cli_args)}" + logging.info(msg) + + result_spec = process_fusilli_command( + cli_args, + args, + fusilli_path_config, + root_logger, + starter_td_spec, + idx + 1, + ) + + # Chain: best spec from this command becomes starter for next. 
+        # Chain: best spec from this command becomes starter for next.
+        if result_spec:
+            starter_td_spec = result_spec
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/test_tuner_cache.py b/benchmarks/test_tuner_cache.py
new file mode 100644
index 00000000..755e05c9
--- /dev/null
+++ b/benchmarks/test_tuner_cache.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+# Copyright 2026 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+# Add benchmarks/ to path so we can import run_tuner.
+# The utility functions under test don't depend on amdsharktuner, and the
+# import guard in run_tuner.py defers the sys.exit to main().
+sys.path.insert(0, str(Path(__file__).parent))
+
+from run_tuner import build_compile_args, find_cached_artifacts, load_commands
+
+
+class TestFindCachedArtifacts(unittest.TestCase):
+    def test_finds_mlir_and_txt(self):
+        """Given a valid cache structure, returns paths to .mlir and .txt files."""
+        with tempfile.TemporaryDirectory() as tmp:
+            base = Path(tmp)
+            graph_dir = base / ".cache" / "fusilli" / "abc123"
+            graph_dir.mkdir(parents=True)
+            mlir_file = graph_dir / "iree-compile-input.mlir"
+            txt_file = graph_dir / "iree-compile-command.txt"
+            mlir_file.write_text("module {}")
+            txt_file.write_text("iree-compile input.mlir -o out.vmfb")
+
+            mlir_path, txt_path = find_cached_artifacts(base)
+            self.assertEqual(mlir_path, mlir_file)
+            self.assertEqual(txt_path, txt_file)
+
+    def test_raises_when_no_cache_dir(self):
+        """Raises FileNotFoundError when .cache/fusilli doesn't exist."""
+        with tempfile.TemporaryDirectory() as tmp:
+            with self.assertRaises(FileNotFoundError):
+                find_cached_artifacts(Path(tmp))
+
+    def test_raises_when_multiple_graph_dirs(self):
+        """Raises FileNotFoundError when multiple graph directories exist."""
+        with tempfile.TemporaryDirectory() as tmp:
+            base = Path(tmp)
+            cache = base / ".cache" / "fusilli"
+            (cache / "hash1").mkdir(parents=True)
+            (cache / "hash2").mkdir(parents=True)
+            with self.assertRaises(FileNotFoundError):
+                find_cached_artifacts(base)
+
+    def test_raises_when_no_mlir_file(self):
+        """Raises FileNotFoundError when no .mlir file exists."""
+        with tempfile.TemporaryDirectory() as tmp:
+            base = Path(tmp)
+            graph_dir = base / ".cache" / "fusilli" / "abc123"
+            graph_dir.mkdir(parents=True)
+            (graph_dir / "cmd.txt").write_text("iree-compile ...")
+            with self.assertRaises(FileNotFoundError):
+                find_cached_artifacts(base)
+
+
+class TestBuildCompileArgs(unittest.TestCase):
+    def test_strips_output_and_stats_flags(self):
+        """Filters -o, scheduling stats flags, and adds tuner flags."""
+        cmd = (
+            "iree-compile input.mlir "
+            "--iree-hal-target-backends=rocm "
+            "--iree-scheduling-dump-statistics-format=json "
+            "--iree-scheduling-dump-statistics-file=stats.json "
+            "-o output.vmfb"
+        )
+        result = build_compile_args(cmd, Path("/tmp/benchmarks"))
+
+        self.assertEqual(result[0], "iree-compile")
+        self.assertIn("--iree-hal-target-backends=rocm", result)
+        # Original "-o output.vmfb" should be stripped.
+        self.assertNotIn("output.vmfb", result)
+        self.assertNotIn("--iree-scheduling-dump-statistics-format=json", result)
+        # Tuner-specific flags should be appended.
+        self.assertIn("--iree-config-add-tuner-attributes", result)
+        self.assertIn("--iree-hal-dump-executable-benchmarks-to", result)
+        # Output redirected to platform null device.
+        idx = result.index("-o")
+        self.assertEqual(result[idx + 1], os.devnull)
+
+    def test_preserves_input_mlir(self):
+        """Keeps the input MLIR path from the original command."""
+        cmd = "iree-compile my_model.mlir --iree-hal-target-backends=rocm -o out.vmfb"
+        result = build_compile_args(cmd, Path("/tmp/bench"))
+        self.assertIn("my_model.mlir", result)
+
+
+class TestLoadCommands(unittest.TestCase):
+    def test_loads_from_args(self):
+        """When no file given, returns fusilli_op_args as single command."""
+        result = load_commands(None, ["conv", "-F", "1", "--bf16"])
+        self.assertEqual(result, [["conv", "-F", "1", "--bf16"]])
+
+    def test_loads_from_file(self):
+        """Reads commands from file, skipping comments and blank lines."""
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
+            f.write("# comment\n")
+            f.write("conv -F 1 --bf16\n")
+            f.write("\n")
+            f.write("matmul -M 1024 -N 1024 -K 1024\n")
+            f.flush()
+            tmp_path = f.name
+
+        try:
+            result = load_commands(tmp_path, [])
+            self.assertEqual(len(result), 2)
+            self.assertEqual(result[0], ["conv", "-F", "1", "--bf16"])
+            self.assertEqual(
+                result[1], ["matmul", "-M", "1024", "-N", "1024", "-K", "1024"]
+            )
+        finally:
+            os.unlink(tmp_path)
+
+    def test_prefers_file_when_both_given(self):
+        """When a file is given, fusilli_op_args is ignored (gating is in main)."""
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
+            f.write("conv -F 1 --bf16\n")
+            f.flush()
+            tmp_path = f.name
+
+        try:
+            result = load_commands(tmp_path, ["matmul", "-M", "16"])
+            self.assertEqual(result, [["conv", "-F", "1", "--bf16"]])
+        finally:
+            os.unlink(tmp_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/benchmarks/test_tuner_runner.sh b/benchmarks/test_tuner_runner.sh
new file mode 100755
index 00000000..02b52ed3
--- /dev/null
+++ b/benchmarks/test_tuner_runner.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Copyright 2026 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+set -euo pipefail
+set -x
+
+# Arguments from CMake
+TUNER_SCRIPT="$1"
+BENCHMARK_DRIVER="$2"
+TMP_FILES=()
+
+cleanup() {
+  rm -f "${TMP_FILES[@]}"
+}
+trap cleanup EXIT
+
+# This test is registered only under FUSILLI_SYSTEMS_AMDGPU; libtuner must be
+# importable. If it isn't, the environment is misconfigured (e.g., test.sh's
+# pip install failed) and we fail loudly rather than silently degrading.
+# Mirror the import path run_tuner.py uses, since the top-level package can
+# load successfully while these submodules raise on version mismatch with
+# iree-compiler.
+if ! python3 -c "from amdsharktuner import common, libtuner" >/dev/null 2>&1; then
+  echo "ERROR: amdsharktuner is not importable; cannot run tuner integration tests."
+  echo "       This test is gated on FUSILLI_SYSTEMS_AMDGPU=ON, which implies a"
+  echo "       fully configured tuner environment. Check that build_tools/scripts/test.sh"
+  echo "       ran the amdsharktuner install successfully."
+  python3 -c "from amdsharktuner import common, libtuner" || true
+  exit 1
+fi
+
+# Test 1: Verify --help works and reports the expected libtuner option groups.
+HELP_OUTPUT="$(mktemp)"
+TMP_FILES+=("${HELP_OUTPUT}")
+python3 "${TUNER_SCRIPT}" --help > "${HELP_OUTPUT}" 2>&1
+grep -q "Fusilli Tuner Options" "${HELP_OUTPUT}"
+grep -q "General Options" "${HELP_OUTPUT}"
+grep -q "Candidate Generation Options" "${HELP_OUTPUT}"
+echo "PASSED: run_tuner.py --help"
+
+# Cache extraction unit tests (pure-Python wrapper logic).
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+(cd "${SCRIPT_DIR}" && python3 -m unittest test_tuner_cache -v)
+echo "PASSED: cache extraction unit tests"
+
+# Test 2: Verify error on missing args
+MISSING_ARGS_OUTPUT="$(mktemp)"
+TMP_FILES+=("${MISSING_ARGS_OUTPUT}")
+if python3 "${TUNER_SCRIPT}" --devices hip://0 \
+    --fusilli-driver "${BENCHMARK_DRIVER}" >"${MISSING_ARGS_OUTPUT}" 2>&1; then
+  echo "ERROR: Expected failure when no --fusilli-args or --commands-file given"
+  exit 1
+fi
+grep -q "Must specify either --commands-file or --fusilli-args" "${MISSING_ARGS_OUTPUT}"
+echo "PASSED: run_tuner.py rejects missing args"
+
+# Test 3: Verify error on both args
+CONFLICTING_ARGS_OUTPUT="$(mktemp)"
+TMP_FILES+=("${CONFLICTING_ARGS_OUTPUT}")
+if python3 "${TUNER_SCRIPT}" \
+    --devices hip://0 \
+    --fusilli-driver "${BENCHMARK_DRIVER}" \
+    --fusilli-args "matmul -M 16 -N 16 -K 16 --a_type f32 --b_type f32 --out_type f32" \
+    --commands-file /dev/null >"${CONFLICTING_ARGS_OUTPUT}" 2>&1; then
+  echo "ERROR: Expected failure when both --fusilli-args and --commands-file given"
+  exit 1
+fi
+grep -q "Cannot specify both --commands-file and --fusilli-args" "${CONFLICTING_ARGS_OUTPUT}"
+echo "PASSED: run_tuner.py rejects conflicting args"
+
+echo "ALL TESTS PASSED"
diff --git a/build_tools/scripts/test.sh b/build_tools/scripts/test.sh
index 36e36a19..4244ad2b 100755
--- a/build_tools/scripts/test.sh
+++ b/build_tools/scripts/test.sh
@@ -84,6 +84,16 @@ while [[ $# -gt 0 ]]; do
   esac
 done
 
+# Install amdsharktuner for the AMDGPU-only tuner tests. PyPI lags the IREE
+# RC pinned in version.json, so install from GitHub. Tracks @main deliberately
+# (no manual pin-bump); pin a SHA here, in README.md, and in run_tuner.py if
+# upstream breaks CI.
+if grep -q "^FUSILLI_SYSTEMS_AMDGPU:BOOL=ON$" "${BUILD_DIR}/CMakeCache.txt" 2>/dev/null; then
+  pip install --pre \
+    "amdsharktuner @ git+https://github.com/nod-ai/amd-shark-ai.git@main#subdirectory=amdsharktuner" \
+    --find-links https://iree.dev/pip-release-links.html
+fi
+
 if [[ "${BACKEND}" == "cli" ]]; then
   export FUSILLI_COMPILE_BACKEND_USE_CLI=1
   echo "=== Fusilli test: backend=cli ==="