From 014881027da404e9975503aff8f955a9c7c75d3f Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:38:57 -0600 Subject: [PATCH 01/17] add the skill for running a docker container with correct options; build and run tests in the container --- .claude/skills/ck-docker | 309 ++++++++++++++++++++++++++++++++++++ .claude/skills/ck-docker.md | 76 +++++++++ 2 files changed, 385 insertions(+) create mode 100755 .claude/skills/ck-docker create mode 100644 .claude/skills/ck-docker.md diff --git a/.claude/skills/ck-docker b/.claude/skills/ck-docker new file mode 100755 index 00000000000..83250d8f111 --- /dev/null +++ b/.claude/skills/ck-docker @@ -0,0 +1,309 @@ +#!/bin/bash +# CK Docker Skill - Build and test composable_kernel in Docker with ROCm support + +set -e + +# Find project root (where .git directory is) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Detect git branch and sanitize for docker naming (replace / and special chars with _) +GIT_BRANCH=$(cd "${PROJECT_ROOT}" && git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '_' | tr -cd 'a-zA-Z0-9_-') + +# Default container name: ck__ +DEFAULT_NAME="ck_${USER}_${GIT_BRANCH}" +CONTAINER_NAME="${CK_CONTAINER_NAME:-${DEFAULT_NAME}}" + +# Help message +show_help() { + cat << EOF +CK Docker Skill - Build and test composable_kernel in Docker + +Usage: ck-docker [options] + +Commands: + start [name] Start Docker container + build [target] [--name] Build target + test [options] Run test + shell [name] Open shell in container + status [name] Check container status + stop [name] Stop and remove container + rebuild-cmake [name] Reconfigure CMake from scratch + +Examples: + ck-docker start + ck-docker build test_amdgcn_mma + ck-docker test test_amdgcn_mma --gtest_filter=*Fp16* + ck-docker shell + +Environment: + CK_CONTAINER_NAME - Override default container name (default: ck__) +EOF +} + +# Detect 
GPU target +detect_gpu() { + local container=$1 + docker exec ${container} bash -c " + rocminfo 2>/dev/null | grep -oP 'gfx[0-9a-z]+' | head -1 || echo 'gfx950' + " | tr -d '\r\n' +} + +# Start container +cmd_start() { + local name="${1:-${CONTAINER_NAME}}" + + if docker ps -a -f name=${name} | grep -q ${name}; then + if docker ps -f name=${name} | grep -q ${name}; then + echo "Container '${name}' is already running" + return 0 + else + echo "Starting existing container '${name}'..." + docker start ${name} + echo "Container started" + return 0 + fi + fi + + echo "Creating new Docker container '${name}'..." + docker run -d \ + --name ${name} \ + --device=/dev/kfd --device=/dev/dri \ + --security-opt seccomp=unconfined \ + --group-add video \ + -v "${PROJECT_ROOT}":/workspace \ + -w /workspace \ + rocm/composable_kernel:ck_ub24.04_rocm7.0.1 \ + tail -f /dev/null + + echo "Container '${name}' started successfully" + docker exec ${name} bash -c "echo 'Working directory:' && pwd" +} + +# Build target +cmd_build() { + local target="" + local name="${CONTAINER_NAME}" + + while [[ $# -gt 0 ]]; do + case $1 in + --name) + name="$2" + shift 2 + ;; + *) + target="$1" + shift + ;; + esac + done + + if ! docker ps -f name=${name} | grep -q ${name}; then + echo "Container '${name}' not running. Starting..." + cmd_start ${name} + fi + + if ! docker exec ${name} test -f /workspace/build/build.ninja 2>/dev/null; then + echo "Detecting GPU target..." + local gpu_target=$(detect_gpu ${name}) + + echo "Configuring build with CMake for GPU target: ${gpu_target}" + docker exec ${name} bash -c " + cd /workspace && + rm -rf build && + mkdir build && + cd build && + cmake .. -GNinja \ + -DGPU_TARGETS=${gpu_target} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DBUILD_TESTING=ON 2>&1 | tail -30 + " + fi + + if [ -z "$target" ]; then + echo "Building all configured targets..." 
+ else + echo "Building target: ${target}" + fi + + docker exec ${name} bash -c " + cd /workspace/build && + ninja ${target} 2>&1 + " + + echo "Build complete" +} + +# Run test +cmd_test() { + local test_name="" + local name="${CONTAINER_NAME}" + local test_options="" + + while [[ $# -gt 0 ]]; do + case $1 in + --name) + name="$2" + shift 2 + ;; + --gtest_*|--help) + test_options="${test_options} $1" + shift + ;; + *) + if [ -z "$test_name" ]; then + test_name="$1" + else + test_options="${test_options} $1" + fi + shift + ;; + esac + done + + if [ -z "$test_name" ]; then + echo "Error: test_name required" + echo "Usage: ck-docker test [--name container_name] [gtest_options]" + return 1 + fi + + if ! docker ps -f name=${name} | grep -q ${name}; then + echo "Error: Container '${name}' not running" + echo "Start it with: ck-docker start --name ${name}" + return 1 + fi + + if ! docker exec ${name} test -f "/workspace/build/bin/${test_name}" 2>/dev/null; then + echo "Test executable not found. Building ${test_name}..." + cmd_build ${test_name} --name ${name} + fi + + echo "Running: ${test_name} ${test_options}" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + docker exec ${name} bash -c " + cd /workspace/build && + ./bin/${test_name} ${test_options} + " +} + +# Shell +cmd_shell() { + local name="${1:-${CONTAINER_NAME}}" + + if ! docker ps -f name=${name} | grep -q ${name}; then + echo "Container '${name}' not running. Starting..." + cmd_start ${name} + fi + + echo "Opening shell in '${name}' (type 'exit' to leave)..." 
+ docker exec -it ${name} bash +} + +# Status +cmd_status() { + local name="${1:-}" + + if [ -z "$name" ]; then + echo "Composable Kernel Docker Containers:" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + docker ps -a --filter "ancestor=rocm/composable_kernel:ck_ub24.04_rocm7.0.1" \ + --format "table {{.Names}}\t{{.Status}}\t{{.CreatedAt}}" || echo "No containers found" + else + if docker ps -f name=${name} | grep -q ${name}; then + echo "Container '${name}' is RUNNING" + docker ps -f name=${name} --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" + echo "" + echo "GPU Information:" + docker exec ${name} bash -c "rocm-smi --showproductname 2>/dev/null | head -10 || echo 'No GPU detected'" + elif docker ps -a -f name=${name} | grep -q ${name}; then + echo "Container '${name}' exists but is STOPPED" + echo "Start with: ck-docker start ${name}" + else + echo "Container '${name}' does NOT exist" + echo "Create with: ck-docker start ${name}" + fi + fi +} + +# Stop +cmd_stop() { + local name="${1:-${CONTAINER_NAME}}" + + if docker ps -a -f name=${name} | grep -q ${name}; then + echo "Stopping and removing container '${name}'..." + docker stop ${name} 2>/dev/null || true + docker rm ${name} 2>/dev/null || true + echo "Container stopped and removed" + else + echo "Container '${name}' does not exist" + fi +} + +# Rebuild CMake +cmd_rebuild_cmake() { + local name="${1:-${CONTAINER_NAME}}" + + if ! docker ps -f name=${name} | grep -q ${name}; then + echo "Container '${name}' not running. Starting..." + cmd_start ${name} + fi + + echo "Detecting GPU target..." + local gpu_target=$(detect_gpu ${name}) + + echo "Reconfiguring CMake from scratch in '${name}' for GPU target: ${gpu_target}" + docker exec ${name} bash -c " + cd /workspace && + rm -rf build && + mkdir build && + cd build && + cmake .. 
-GNinja \ + -DGPU_TARGETS=${gpu_target} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DBUILD_TESTING=ON 2>&1 | tail -30 + " + echo "CMake configuration complete for ${gpu_target}" +} + +# Main command dispatcher +case "${1:-}" in + start) + shift + cmd_start "$@" + ;; + build) + shift + cmd_build "$@" + ;; + test) + shift + cmd_test "$@" + ;; + shell) + shift + cmd_shell "$@" + ;; + status) + shift + cmd_status "$@" + ;; + stop) + shift + cmd_stop "$@" + ;; + rebuild-cmake) + shift + cmd_rebuild_cmake "$@" + ;; + help|--help|-h) + show_help + ;; + *) + echo "Unknown command: ${1:-}" + echo "" + show_help + exit 1 + ;; +esac diff --git a/.claude/skills/ck-docker.md b/.claude/skills/ck-docker.md new file mode 100644 index 00000000000..8c9887a5ccd --- /dev/null +++ b/.claude/skills/ck-docker.md @@ -0,0 +1,76 @@ +# ck-docker + +Build and test composable_kernel in Docker with ROCm support. + +## Terminal Usage + +Direct command-line usage: + +```bash +# From composable_kernel directory +.claude/skills/ck-docker start +.claude/skills/ck-docker build test_amdgcn_mma +.claude/skills/ck-docker test test_amdgcn_mma --gtest_filter=*Fp16* +.claude/skills/ck-docker status +.claude/skills/ck-docker shell + +# Or add to PATH +export PATH="$PATH:$PWD/.claude/skills" +ck-docker start +``` + +## Ask Claude + +Just ask in natural language: +- "Start the docker container" +- "Build test_amdgcn_mma" +- "Run test_amdgcn_mma with filter *Fp16*" +- "Check container status" +- "Open a shell in the container" + +## Commands + +``` +ck-docker start [name] Start Docker container +ck-docker build [target] Build target +ck-docker test [options] Run test +ck-docker shell [name] Interactive shell +ck-docker status [name] Check status +ck-docker stop [name] Stop container +ck-docker rebuild-cmake [name] Reconfigure CMake +``` + +## Configuration + +- **Image**: rocm/composable_kernel:ck_ub24.04_rocm7.0.1 +- **GPU**: Auto-detected via rocminfo (fallback: 
gfx950) +- **Compiler**: /opt/rocm/llvm/bin/clang++ +- **Build**: Ninja + CMake (Release) +- **Mount**: Project root (repository checkout) → /workspace +- **Container Name**: Auto-generated as `ck_<user>_<branch>` to avoid clashes + +## Environment + +```bash +export CK_CONTAINER_NAME=my_build # Override default container name +``` + +## Examples + +```bash +# Start container +ck-docker start + +# Build and run test +ck-docker build test_amdgcn_mma +ck-docker test test_amdgcn_mma + +# Custom container +ck-docker start my_build +ck-docker build test_amdgcn_mma --name my_build +ck-docker test test_amdgcn_mma --name my_build + +# Debug +ck-docker shell +ck-docker status +``` From 5ba39269c7c9c398e71802bb39621a3bb6a14366 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:54:51 -0600 Subject: [PATCH 02/17] try to handle corner cases --- .claude/skills/ck-docker | 129 ++++++++++++++++++++++-------------- .claude/skills/ck-docker.md | 4 +- 2 files changed, 82 insertions(+), 51 deletions(-) diff --git a/.claude/skills/ck-docker b/.claude/skills/ck-docker index 83250d8f111..e884b47def8 100755 --- a/.claude/skills/ck-docker +++ b/.claude/skills/ck-docker @@ -2,16 +2,26 @@ # CK Docker Skill - Build and test composable_kernel in Docker with ROCm support set -e +set -o pipefail # Find project root (where .git directory is) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" # Detect git branch and sanitize for docker naming (replace / and special chars with _) -GIT_BRANCH=$(cd "${PROJECT_ROOT}" && git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '_' | tr -cd 'a-zA-Z0-9_-') +GIT_BRANCH=$(cd "${PROJECT_ROOT}" && git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '_' | tr -cd 'a-zA-Z0-9_-' || echo "") +# Handle edge cases: detached HEAD, empty branch name +GIT_BRANCH=${GIT_BRANCH:-unknown} +# If branch is just "HEAD" (detached state), make it more descriptive +if [ "${GIT_BRANCH}" = "HEAD" ]; then + GIT_BRANCH="detached" +fi + +# Ensure USER is set +USER_NAME=${USER:-$(whoami 2>/dev/null || echo "user")} # Default container name: ck__ -DEFAULT_NAME="ck_${USER}_${GIT_BRANCH}" +DEFAULT_NAME="ck_${USER_NAME}_${GIT_BRANCH}" CONTAINER_NAME="${CK_CONTAINER_NAME:-${DEFAULT_NAME}}" # Help message @@ -38,13 +48,20 @@ Examples: Environment: CK_CONTAINER_NAME - Override default container name (default: ck__) + CK_DOCKER_IMAGE - Override Docker image (default: rocm/composable_kernel:ck_ub24.04_rocm7.0.1) + GPU_TARGET - Override GPU target detection (e.g., gfx950, gfx942) EOF } # Detect GPU target detect_gpu() { local container=$1 - docker exec ${container} bash -c " + # Allow override via GPU_TARGET environment variable + if [ -n "${GPU_TARGET:-}" ]; then + echo "${GPU_TARGET}" + return 0 + fi + docker exec "${container}" bash -c " rocminfo 2>/dev/null | grep -oP 'gfx[0-9a-z]+' | head -1 || echo 'gfx950' " | tr -d '\r\n' } @@ -52,14 +69,17 @@ detect_gpu() { # Start container cmd_start() { local name="${1:-${CONTAINER_NAME}}" + local docker_image="${CK_DOCKER_IMAGE:-rocm/composable_kernel:ck_ub24.04_rocm7.0.1}" - if docker ps -a -f name=${name} | grep -q ${name}; then - if docker ps -f name=${name} | grep -q ${name}; then + # Check if container exists (exact match to avoid substring collisions) + if docker ps -a --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + # Check if container is running + if docker ps 
--filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Container '${name}' is already running" return 0 else echo "Starting existing container '${name}'..." - docker start ${name} + docker start "${name}" echo "Container started" return 0 fi @@ -67,17 +87,17 @@ cmd_start() { echo "Creating new Docker container '${name}'..." docker run -d \ - --name ${name} \ + --name "${name}" \ --device=/dev/kfd --device=/dev/dri \ --security-opt seccomp=unconfined \ --group-add video \ -v "${PROJECT_ROOT}":/workspace \ -w /workspace \ - rocm/composable_kernel:ck_ub24.04_rocm7.0.1 \ + "${docker_image}" \ tail -f /dev/null echo "Container '${name}' started successfully" - docker exec ${name} bash -c "echo 'Working directory:' && pwd" + docker exec "${name}" bash -c "echo 'Working directory:' && pwd" } # Build target @@ -98,21 +118,22 @@ cmd_build() { esac done - if ! docker ps -f name=${name} | grep -q ${name}; then + # Check if container is running (exact match) + if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Container '${name}' not running. Starting..." - cmd_start ${name} + cmd_start "${name}" fi - if ! docker exec ${name} test -f /workspace/build/build.ninja 2>/dev/null; then + if ! docker exec "${name}" test -f /workspace/build/build.ninja 2>/dev/null; then echo "Detecting GPU target..." - local gpu_target=$(detect_gpu ${name}) + local gpu_target=$(detect_gpu "${name}") echo "Configuring build with CMake for GPU target: ${gpu_target}" - docker exec ${name} bash -c " - cd /workspace && - rm -rf build && - mkdir build && - cd build && + docker exec "${name}" bash -c " + cd /workspace || exit 1 + rm -rf /workspace/build + mkdir /workspace/build + cd /workspace/build || exit 1 cmake .. 
-GNinja \ -DGPU_TARGETS=${gpu_target} \ -DCMAKE_BUILD_TYPE=Release \ @@ -127,8 +148,8 @@ cmd_build() { echo "Building target: ${target}" fi - docker exec ${name} bash -c " - cd /workspace/build && + docker exec "${name}" bash -c " + cd /workspace/build || exit 1 ninja ${target} 2>&1 " @@ -139,7 +160,7 @@ cmd_build() { cmd_test() { local test_name="" local name="${CONTAINER_NAME}" - local test_options="" + local -a test_options=() while [[ $# -gt 0 ]]; do case $1 in @@ -148,14 +169,14 @@ cmd_test() { shift 2 ;; --gtest_*|--help) - test_options="${test_options} $1" + test_options+=("$1") shift ;; *) if [ -z "$test_name" ]; then test_name="$1" else - test_options="${test_options} $1" + test_options+=("$1") fi shift ;; @@ -168,55 +189,61 @@ cmd_test() { return 1 fi - if ! docker ps -f name=${name} | grep -q ${name}; then + # Check if container is running (exact match) + if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Error: Container '${name}' not running" echo "Start it with: ck-docker start --name ${name}" return 1 fi - if ! docker exec ${name} test -f "/workspace/build/bin/${test_name}" 2>/dev/null; then + if ! docker exec "${name}" test -f "/workspace/build/bin/${test_name}" 2>/dev/null; then echo "Test executable not found. Building ${test_name}..." - cmd_build ${test_name} --name ${name} + cmd_build "${test_name}" --name "${name}" fi - echo "Running: ${test_name} ${test_options}" + echo "Running: ${test_name} ${test_options[*]}" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - docker exec ${name} bash -c " - cd /workspace/build && - ./bin/${test_name} ${test_options} - " + # Build the command with proper quoting + local cmd="cd /workspace/build && ./bin/${test_name}" + for opt in "${test_options[@]}"; do + cmd="${cmd} $(printf '%q' "$opt")" + done + docker exec "${name}" bash -c "${cmd}" } # Shell cmd_shell() { local name="${1:-${CONTAINER_NAME}}" - if ! 
docker ps -f name=${name} | grep -q ${name}; then + # Check if container is running (exact match) + if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Container '${name}' not running. Starting..." - cmd_start ${name} + cmd_start "${name}" fi echo "Opening shell in '${name}' (type 'exit' to leave)..." - docker exec -it ${name} bash + docker exec -it "${name}" bash } # Status cmd_status() { local name="${1:-}" + local docker_image="${CK_DOCKER_IMAGE:-rocm/composable_kernel:ck_ub24.04_rocm7.0.1}" if [ -z "$name" ]; then echo "Composable Kernel Docker Containers:" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - docker ps -a --filter "ancestor=rocm/composable_kernel:ck_ub24.04_rocm7.0.1" \ + docker ps -a --filter "ancestor=${docker_image}" \ --format "table {{.Names}}\t{{.Status}}\t{{.CreatedAt}}" || echo "No containers found" else - if docker ps -f name=${name} | grep -q ${name}; then + # Check if container is running (exact match) + if docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Container '${name}' is RUNNING" - docker ps -f name=${name} --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" + docker ps --filter "name=^${name}$" --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" echo "" echo "GPU Information:" - docker exec ${name} bash -c "rocm-smi --showproductname 2>/dev/null | head -10 || echo 'No GPU detected'" - elif docker ps -a -f name=${name} | grep -q ${name}; then + docker exec "${name}" bash -c "rocm-smi --showproductname 2>/dev/null | head -10 || echo 'No GPU detected'" + elif docker ps -a --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Container '${name}' exists but is STOPPED" echo "Start with: ck-docker start ${name}" else @@ -230,10 +257,11 @@ cmd_status() { cmd_stop() { local name="${1:-${CONTAINER_NAME}}" - if docker ps -a -f name=${name} | grep -q ${name}; then + # Check if container exists (exact match) + if docker 
ps -a --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Stopping and removing container '${name}'..." - docker stop ${name} 2>/dev/null || true - docker rm ${name} 2>/dev/null || true + docker stop "${name}" 2>/dev/null || true + docker rm "${name}" 2>/dev/null || true echo "Container stopped and removed" else echo "Container '${name}' does not exist" @@ -244,20 +272,21 @@ cmd_stop() { cmd_rebuild_cmake() { local name="${1:-${CONTAINER_NAME}}" - if ! docker ps -f name=${name} | grep -q ${name}; then + # Check if container is running (exact match) + if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then echo "Container '${name}' not running. Starting..." - cmd_start ${name} + cmd_start "${name}" fi echo "Detecting GPU target..." - local gpu_target=$(detect_gpu ${name}) + local gpu_target=$(detect_gpu "${name}") echo "Reconfiguring CMake from scratch in '${name}' for GPU target: ${gpu_target}" - docker exec ${name} bash -c " - cd /workspace && - rm -rf build && - mkdir build && - cd build && + docker exec "${name}" bash -c " + cd /workspace || exit 1 + rm -rf /workspace/build + mkdir /workspace/build + cd /workspace/build || exit 1 cmake .. 
-GNinja \ -DGPU_TARGETS=${gpu_target} \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/.claude/skills/ck-docker.md b/.claude/skills/ck-docker.md index 8c9887a5ccd..c95ee10beda 100644 --- a/.claude/skills/ck-docker.md +++ b/.claude/skills/ck-docker.md @@ -52,7 +52,9 @@ ck-docker rebuild-cmake [name] Reconfigure CMake ## Environment ```bash -export CK_CONTAINER_NAME=my_build # Override default container name +export CK_CONTAINER_NAME=my_build # Override default container name +export CK_DOCKER_IMAGE=rocm/composable_kernel:ck_ub24.04_rocm7.0.1 # Override Docker image +export GPU_TARGET=gfx942 # Override GPU target detection ``` ## Examples From ba65875e4dc7bc73de2762d6db4708605cbea235 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 19:49:00 -0600 Subject: [PATCH 03/17] combine build and rebuild --- .claude/skills/ck-docker | 61 ++++++++++++------------------------- .claude/skills/ck-docker.md | 16 +++++----- 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/.claude/skills/ck-docker b/.claude/skills/ck-docker index e884b47def8..1217f6ae1bb 100755 --- a/.claude/skills/ck-docker +++ b/.claude/skills/ck-docker @@ -32,17 +32,17 @@ CK Docker Skill - Build and test composable_kernel in Docker Usage: ck-docker [options] Commands: - start [name] Start Docker container - build [target] [--name] Build target - test [options] Run test - shell [name] Open shell in container - status [name] Check container status - stop [name] Stop and remove container - rebuild-cmake [name] Reconfigure CMake from scratch + start [name] Start Docker container + build [target] [--reconfigure] Build target (optionally reconfigure CMake) + test [options] Run test + shell [name] Open shell in container + status [name] Check container status + stop [name] Stop and remove container Examples: ck-docker start ck-docker build test_amdgcn_mma + ck-docker build --reconfigure test_amdgcn_mma ck-docker test test_amdgcn_mma 
--gtest_filter=*Fp16* ck-docker shell @@ -104,6 +104,7 @@ cmd_start() { cmd_build() { local target="" local name="${CONTAINER_NAME}" + local reconfigure=false while [[ $# -gt 0 ]]; do case $1 in @@ -111,6 +112,10 @@ cmd_build() { name="$2" shift 2 ;; + --reconfigure) + reconfigure=true + shift + ;; *) target="$1" shift @@ -124,11 +129,17 @@ cmd_build() { cmd_start "${name}" fi - if ! docker exec "${name}" test -f /workspace/build/build.ninja 2>/dev/null; then + # Reconfigure CMake if requested or if build.ninja doesn't exist + if [ "$reconfigure" = true ] || ! docker exec "${name}" test -f /workspace/build/build.ninja 2>/dev/null; then echo "Detecting GPU target..." local gpu_target=$(detect_gpu "${name}") - echo "Configuring build with CMake for GPU target: ${gpu_target}" + if [ "$reconfigure" = true ]; then + echo "Reconfiguring CMake from scratch for GPU target: ${gpu_target}" + else + echo "Configuring build with CMake for GPU target: ${gpu_target}" + fi + docker exec "${name}" bash -c " cd /workspace || exit 1 rm -rf /workspace/build @@ -268,34 +279,6 @@ cmd_stop() { fi } -# Rebuild CMake -cmd_rebuild_cmake() { - local name="${1:-${CONTAINER_NAME}}" - - # Check if container is running (exact match) - if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then - echo "Container '${name}' not running. Starting..." - cmd_start "${name}" - fi - - echo "Detecting GPU target..." - local gpu_target=$(detect_gpu "${name}") - - echo "Reconfiguring CMake from scratch in '${name}' for GPU target: ${gpu_target}" - docker exec "${name}" bash -c " - cd /workspace || exit 1 - rm -rf /workspace/build - mkdir /workspace/build - cd /workspace/build || exit 1 - cmake .. 
-GNinja \ - -DGPU_TARGETS=${gpu_target} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ - -DBUILD_TESTING=ON 2>&1 | tail -30 - " - echo "CMake configuration complete for ${gpu_target}" -} - # Main command dispatcher case "${1:-}" in start) @@ -322,10 +305,6 @@ case "${1:-}" in shift cmd_stop "$@" ;; - rebuild-cmake) - shift - cmd_rebuild_cmake "$@" - ;; help|--help|-h) show_help ;; diff --git a/.claude/skills/ck-docker.md b/.claude/skills/ck-docker.md index c95ee10beda..f31022e0bda 100644 --- a/.claude/skills/ck-docker.md +++ b/.claude/skills/ck-docker.md @@ -31,13 +31,12 @@ Just ask in natural language: ## Commands ``` -ck-docker start [name] Start Docker container -ck-docker build [target] Build target -ck-docker test [options] Run test -ck-docker shell [name] Interactive shell -ck-docker status [name] Check status -ck-docker stop [name] Stop container -ck-docker rebuild-cmake [name] Reconfigure CMake +ck-docker start [name] Start Docker container +ck-docker build [target] [--reconfigure] Build target (optionally reconfigure CMake) +ck-docker test [options] Run test +ck-docker shell [name] Interactive shell +ck-docker status [name] Check status +ck-docker stop [name] Stop container ``` ## Configuration @@ -67,6 +66,9 @@ ck-docker start ck-docker build test_amdgcn_mma ck-docker test test_amdgcn_mma +# Force clean CMake reconfiguration and build +ck-docker build --reconfigure test_amdgcn_mma + # Custom container ck-docker start my_build ck-docker build test_amdgcn_mma --name my_build From 0fc7bfefbdab2356020550692bc497224f0c807e Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:13:44 -0600 Subject: [PATCH 04/17] Add ck-build-analysis skill for compilation profiling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add automated build time analysis using Clang's -ftime-trace feature to identify template instantiation 
bottlenecks. Features: - Configurable granularity (500µs, 100µs, 1µs) - Comprehensive markdown reports with statistics - Template family analysis and optimization recommendations - Integration with ck-docker for containerized builds Testing shows default 500µs granularity filters out 86% of template instantiations. Using 100µs captures 2.7x more data while keeping trace files manageable at ~11MB. Key findings on example_convnd_fwd_xdl_fp8: - Template instantiation: 26.6% of compilation time - TensorDescriptor: 2,297 instantiations (18.5% of time) - run_grouped_conv_fwd: Only 3 instantiations but 583ms average Co-Authored-By: Claude --- .claude/skills/ck-build-analysis | 376 ++++++++++++++++++++++++++++ .claude/skills/ck-build-analysis.md | 112 +++++++++ 2 files changed, 488 insertions(+) create mode 100755 .claude/skills/ck-build-analysis create mode 100644 .claude/skills/ck-build-analysis.md diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis new file mode 100755 index 00000000000..1543705a517 --- /dev/null +++ b/.claude/skills/ck-build-analysis @@ -0,0 +1,376 @@ +#!/bin/bash +# CK Build Analysis Skill - Analyze build times using -ftime-trace + +set -e +set -o pipefail + +# Find project root +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" + +# Detect git branch and sanitize for docker naming +GIT_BRANCH=$(cd "${PROJECT_ROOT}" && git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '_' | tr -cd 'a-zA-Z0-9_-' || echo "") +GIT_BRANCH=${GIT_BRANCH:-unknown} +if [ "${GIT_BRANCH}" = "HEAD" ]; then + GIT_BRANCH="detached" +fi + +# Ensure USER is set +USER_NAME=${USER:-$(whoami 2>/dev/null || echo "user")} + +# Default container name +DEFAULT_NAME="ck_${USER_NAME}_${GIT_BRANCH}" +CONTAINER_NAME="${CK_CONTAINER_NAME:-${DEFAULT_NAME}}" + +# Default settings +GRANULARITY="${CK_BUILD_ANALYSIS_GRANULARITY:-500}" +OUTPUT_FILE="build_time_analysis_report.md" +RECONFIGURE=true + +# Help message +show_help() { + cat << EOF +CK Build Analysis - Analyze build times using Clang -ftime-trace + +Usage: ck-build-analysis [options] + +Arguments: + target Build target to analyze (e.g., example_convnd_fwd_xdl_fp8) + +Options: + --granularity=N Time trace granularity in microseconds (default: 500) + --output=FILE Output report filename (default: build_time_analysis_report.md) + --name=NAME Docker container name (default: ${CONTAINER_NAME}) + --no-reconfigure Skip CMake reconfiguration if build exists + --help Show this help message + +Examples: + ck-build-analysis example_convnd_fwd_xdl_fp8 + ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=1 + ck-build-analysis test_amdgcn_mma --granularity=100 --output=mma_test_analysis.md + +Granularity Guide: + 500 (default) - Quick overview, filters 86% of events (~5k instantiations, 3-5 MB) + 100 - Balanced detail (~15k instantiations, 15-20 MB) + 1 - Complete analysis (~36k instantiations, 80-100 MB) +EOF +} + +# Parse arguments +TARGET="" +while [[ $# -gt 0 ]]; do + case $1 in + --granularity=*) + GRANULARITY="${1#*=}" + shift + ;; + --output=*) + OUTPUT_FILE="${1#*=}" + shift + ;; + --name=*) + CONTAINER_NAME="${1#*=}" + shift + ;; + --no-reconfigure) + RECONFIGURE=false + shift + ;; + --help|-h) + show_help + exit 0 + ;; + -*) + echo "Unknown option: $1" + 
show_help + exit 1 + ;; + *) + if [ -z "$TARGET" ]; then + TARGET="$1" + else + echo "Error: Multiple targets specified" + show_help + exit 1 + fi + shift + ;; + esac +done + +if [ -z "$TARGET" ]; then + echo "Error: No target specified" + echo "" + show_help + exit 1 +fi + +echo "═══════════════════════════════════════════════════════════════" +echo " CK Build Time Analysis" +echo "═══════════════════════════════════════════════════════════════" +echo "Target: $TARGET" +echo "Granularity: ${GRANULARITY}µs" +echo "Container: $CONTAINER_NAME" +echo "Output: $OUTPUT_FILE" +echo "═══════════════════════════════════════════════════════════════" +echo "" + +# Ensure container is running +if ! docker ps --filter "name=^${CONTAINER_NAME}$" --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "Container not running. Starting with ck-docker..." + "${SCRIPT_DIR}/ck-docker" start "${CONTAINER_NAME}" +fi + +# Configure CMake with -ftime-trace if needed +if [ "$RECONFIGURE" = true ] || ! docker exec "${CONTAINER_NAME}" test -f /workspace/build/build.ninja 2>/dev/null; then + echo "" + echo "Configuring CMake with -ftime-trace (granularity=${GRANULARITY}µs)..." + + GPU_TARGET=$(docker exec "${CONTAINER_NAME}" bash -c "rocminfo 2>/dev/null | grep -oP 'gfx[0-9a-z]+' | head -1 || echo 'gfx950'" | tr -d '\r\n') + + docker exec "${CONTAINER_NAME}" bash -c " + cd /workspace || exit 1 + rm -rf /workspace/build + mkdir /workspace/build + cd /workspace/build || exit 1 + cmake .. 
-GNinja \ + -DGPU_TARGETS=${GPU_TARGET} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DCMAKE_CXX_FLAGS='-ftime-trace -ftime-trace-granularity=${GRANULARITY}' \ + -DCMAKE_HIP_FLAGS='-ftime-trace -ftime-trace-granularity=${GRANULARITY}' \ + -DBUILD_TESTING=ON 2>&1 | tail -20 + " + echo "CMake configuration complete" +fi + +# Build the target +echo "" +echo "Building target: $TARGET" +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + +BUILD_START=$(date +%s) +docker exec "${CONTAINER_NAME}" bash -c "cd /workspace/build && time ninja ${TARGET} 2>&1" +BUILD_END=$(date +%s) +BUILD_TIME=$((BUILD_END - BUILD_START)) + +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +echo "Build completed in ${BUILD_TIME} seconds" + +# Find the trace JSON file +echo "" +echo "Locating trace file..." +TRACE_FILE=$(docker exec "${CONTAINER_NAME}" bash -c "find /workspace/build -name '*.cpp.json' -o -name '*.hip.json' 2>/dev/null | grep -i '${TARGET}' | head -1") + +if [ -z "$TRACE_FILE" ]; then + echo "Error: Could not find trace file for target ${TARGET}" + echo "Expected pattern: build/**/${TARGET}*.json" + exit 1 +fi + +TRACE_SIZE=$(docker exec "${CONTAINER_NAME}" bash -c "ls -lh ${TRACE_FILE} | awk '{print \$5}'") +echo "Found trace file: ${TRACE_FILE} (${TRACE_SIZE})" + +# Generate analysis script +echo "" +echo "Generating analysis report..." 
+ +ANALYSIS_SCRIPT="/tmp/analyze_${TARGET}_$$.py" +cat > "${ANALYSIS_SCRIPT}" << 'PYSCRIPT' +#!/usr/bin/env python3 +import json +import re +import sys +from collections import defaultdict +from datetime import datetime + +if len(sys.argv) < 4: + print("Usage: analyze.py ") + sys.exit(1) + +trace_file = sys.argv[1] +output_file = sys.argv[2] +target = sys.argv[3] +granularity = sys.argv[4] +build_time = sys.argv[5] + +print(f'Loading trace file: {trace_file}') +with open(trace_file, 'r') as f: + data = json.load(f) + +print('Processing events...') +template_stats = defaultdict(lambda: {'count': 0, 'total_dur': 0.0}) +phase_stats = defaultdict(float) +top_individual = [] + +for event in data.get('traceEvents', []): + name = event.get('name', '') + dur = event.get('dur', 0) / 1000.0 + + if name and dur > 0: + phase_stats[name] += dur + + if name in ['InstantiateFunction', 'InstantiateClass']: + detail = event.get('args', {}).get('detail', '') + top_individual.append({'detail': detail, 'dur': dur, 'type': name}) + + match = re.match(r'^([^<(]+)', detail) + if match: + template_name = match.group(1).strip() + template_name = re.sub(r'^ck::', '', template_name) + template_name = re.sub(r'^std::', 'std::', template_name) + + template_stats[template_name]['count'] += 1 + template_stats[template_name]['total_dur'] += dur + +print('Sorting and generating report...') +sorted_templates = sorted(template_stats.items(), key=lambda x: x[1]['total_dur'], reverse=True) +sorted_phases = sorted(phase_stats.items(), key=lambda x: x[1], reverse=True) +top_individual.sort(key=lambda x: x['dur'], reverse=True) + +total_template_time = sum(s['total_dur'] for s in template_stats.values()) +total_trace_time = sum(phase_stats.values()) +total_events = len(data.get('traceEvents', [])) +total_inst = sum(s['count'] for s in template_stats.values()) + +report = [] +report.append('# Composable Kernel Build Time Analysis Report') +report.append('') +report.append(f'**Generated:** 
{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') +report.append(f'**Target:** {target}') +report.append(f'**Granularity:** {granularity}µs') +report.append('') +report.append('## Executive Summary') +report.append('') +report.append(f'- **Wall Clock Time:** {build_time} seconds') +report.append(f'- **Trace Time:** {total_trace_time/1000:.1f} seconds') +report.append(f'- **Template Instantiation Time:** {total_template_time/1000:.1f} seconds ({100*total_template_time/total_trace_time:.1f}% of trace)') +report.append(f'- **Total Events Captured:** {total_events:,}') +report.append(f'- **Total Template Instantiations:** {total_inst:,}') +report.append(f'- **Unique Template Families:** {len(sorted_templates)}') +report.append('') +report.append('## Compilation Phase Breakdown') +report.append('') +report.append('| Phase | Time (ms) | Time (s) | % of Total |') +report.append('|-------|-----------|----------|------------|') +for phase, dur in sorted_phases[:20]: + pct = 100 * dur / total_trace_time + report.append(f'| {phase:<40} | {dur:>9.2f} | {dur/1000:>8.2f} | {pct:>9.1f}% |') +report.append('') +report.append('## Top 30 Most Expensive Individual Instantiations') +report.append('') +report.append('| Rank | Template | Type | Time (ms) |') +report.append('|------|----------|------|-----------|') +for i, inst in enumerate(top_individual[:30], 1): + detail = inst['detail'][:70] + '...' 
if len(inst['detail']) > 70 else inst['detail'] + inst_type = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class' + report.append(f'| {i:>4} | {detail:<70} | {inst_type:<5} | {inst["dur"]:>9.2f} |') +report.append('') +report.append('## Template Families by Total Time (Top 50)') +report.append('') +report.append('| Rank | Template Family | Count | Total (ms) | Avg (ms) | % of Total |') +report.append('|------|-----------------|-------|------------|----------|------------|') +for i, (name, stats) in enumerate(sorted_templates[:50], 1): + count = stats['count'] + total = stats['total_dur'] + avg = total / count if count > 0 else 0 + pct = 100 * total / total_template_time if total_template_time > 0 else 0 + display_name = name[:40] + '...' if len(name) > 40 else name + report.append(f'| {i:>4} | {display_name:<43} | {count:>5} | {total:>10.2f} | {avg:>8.2f} | {pct:>9.1f}% |') +report.append('') +report.append('## Template Families by Instantiation Count (Top 50)') +report.append('') +sorted_by_count = sorted(template_stats.items(), key=lambda x: x[1]['count'], reverse=True) +report.append('| Rank | Template Family | Count | Total (ms) | Avg (ms) |') +report.append('|------|-----------------|-------|------------|----------|') +for i, (name, stats) in enumerate(sorted_by_count[:50], 1): + count = stats['count'] + total = stats['total_dur'] + avg = total / count if count > 0 else 0 + display_name = name[:40] + '...' if len(name) > 40 else name + report.append(f'| {i:>4} | {display_name:<43} | {count:>5} | {total:>10.2f} | {avg:>8.2f} |') +report.append('') +report.append('## Key Insights') +report.append('') +report.append('### 1. 
Template Instantiation Impact') +report.append(f'- Template instantiation accounts for {100*total_template_time/total_trace_time:.1f}% of total trace time') +if len(sorted_templates) >= 10: + top10_pct = 100*sum(s[1]["total_dur"] for s in sorted_templates[:10])/total_template_time + report.append(f'- Top 10 template families account for {top10_pct:.1f}% of instantiation time') +report.append('') +report.append('### 2. Most Expensive Templates') +if len(sorted_templates) > 0: + report.append(f'- **{sorted_templates[0][0]}**: {sorted_templates[0][1]["count"]:,} instantiations, {sorted_templates[0][1]["total_dur"]/1000:.2f}s total') +if len(sorted_templates) > 1: + avg = sorted_templates[1][1]["total_dur"] / sorted_templates[1][1]["count"] + report.append(f'- **{sorted_templates[1][0]}**: {sorted_templates[1][1]["count"]:,} instantiations, {avg:.2f}ms average') +report.append('') +report.append('## Optimization Recommendations') +report.append('') +report.append('### Short Term') +report.append('1. **Focus on High-Impact Templates**: Address top 10 families first') +report.append('2. **Explicit Template Instantiation**: Pre-instantiate common configurations') +report.append('3. **Extern Templates**: Mark frequently-used templates as extern in headers') +report.append('') +report.append('### Medium Term') +report.append('1. **Precompiled Headers**: Include heavy templates in PCH') +report.append('2. **Template Specialization**: Replace general templates with specialized versions') +report.append('3. **Template Depth Reduction**: Simplify template hierarchies') +report.append('') +report.append('### Long Term') +report.append('1. **Architectural Review**: Evaluate necessity of deep template metaprogramming') +report.append('2. **C++20 Concepts**: Earlier constraint checking, fewer instantiations') +report.append('3. 
**Build Caching**: Distributed build cache for template instantiations') +report.append('') +report.append('## Detailed Statistics') +report.append('') +report.append(f'- **Total Unique Templates:** {len(sorted_templates)}') +report.append(f'- **Total Instantiations:** {total_inst:,}') +if total_inst > 0: + report.append(f'- **Average Instantiation Time:** {total_template_time/total_inst:.3f}ms') +if len(template_stats) > 0: + median_count = sorted([s["count"] for s in template_stats.values()])[len(template_stats)//2] + report.append(f'- **Median Template Family Count:** {median_count}') +report.append('') +report.append('---') +report.append('') +report.append(f'*Report generated using Clang -ftime-trace with {granularity}µs granularity*') +report.append(f'*Analysis tool: ck-build-analysis*') + +with open(output_file, 'w') as f: + f.write('\n'.join(report)) + +print(f'Report generated: {output_file}') +print(f'Total lines: {len(report)}') +PYSCRIPT + +# Copy analysis script to container and run it +docker cp "${ANALYSIS_SCRIPT}" "${CONTAINER_NAME}:/tmp/analyze.py" + +docker exec "${CONTAINER_NAME}" python3 /tmp/analyze.py \ + "${TRACE_FILE}" \ + "/workspace/${OUTPUT_FILE}" \ + "${TARGET}" \ + "${GRANULARITY}" \ + "${BUILD_TIME}" + +# Copy report back to host +docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPUT_FILE}" + +# Cleanup +rm -f "${ANALYSIS_SCRIPT}" +docker exec "${CONTAINER_NAME}" rm -f /tmp/analyze.py + +echo "" +echo "═══════════════════════════════════════════════════════════════" +echo " Analysis Complete!" 
+echo "═══════════════════════════════════════════════════════════════" +echo "Report: ${PROJECT_ROOT}/${OUTPUT_FILE}" +echo "" +echo "Summary:" +docker exec "${CONTAINER_NAME}" bash -c "head -20 /workspace/${OUTPUT_FILE} | tail -10" +echo "" +echo "View the full report:" +echo " cat ${OUTPUT_FILE}" +echo " or open it in your editor" +echo "═══════════════════════════════════════════════════════════════" diff --git a/.claude/skills/ck-build-analysis.md b/.claude/skills/ck-build-analysis.md new file mode 100644 index 00000000000..131fbda999f --- /dev/null +++ b/.claude/skills/ck-build-analysis.md @@ -0,0 +1,112 @@ +# ck-build-analysis + +Analyze Composable Kernel build times using Clang's -ftime-trace profiler. + +## Terminal Usage + +Direct command-line usage: + +```bash +# From composable_kernel directory +.claude/skills/ck-build-analysis example_convnd_fwd_xdl_fp8 +.claude/skills/ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=1 +.claude/skills/ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=1 --output=my_report.md + +# Or add to PATH +export PATH="$PATH:$PWD/.claude/skills" +ck-build-analysis example_convnd_fwd_xdl_fp8 +``` + +## Ask Claude + +Just ask in natural language: +- "Analyze build time for example_convnd_fwd_xdl_fp8" +- "Profile the compilation of test_amdgcn_mma with 1µs granularity" +- "Generate a build time report for example_gemm_xdl" + +## Commands + +``` +ck-build-analysis [options] + +Options: + --granularity=N Time trace granularity in microseconds (default: 500) + --output=FILE Output report filename (default: build_time_analysis_report.md) + --name=NAME Docker container name (default: from CK_CONTAINER_NAME or auto-generated) + --no-reconfigure Skip CMake reconfiguration if build exists + --help Show this help message +``` + +## What It Does + +1. **Configures CMake** with `-ftime-trace` and custom granularity +2. **Builds the target** using Ninja in Docker +3. 
**Analyzes the trace** JSON file for template instantiation patterns +4. **Generates a report** with: + - Compilation phase breakdown + - Top 30 most expensive individual instantiations + - Template families ranked by total time and count + - Key insights and optimization recommendations + - Complete statistics + +## Configuration + +- **Container**: Uses ck-docker container (auto-starts if needed) +- **Granularity**: Default 500µs (use 1µs for high resolution, 100µs for medium resolution) +- **Output**: Markdown report in project root + +## Environment + +```bash +export CK_CONTAINER_NAME=my_build # Override container name +export CK_BUILD_ANALYSIS_GRANULARITY=1 # Override default granularity (µs) +``` + +## Examples + +```bash +# Basic analysis with default granularity (500µs) +ck-build-analysis example_convnd_fwd_xdl_fp8 + +# High-resolution analysis (1µs granularity, 22x larger trace) +ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=1 + +# Medium-resolution analysis (100µs granularity, good balance) +ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=100 + +# Custom output filename +ck-build-analysis example_convnd_fwd_xdl_fp8 --output=fp8_conv_analysis.md + +# Analyze test target +ck-build-analysis test_amdgcn_mma --granularity=1 + +# Use existing build (skip reconfigure) +ck-build-analysis example_convnd_fwd_xdl_fp8 --no-reconfigure +``` + +## Output + +The report includes: +- **Executive Summary**: Total time, events, instantiations, unique templates +- **Compilation Phases**: InstantiateFunction, Frontend, Backend, Optimizer, etc. 
+- **Top 30 Individual Instantiations**: Most expensive single template instantiations +- **Template Families**: Grouped by total time and instantiation count +- **Key Insights**: What's slow and why +- **Optimization Recommendations**: Short, medium, and long-term strategies +- **Detailed Statistics**: Totals, average instantiation time, median family count + +## Granularity Trade-offs + +| Granularity | Events | Trace Size | Use Case | +|-------------|--------|------------|----------| +| 500µs (default) | ~50k | 3-5 MB | Quick overview, major bottlenecks | +| 100µs | ~150k | 15-20 MB | Balanced detail and performance | +| 50µs | ~200k | 30-40 MB | Detailed analysis | +| 1µs (high-res) | ~300k | 80-100 MB | Complete picture, all instantiations | + +## Notes + +- Lower granularity value (finer resolution) = more events = larger files = longer analysis +- Default 500µs captures major bottlenecks (filters out 86% of instantiations) +- 1µs granularity reveals all 36,000+ instantiations but takes longer to analyze +- 100µs is a good middle ground for most use cases From fc53e81355da9bd142af42d44e768b1da88d39ae Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 22:41:51 -0600 Subject: [PATCH 05/17] Refactor report generation to use Jinja2 templates - Add Jinja2 template for report generation (.claude/skills/templates/build_analysis_report.md.jinja) - Refactor analysis script to use template rendering instead of string concatenation - Add custom Jinja2 filters for formatting (format_number, truncate, pad) - Separate presentation from logic for better maintainability - Template makes report format easier to modify and extend Requirements: - python3-jinja2 must be installed in Docker container (apt-get install python3-jinja2) Benefits: - Cleaner code with separation of concerns - Easier to customize report format - Better readability and maintainability Co-Authored-By: Claude --- .claude/skills/ck-build-analysis | 193 ++++++++---------- 
.../templates/build_analysis_report.md.jinja | 95 +++++++++ 2 files changed, 180 insertions(+), 108 deletions(-) create mode 100644 .claude/skills/templates/build_analysis_report.md.jinja diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis index 1543705a517..7113001772c 100755 --- a/.claude/skills/ck-build-analysis +++ b/.claude/skills/ck-build-analysis @@ -183,9 +183,10 @@ import re import sys from collections import defaultdict from datetime import datetime +from jinja2 import Environment, FileSystemLoader if len(sys.argv) < 4: - print("Usage: analyze.py ") + print("Usage: analyze.py ") sys.exit(1) trace_file = sys.argv[1] @@ -193,6 +194,7 @@ output_file = sys.argv[2] target = sys.argv[3] granularity = sys.argv[4] build_time = sys.argv[5] +template_dir = sys.argv[6] print(f'Loading trace file: {trace_file}') with open(trace_file, 'r') as f: @@ -223,8 +225,7 @@ for event in data.get('traceEvents', []): template_stats[template_name]['count'] += 1 template_stats[template_name]['total_dur'] += dur -print('Sorting and generating report...') -sorted_templates = sorted(template_stats.items(), key=lambda x: x[1]['total_dur'], reverse=True) +print('Sorting data...') sorted_phases = sorted(phase_stats.items(), key=lambda x: x[1], reverse=True) top_individual.sort(key=lambda x: x['dur'], reverse=True) @@ -233,126 +234,101 @@ total_trace_time = sum(phase_stats.values()) total_events = len(data.get('traceEvents', [])) total_inst = sum(s['count'] for s in template_stats.values()) -report = [] -report.append('# Composable Kernel Build Time Analysis Report') -report.append('') -report.append(f'**Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') -report.append(f'**Target:** {target}') -report.append(f'**Granularity:** {granularity}µs') -report.append('') -report.append('## Executive Summary') -report.append('') -report.append(f'- **Wall Clock Time:** {build_time} seconds') -report.append(f'- **Trace Time:** {total_trace_time/1000:.1f} 
seconds') -report.append(f'- **Template Instantiation Time:** {total_template_time/1000:.1f} seconds ({100*total_template_time/total_trace_time:.1f}% of trace)') -report.append(f'- **Total Events Captured:** {total_events:,}') -report.append(f'- **Total Template Instantiations:** {total_inst:,}') -report.append(f'- **Unique Template Families:** {len(sorted_templates)}') -report.append('') -report.append('## Compilation Phase Breakdown') -report.append('') -report.append('| Phase | Time (ms) | Time (s) | % of Total |') -report.append('|-------|-----------|----------|------------|') -for phase, dur in sorted_phases[:20]: - pct = 100 * dur / total_trace_time - report.append(f'| {phase:<40} | {dur:>9.2f} | {dur/1000:>8.2f} | {pct:>9.1f}% |') -report.append('') -report.append('## Top 30 Most Expensive Individual Instantiations') -report.append('') -report.append('| Rank | Template | Type | Time (ms) |') -report.append('|------|----------|------|-----------|') -for i, inst in enumerate(top_individual[:30], 1): - detail = inst['detail'][:70] + '...' if len(inst['detail']) > 70 else inst['detail'] - inst_type = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class' - report.append(f'| {i:>4} | {detail:<70} | {inst_type:<5} | {inst["dur"]:>9.2f} |') -report.append('') -report.append('## Template Families by Total Time (Top 50)') -report.append('') -report.append('| Rank | Template Family | Count | Total (ms) | Avg (ms) | % of Total |') -report.append('|------|-----------------|-------|------------|----------|------------|') -for i, (name, stats) in enumerate(sorted_templates[:50], 1): - count = stats['count'] - total = stats['total_dur'] - avg = total / count if count > 0 else 0 - pct = 100 * total / total_template_time if total_template_time > 0 else 0 - display_name = name[:40] + '...' 
if len(name) > 40 else name - report.append(f'| {i:>4} | {display_name:<43} | {count:>5} | {total:>10.2f} | {avg:>8.2f} | {pct:>9.1f}% |') -report.append('') -report.append('## Template Families by Instantiation Count (Top 50)') -report.append('') -sorted_by_count = sorted(template_stats.items(), key=lambda x: x[1]['count'], reverse=True) -report.append('| Rank | Template Family | Count | Total (ms) | Avg (ms) |') -report.append('|------|-----------------|-------|------------|----------|') -for i, (name, stats) in enumerate(sorted_by_count[:50], 1): - count = stats['count'] - total = stats['total_dur'] - avg = total / count if count > 0 else 0 - display_name = name[:40] + '...' if len(name) > 40 else name - report.append(f'| {i:>4} | {display_name:<43} | {count:>5} | {total:>10.2f} | {avg:>8.2f} |') -report.append('') -report.append('## Key Insights') -report.append('') -report.append('### 1. Template Instantiation Impact') -report.append(f'- Template instantiation accounts for {100*total_template_time/total_trace_time:.1f}% of total trace time') -if len(sorted_templates) >= 10: - top10_pct = 100*sum(s[1]["total_dur"] for s in sorted_templates[:10])/total_template_time - report.append(f'- Top 10 template families account for {top10_pct:.1f}% of instantiation time') -report.append('') -report.append('### 2. Most Expensive Templates') -if len(sorted_templates) > 0: - report.append(f'- **{sorted_templates[0][0]}**: {sorted_templates[0][1]["count"]:,} instantiations, {sorted_templates[0][1]["total_dur"]/1000:.2f}s total') -if len(sorted_templates) > 1: - avg = sorted_templates[1][1]["total_dur"] / sorted_templates[1][1]["count"] - report.append(f'- **{sorted_templates[1][0]}**: {sorted_templates[1][1]["count"]:,} instantiations, {avg:.2f}ms average') -report.append('') -report.append('## Optimization Recommendations') -report.append('') -report.append('### Short Term') -report.append('1. 
**Focus on High-Impact Templates**: Address top 10 families first') -report.append('2. **Explicit Template Instantiation**: Pre-instantiate common configurations') -report.append('3. **Extern Templates**: Mark frequently-used templates as extern in headers') -report.append('') -report.append('### Medium Term') -report.append('1. **Precompiled Headers**: Include heavy templates in PCH') -report.append('2. **Template Specialization**: Replace general templates with specialized versions') -report.append('3. **Template Depth Reduction**: Simplify template hierarchies') -report.append('') -report.append('### Long Term') -report.append('1. **Architectural Review**: Evaluate necessity of deep template metaprogramming') -report.append('2. **C++20 Concepts**: Earlier constraint checking, fewer instantiations') -report.append('3. **Build Caching**: Distributed build cache for template instantiations') -report.append('') -report.append('## Detailed Statistics') -report.append('') -report.append(f'- **Total Unique Templates:** {len(sorted_templates)}') -report.append(f'- **Total Instantiations:** {total_inst:,}') -if total_inst > 0: - report.append(f'- **Average Instantiation Time:** {total_template_time/total_inst:.3f}ms') +# Prepare templates by time with calculated fields +templates_by_time = [] +for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['total_dur'], reverse=True): + templates_by_time.append((name, { + 'count': stats['count'], + 'total_dur': stats['total_dur'], + 'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0, + 'pct': 100 * stats['total_dur'] / total_template_time if total_template_time > 0 else 0 + })) + +# Prepare templates by count +templates_by_count = [] +for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['count'], reverse=True): + templates_by_count.append((name, { + 'count': stats['count'], + 'total_dur': stats['total_dur'], + 'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 
0 + })) + +# Prepare top individual instantiations with friendly type names +for inst in top_individual: + inst['inst_type'] = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class' + +# Calculate additional metrics +median_count = 0 if len(template_stats) > 0: median_count = sorted([s["count"] for s in template_stats.values()])[len(template_stats)//2] - report.append(f'- **Median Template Family Count:** {median_count}') -report.append('') -report.append('---') -report.append('') -report.append(f'*Report generated using Clang -ftime-trace with {granularity}µs granularity*') -report.append(f'*Analysis tool: ck-build-analysis*') + +top10_pct = 0 +if len(templates_by_time) >= 10: + top10_pct = 100 * sum(s[1]["total_dur"] for s in templates_by_time[:10]) / total_template_time + +print('Rendering report with Jinja2...') +# Set up Jinja2 environment with custom filters +env = Environment(loader=FileSystemLoader(template_dir)) + +def format_number(value): + """Format number with thousand separators""" + return f'{value:,}' + +def truncate(value, length): + """Truncate string to length with ellipsis""" + if len(value) > length: + return value[:length-3] + '...' 
+ return value + +def pad(value, length): + """Pad string to specified length""" + return f'{value:<{length}}' + +env.filters['format_number'] = format_number +env.filters['truncate'] = truncate +env.filters['pad'] = pad + +# Load and render template +template = env.get_template('build_analysis_report.md.jinja') +report_content = template.render( + timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + target=target, + granularity=granularity, + build_time=build_time, + trace_time_sec=f'{total_trace_time/1000:.1f}', + template_time_sec=f'{total_template_time/1000:.1f}', + template_pct=f'{100*total_template_time/total_trace_time:.1f}', + total_events=total_events, + total_instantiations=total_inst, + unique_families=len(template_stats), + total_trace_time=total_trace_time, + total_template_time=total_template_time, + phases=sorted_phases, + top_individual=top_individual, + templates_by_time=templates_by_time, + templates_by_count=templates_by_count, + median_count=median_count, + top10_pct=f'{top10_pct:.1f}' +) with open(output_file, 'w') as f: - f.write('\n'.join(report)) + f.write(report_content) print(f'Report generated: {output_file}') -print(f'Total lines: {len(report)}') +print(f'Report size: {len(report_content)} bytes') PYSCRIPT -# Copy analysis script to container and run it +# Copy analysis script and templates to container docker cp "${ANALYSIS_SCRIPT}" "${CONTAINER_NAME}:/tmp/analyze.py" +docker cp "${SCRIPT_DIR}/templates" "${CONTAINER_NAME}:/tmp/ck_build_analysis_templates" docker exec "${CONTAINER_NAME}" python3 /tmp/analyze.py \ "${TRACE_FILE}" \ "/workspace/${OUTPUT_FILE}" \ "${TARGET}" \ "${GRANULARITY}" \ - "${BUILD_TIME}" + "${BUILD_TIME}" \ + "/tmp/ck_build_analysis_templates" # Copy report back to host docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPUT_FILE}" @@ -360,6 +336,7 @@ docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPU # Cleanup rm -f "${ANALYSIS_SCRIPT}" docker exec 
"${CONTAINER_NAME}" rm -f /tmp/analyze.py +docker exec "${CONTAINER_NAME}" rm -rf /tmp/ck_build_analysis_templates echo "" echo "═══════════════════════════════════════════════════════════════" diff --git a/.claude/skills/templates/build_analysis_report.md.jinja b/.claude/skills/templates/build_analysis_report.md.jinja new file mode 100644 index 00000000000..b6c4b2bbf5b --- /dev/null +++ b/.claude/skills/templates/build_analysis_report.md.jinja @@ -0,0 +1,95 @@ +# Composable Kernel Build Time Analysis Report + +**Generated:** {{ timestamp }} +**Target:** {{ target }} +**Granularity:** {{ granularity }}µs + +## Executive Summary + +- **Wall Clock Time:** {{ build_time }} seconds +- **Trace Time:** {{ trace_time_sec }} seconds +- **Template Instantiation Time:** {{ template_time_sec }} seconds ({{ template_pct }}% of trace) +- **Total Events Captured:** {{ total_events|format_number }} +- **Total Template Instantiations:** {{ total_instantiations|format_number }} +- **Unique Template Families:** {{ unique_families }} + +## Compilation Phase Breakdown + +| Phase | Time (ms) | Time (s) | % of Total | +|-------|-----------|----------|------------| +{% for phase, dur in phases[:20] -%} +| {{ phase|pad(40) }} | {{ "%9.2f"|format(dur) }} | {{ "%8.2f"|format(dur/1000) }} | {{ "%9.1f"|format(100 * dur / total_trace_time) }}% | +{% endfor %} + +## Top 30 Most Expensive Individual Instantiations + +| Rank | Template | Type | Time (ms) | +|------|----------|------|-----------| +{% for inst in top_individual[:30] -%} +| {{ "%4d"|format(loop.index) }} | {{ inst.detail|truncate(70) }} | {{ inst.inst_type|pad(5) }} | {{ "%9.2f"|format(inst.dur) }} | +{% endfor %} + +## Template Families by Total Time (Top 50) + +| Rank | Template Family | Count | Total (ms) | Avg (ms) | % of Total | +|------|-----------------|-------|------------|----------|------------| +{% for name, stats in templates_by_time[:50] -%} +| {{ "%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ 
"%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur) }} | {{ "%8.2f"|format(stats.avg) }} | {{ "%9.1f"|format(stats.pct) }}% | +{% endfor %} + +## Template Families by Instantiation Count (Top 50) + +| Rank | Template Family | Count | Total (ms) | Avg (ms) | +|------|-----------------|-------|------------|----------| +{% for name, stats in templates_by_count[:50] -%} +| {{ "%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ "%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur) }} | {{ "%8.2f"|format(stats.avg) }} | +{% endfor %} + +## Key Insights + +### 1. Template Instantiation Impact +- Template instantiation accounts for {{ template_pct }}% of total trace time +{% if unique_families >= 10 -%} +- Top 10 template families account for {{ top10_pct }}% of instantiation time +{% endif %} + +### 2. Most Expensive Templates +{% if templates_by_time|length > 0 -%} +- **{{ templates_by_time[0][0] }}**: {{ templates_by_time[0][1].count|format_number }} instantiations, {{ "%.2f"|format(templates_by_time[0][1].total_dur/1000) }}s total +{% endif -%} +{% if templates_by_time|length > 1 -%} +- **{{ templates_by_time[1][0] }}**: {{ templates_by_time[1][1].count|format_number }} instantiations, {{ "%.2f"|format(templates_by_time[1][1].avg) }}ms average +{% endif %} + +## Optimization Recommendations + +### Short Term +1. **Focus on High-Impact Templates**: Address top 10 families first +2. **Explicit Template Instantiation**: Pre-instantiate common configurations +3. **Extern Templates**: Mark frequently-used templates as extern in headers + +### Medium Term +1. **Precompiled Headers**: Include heavy templates in PCH +2. **Template Specialization**: Replace general templates with specialized versions +3. **Template Depth Reduction**: Simplify template hierarchies + +### Long Term +1. **Architectural Review**: Evaluate necessity of deep template metaprogramming +2. 
**C++20 Concepts**: Earlier constraint checking, fewer instantiations +3. **Build Caching**: Distributed build cache for template instantiations + +## Detailed Statistics + +- **Total Unique Templates:** {{ unique_families }} +- **Total Instantiations:** {{ total_instantiations|format_number }} +{% if total_instantiations > 0 -%} +- **Average Instantiation Time:** {{ "%.3f"|format(total_template_time/total_instantiations) }}ms +{% endif -%} +{% if unique_families > 0 -%} +- **Median Template Family Count:** {{ median_count }} +{% endif %} + +--- + +*Report generated using Clang -ftime-trace with {{ granularity }}µs granularity* +*Analysis tool: ck-build-analysis* From 7e091c06c5fce0e63c7bfe1fd779daedd7f903d7 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 22:50:17 -0600 Subject: [PATCH 06/17] Extract Python script and make PEP 723 compliant - Extract analysis script from bash heredoc into standalone Python file - Add PEP 723 inline script metadata for dependency management - Make script compatible with pipx and uv for automatic dependency installation - Improve code organization with proper functions and docstrings - Update documentation with PEP 723 usage examples Changes: - New file: analyze_build_trace.py (PEP 723 compliant) - Modified: ck-build-analysis (now uses external Python script) - Modified: ck-build-analysis.md (added implementation details section) Benefits: - Script can be run standalone with pipx/uv - Better code organization and maintainability - Clear dependency declaration - Easier to test and develop independently Example standalone usage: pipx run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ Co-Authored-By: Claude --- .claude/skills/analyze_build_trace.py | 234 ++++++++++++++++++++++++++ .claude/skills/ck-build-analysis | 153 +---------------- .claude/skills/ck-build-analysis.md | 39 +++++ 3 files changed, 278 insertions(+), 148 deletions(-) 
create mode 100755 .claude/skills/analyze_build_trace.py diff --git a/.claude/skills/analyze_build_trace.py b/.claude/skills/analyze_build_trace.py new file mode 100755 index 00000000000..f0f7d7fad3c --- /dev/null +++ b/.claude/skills/analyze_build_trace.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.8" +# dependencies = [ +# "jinja2>=3.0.0", +# ] +# /// +""" +Build Time Analysis Tool for Composable Kernel + +Analyzes Clang -ftime-trace output to identify template instantiation +bottlenecks and generate comprehensive build time reports. +""" + +import json +import re +import sys +from collections import defaultdict +from datetime import datetime +from pathlib import Path + +try: + from jinja2 import Environment, FileSystemLoader +except ImportError: + print("Error: jinja2 is required but not installed.", file=sys.stderr) + print("Install with: apt-get install python3-jinja2", file=sys.stderr) + print("Or with pip: pip install jinja2", file=sys.stderr) + sys.exit(1) + + +def parse_arguments(): + """Parse command-line arguments.""" + if len(sys.argv) < 7: + print("Usage: analyze_build_trace.py ") + sys.exit(1) + + return { + 'trace_file': sys.argv[1], + 'output_file': sys.argv[2], + 'target': sys.argv[3], + 'granularity': sys.argv[4], + 'build_time': sys.argv[5], + 'template_dir': sys.argv[6], + } + + +def load_trace_data(trace_file): + """Load and parse the trace JSON file.""" + print(f'Loading trace file: {trace_file}') + with open(trace_file, 'r') as f: + return json.load(f) + + +def process_events(data): + """Process trace events and extract template instantiation statistics.""" + print('Processing events...') + + template_stats = defaultdict(lambda: {'count': 0, 'total_dur': 0.0}) + phase_stats = defaultdict(float) + top_individual = [] + + for event in data.get('traceEvents', []): + name = event.get('name', '') + dur = event.get('dur', 0) / 1000.0 # Convert to milliseconds + + if name and dur > 0: + phase_stats[name] += dur 
def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0 when ``whole`` is not positive."""
    return 100 * part / whole if whole > 0 else 0


def prepare_template_data(template_stats, phase_stats, top_individual):
    """Prepare and calculate derived statistics for template rendering.

    Args:
        template_stats: Mapping of template family name to a dict holding
            ``count`` (number of instantiations) and ``total_dur``
            (accumulated duration).
        phase_stats: Mapping of trace event name to accumulated duration.
        top_individual: List of dicts with ``detail``, ``dur`` and ``type``
            keys, one per instantiation event. NOTE: the list is sorted in
            place (longest first) and every entry gains an ``inst_type`` key.

    Returns:
        A dict with the keys ``sorted_phases``, ``top_individual``,
        ``templates_by_time``, ``templates_by_count``,
        ``total_template_time``, ``total_trace_time``, ``total_inst``,
        ``median_count``, ``top10_pct`` and ``unique_families``.
    """
    print('Sorting data...')

    # Sort data
    sorted_phases = sorted(phase_stats.items(), key=lambda item: item[1], reverse=True)
    top_individual.sort(key=lambda inst: inst['dur'], reverse=True)

    # Calculate totals
    total_template_time = sum(s['total_dur'] for s in template_stats.values())
    total_trace_time = sum(phase_stats.values())
    total_inst = sum(s['count'] for s in template_stats.values())

    # Prepare templates by time with calculated fields
    templates_by_time = []
    for name, stats in sorted(template_stats.items(), key=lambda item: item[1]['total_dur'], reverse=True):
        templates_by_time.append((name, {
            'count': stats['count'],
            'total_dur': stats['total_dur'],
            'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0,
            'pct': _percentage(stats['total_dur'], total_template_time),
        }))

    # Prepare templates by count
    templates_by_count = []
    for name, stats in sorted(template_stats.items(), key=lambda item: item[1]['count'], reverse=True):
        templates_by_count.append((name, {
            'count': stats['count'],
            'total_dur': stats['total_dur'],
            'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0,
        }))

    # Add friendly type names to individual instantiations
    for inst in top_individual:
        inst['inst_type'] = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class'

    # Median of the per-family instantiation counts (upper median for an
    # even number of families, which keeps the value an integer).
    counts = sorted(s['count'] for s in template_stats.values())
    median_count = counts[len(counts) // 2] if counts else 0

    # Share of template time spent in the (up to) ten most expensive
    # families. With fewer than ten families this covers all of them, so the
    # figure is 100% rather than the 0% the report used to show; the helper
    # also guards against a zero total.
    top10_time = sum(row['total_dur'] for _, row in templates_by_time[:10])
    top10_pct = _percentage(top10_time, total_template_time)

    return {
        'sorted_phases': sorted_phases,
        'top_individual': top_individual,
        'templates_by_time': templates_by_time,
        'templates_by_count': templates_by_count,
        'total_template_time': total_template_time,
        'total_trace_time': total_trace_time,
        'total_inst': total_inst,
        'median_count': median_count,
        'top10_pct': top10_pct,
        'unique_families': len(template_stats),
    }


def setup_jinja_environment(template_dir):
    """Set up Jinja2 environment with custom filters.

    Args:
        template_dir: Directory that contains the report template(s).

    Returns:
        A Jinja2 ``Environment`` with the ``format_number``, ``truncate`` and
        ``pad`` filters registered. Registering ``truncate`` replaces the
        filter Jinja2 ships under the same name.
    """
    env = Environment(loader=FileSystemLoader(template_dir))

    def format_number(value):
        """Format number with thousand separators."""
        return f'{value:,}'

    def truncate(value, length):
        """Truncate string to length with ellipsis."""
        if len(value) <= length:
            return value
        if length < 3:
            # No room for an ellipsis; a negative slice index here would
            # return a string longer than requested.
            return value[:max(length, 0)]
        return value[:length - 3] + '...'

    def pad(value, length):
        """Pad string to specified length."""
        return f'{value:<{length}}'

    env.filters['format_number'] = format_number
    env.filters['truncate'] = truncate
    env.filters['pad'] = pad

    return env


def generate_report(env, data, args, total_events):
    """Generate the final report using Jinja2 template.

    Args:
        env: Environment providing ``get_template``.
        data: Dict returned by ``prepare_template_data``.
        args: Dict with at least ``target``, ``granularity`` and
            ``build_time`` entries.
        total_events: Number of events in the trace.

    Returns:
        The rendered report.
    """
    print('Rendering report with Jinja2...')

    template = env.get_template('build_analysis_report.md.jinja')

    # An empty trace (no timed events) has a total trace time of zero; report
    # 0.0% instead of failing with ZeroDivisionError.
    template_pct = _percentage(data['total_template_time'], data['total_trace_time'])

    report_content = template.render(
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        target=args['target'],
        granularity=args['granularity'],
        build_time=args['build_time'],
        trace_time_sec=f'{data["total_trace_time"] / 1000:.1f}',
        template_time_sec=f'{data["total_template_time"] / 1000:.1f}',
        template_pct=f'{template_pct:.1f}',
        total_events=total_events,
        total_instantiations=data['total_inst'],
        unique_families=data['unique_families'],
        total_trace_time=data['total_trace_time'],
        total_template_time=data['total_template_time'],
        phases=data['sorted_phases'],
        top_individual=data['top_individual'],
        templates_by_time=data['templates_by_time'],
        templates_by_count=data['templates_by_count'],
        median_count=data['median_count'],
        top10_pct=f'{data["top10_pct"]:.1f}',
    )

    return report_content


def main():
    """Main entry point for the analysis tool."""
    args = parse_arguments()

    # Load trace data
    trace_data = load_trace_data(args['trace_file'])
    total_events = len(trace_data.get('traceEvents', []))

    # Process events
    template_stats, phase_stats, top_individual = process_events(trace_data)

    # Prepare template data
    data = prepare_template_data(template_stats, phase_stats, top_individual)

    # Setup Jinja2 environment
    env = setup_jinja_environment(args['template_dir'])

    # Generate report
    report_content = generate_report(env, data, args, total_events)

    # Write output. The encoding is explicit so the result does not depend on
    # the locale configured inside the container.
    encoded_report = report_content.encode('utf-8')
    with open(args['output_file'], 'w', encoding='utf-8') as f:
        f.write(report_content)

    print(f'Report generated: {args["output_file"]}')
    # Count encoded bytes, not characters, so the label is accurate.
    print(f'Report size: {len(encoded_report)} bytes')


if __name__ == '__main__':
    main()
template_stats[template_name]['count'] += 1 - template_stats[template_name]['total_dur'] += dur - -print('Sorting data...') -sorted_phases = sorted(phase_stats.items(), key=lambda x: x[1], reverse=True) -top_individual.sort(key=lambda x: x['dur'], reverse=True) - -total_template_time = sum(s['total_dur'] for s in template_stats.values()) -total_trace_time = sum(phase_stats.values()) -total_events = len(data.get('traceEvents', [])) -total_inst = sum(s['count'] for s in template_stats.values()) - -# Prepare templates by time with calculated fields -templates_by_time = [] -for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['total_dur'], reverse=True): - templates_by_time.append((name, { - 'count': stats['count'], - 'total_dur': stats['total_dur'], - 'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0, - 'pct': 100 * stats['total_dur'] / total_template_time if total_template_time > 0 else 0 - })) - -# Prepare templates by count -templates_by_count = [] -for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['count'], reverse=True): - templates_by_count.append((name, { - 'count': stats['count'], - 'total_dur': stats['total_dur'], - 'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0 - })) - -# Prepare top individual instantiations with friendly type names -for inst in top_individual: - inst['inst_type'] = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class' - -# Calculate additional metrics -median_count = 0 -if len(template_stats) > 0: - median_count = sorted([s["count"] for s in template_stats.values()])[len(template_stats)//2] - -top10_pct = 0 -if len(templates_by_time) >= 10: - top10_pct = 100 * sum(s[1]["total_dur"] for s in templates_by_time[:10]) / total_template_time - -print('Rendering report with Jinja2...') -# Set up Jinja2 environment with custom filters -env = Environment(loader=FileSystemLoader(template_dir)) - -def format_number(value): - """Format number with thousand 
separators""" - return f'{value:,}' - -def truncate(value, length): - """Truncate string to length with ellipsis""" - if len(value) > length: - return value[:length-3] + '...' - return value - -def pad(value, length): - """Pad string to specified length""" - return f'{value:<{length}}' - -env.filters['format_number'] = format_number -env.filters['truncate'] = truncate -env.filters['pad'] = pad - -# Load and render template -template = env.get_template('build_analysis_report.md.jinja') -report_content = template.render( - timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - target=target, - granularity=granularity, - build_time=build_time, - trace_time_sec=f'{total_trace_time/1000:.1f}', - template_time_sec=f'{total_template_time/1000:.1f}', - template_pct=f'{100*total_template_time/total_trace_time:.1f}', - total_events=total_events, - total_instantiations=total_inst, - unique_families=len(template_stats), - total_trace_time=total_trace_time, - total_template_time=total_template_time, - phases=sorted_phases, - top_individual=top_individual, - templates_by_time=templates_by_time, - templates_by_count=templates_by_count, - median_count=median_count, - top10_pct=f'{top10_pct:.1f}' -) - -with open(output_file, 'w') as f: - f.write(report_content) - -print(f'Report generated: {output_file}') -print(f'Report size: {len(report_content)} bytes') -PYSCRIPT - # Copy analysis script and templates to container -docker cp "${ANALYSIS_SCRIPT}" "${CONTAINER_NAME}:/tmp/analyze.py" +docker cp "${SCRIPT_DIR}/analyze_build_trace.py" "${CONTAINER_NAME}:/tmp/analyze_build_trace.py" docker cp "${SCRIPT_DIR}/templates" "${CONTAINER_NAME}:/tmp/ck_build_analysis_templates" -docker exec "${CONTAINER_NAME}" python3 /tmp/analyze.py \ +# Run analysis +docker exec "${CONTAINER_NAME}" python3 /tmp/analyze_build_trace.py \ "${TRACE_FILE}" \ "/workspace/${OUTPUT_FILE}" \ "${TARGET}" \ @@ -334,8 +192,7 @@ docker exec "${CONTAINER_NAME}" python3 /tmp/analyze.py \ docker cp 
"${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPUT_FILE}" # Cleanup -rm -f "${ANALYSIS_SCRIPT}" -docker exec "${CONTAINER_NAME}" rm -f /tmp/analyze.py +docker exec "${CONTAINER_NAME}" rm -f /tmp/analyze_build_trace.py docker exec "${CONTAINER_NAME}" rm -rf /tmp/ck_build_analysis_templates echo "" diff --git a/.claude/skills/ck-build-analysis.md b/.claude/skills/ck-build-analysis.md index 131fbda999f..b6375a6ba55 100644 --- a/.claude/skills/ck-build-analysis.md +++ b/.claude/skills/ck-build-analysis.md @@ -110,3 +110,42 @@ The report includes: - Default 500µs captures major bottlenecks (filters out 86% of instantiations) - 1µs granularity reveals all 36,000+ instantiations but takes longer to analyze - 100µs is a good middle ground for most use cases + +## Implementation Details + +### PEP 723 Compliance + +The analysis script (`analyze_build_trace.py`) is PEP 723 compliant with inline dependency metadata: + +```python +# /// script +# requires-python = ">=3.8" +# dependencies = [ +# "jinja2>=3.0.0", +# ] +# /// +``` + +This allows tools like `pipx run` or `uv run` to automatically manage dependencies: + +```bash +# Run standalone with pipx (auto-installs dependencies) +pipx run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ + +# Or with uv +uv run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ +``` + +### Components + +- **ck-build-analysis** - Main bash script that orchestrates Docker, CMake, and analysis +- **analyze_build_trace.py** - PEP 723 compliant Python script for trace analysis +- **templates/build_analysis_report.md.jinja** - Jinja2 template for report generation + +### Requirements + +In Docker container: +- `python3-jinja2` (installed via `apt-get install python3-jinja2`) + +For standalone use: +- Python 3.8+ with `jinja2>=3.0.0` (auto-managed if using `pipx` or `uv`) From caf3f74e1250cd037cecda42cff9619a73c3c570 Mon Sep 17 00:00:00 2001 From: Max Podkorytov 
<4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 22:56:29 -0600 Subject: [PATCH 07/17] Use uv run as default execution path for automatic dependency management - Automatically detect and use uv if available in container - Fall back to python3 if uv not found (backward compatible) - Leverage PEP 723 metadata for zero-config dependency installation - Update documentation with uv installation instructions Benefits: - Zero manual dependency installation with uv - Isolated dependency environment (no system pollution) - Fast dependency caching for subsequent runs - Automatic dependency resolution from PEP 723 metadata Tested with: - uv 0.9.25: Auto-installs jinja2 from PEP 723 metadata - python3: Falls back when uv unavailable (requires python3-jinja2) Installation: docker exec bash -c "curl -LsSf https://astral.sh/uv/install.sh | sh" Co-Authored-By: Claude --- .claude/skills/ck-build-analysis | 30 ++++++++++++++------ .claude/skills/ck-build-analysis.md | 43 +++++++++++++++++++++-------- 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis index db06cd4fc97..ef3df6f53b8 100755 --- a/.claude/skills/ck-build-analysis +++ b/.claude/skills/ck-build-analysis @@ -179,14 +179,28 @@ echo "Generating analysis report..." 
docker cp "${SCRIPT_DIR}/analyze_build_trace.py" "${CONTAINER_NAME}:/tmp/analyze_build_trace.py" docker cp "${SCRIPT_DIR}/templates" "${CONTAINER_NAME}:/tmp/ck_build_analysis_templates" -# Run analysis -docker exec "${CONTAINER_NAME}" python3 /tmp/analyze_build_trace.py \ - "${TRACE_FILE}" \ - "/workspace/${OUTPUT_FILE}" \ - "${TARGET}" \ - "${GRANULARITY}" \ - "${BUILD_TIME}" \ - "/tmp/ck_build_analysis_templates" +# Check if uv is available and use it for PEP 723 dependency management +# Check both PATH and common install locations +if docker exec "${CONTAINER_NAME}" bash -c "command -v uv >/dev/null 2>&1 || test -x \$HOME/.local/bin/uv"; then + echo "Using uv run for automatic dependency management..." + # Ensure uv is in PATH (handles ~/.local/bin installation) + docker exec "${CONTAINER_NAME}" bash -c "export PATH=\"\$HOME/.local/bin:\$PATH\" && uv run --no-project /tmp/analyze_build_trace.py \ + ${TRACE_FILE} \ + /workspace/${OUTPUT_FILE} \ + ${TARGET} \ + ${GRANULARITY} \ + ${BUILD_TIME} \ + /tmp/ck_build_analysis_templates" +else + echo "uv not found, using python3 (requires python3-jinja2 pre-installed)..." 
+ docker exec "${CONTAINER_NAME}" python3 /tmp/analyze_build_trace.py \ + "${TRACE_FILE}" \ + "/workspace/${OUTPUT_FILE}" \ + "${TARGET}" \ + "${GRANULARITY}" \ + "${BUILD_TIME}" \ + "/tmp/ck_build_analysis_templates" +fi # Copy report back to host docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPUT_FILE}" diff --git a/.claude/skills/ck-build-analysis.md b/.claude/skills/ck-build-analysis.md index b6375a6ba55..2d80146998d 100644 --- a/.claude/skills/ck-build-analysis.md +++ b/.claude/skills/ck-build-analysis.md @@ -113,7 +113,7 @@ The report includes: ## Implementation Details -### PEP 723 Compliance +### PEP 723 Compliance with Automatic Dependency Management The analysis script (`analyze_build_trace.py`) is PEP 723 compliant with inline dependency metadata: @@ -126,26 +126,47 @@ The analysis script (`analyze_build_trace.py`) is PEP 723 compliant with inline # /// ``` -This allows tools like `pipx run` or `uv run` to automatically manage dependencies: +**The skill automatically uses `uv run` if available**, which provides: +- ✅ Zero-configuration dependency management +- ✅ Automatic installation of jinja2 from PEP 723 metadata +- ✅ Isolated dependency environment (no system pollution) +- ✅ Fast caching for subsequent runs +### Installation Options + +**Option 1: Install uv (Recommended)** ```bash -# Run standalone with pipx (auto-installs dependencies) -pipx run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ +# Install uv in the Docker container (one-time setup) +docker exec ck_ bash -c "curl -LsSf https://astral.sh/uv/install.sh | sh" +``` -# Or with uv -uv run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ +After installing `uv`, the skill will automatically use it for dependency management. 
+ +**Option 2: Use system python3 + jinja2** +```bash +# If uv is not available, install jinja2 manually +docker exec ck_ apt-get install -y python3-jinja2 ``` +The skill automatically detects which method is available and uses the appropriate one. + ### Components - **ck-build-analysis** - Main bash script that orchestrates Docker, CMake, and analysis - **analyze_build_trace.py** - PEP 723 compliant Python script for trace analysis - **templates/build_analysis_report.md.jinja** - Jinja2 template for report generation -### Requirements +### Standalone Usage -In Docker container: -- `python3-jinja2` (installed via `apt-get install python3-jinja2`) +The Python script can also be run independently: -For standalone use: -- Python 3.8+ with `jinja2>=3.0.0` (auto-managed if using `pipx` or `uv`) +```bash +# With uv (recommended - auto-installs dependencies) +uv run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ + +# With pipx (alternative - auto-installs dependencies) +pipx run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ + +# With python3 (requires jinja2 pre-installed) +python3 .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ +``` From 13655f2757d83b2097fef34ed042c4b727f34034 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:16:42 -0600 Subject: [PATCH 08/17] Extract common utilities and improve default granularity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract shared configuration logic to .claude/skills/common.sh - Container naming and detection functions - Git branch sanitization - Docker image configuration - GPU target detection - Reduces ~50 lines of duplicate code between skills - Refactor ck-docker to use common.sh utilities - Replace manual docker ps checks with helper functions - Use shared container_exists() and 
container_is_running() - Use shared detect_gpu_target() and get_docker_image() - Refactor ck-build-analysis to use common.sh utilities - Use shared get_project_root() and get_container_name() - Use shared ensure_container_running() - Use shared detect_gpu_target() - Change default granularity from 500µs to 100µs - Provides better balance between detail and performance - Captures ~15k instantiations vs ~5k at 500µs - Still manageable 15-20 MB trace files - Update all documentation and help text Co-Authored-By: Claude --- .claude/skills/ck-build-analysis | 35 ++++------- .claude/skills/ck-build-analysis.md | 26 ++++---- .claude/skills/ck-docker | 70 +++++++-------------- .claude/skills/common.sh | 94 +++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 85 deletions(-) create mode 100644 .claude/skills/common.sh diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis index ef3df6f53b8..82c4f40f12c 100755 --- a/.claude/skills/ck-build-analysis +++ b/.claude/skills/ck-build-analysis @@ -4,26 +4,16 @@ set -e set -o pipefail -# Find project root +# Find script directory and load common utilities SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +source "${SCRIPT_DIR}/common.sh" -# Detect git branch and sanitize for docker naming -GIT_BRANCH=$(cd "${PROJECT_ROOT}" && git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '_' | tr -cd 'a-zA-Z0-9_-' || echo "") -GIT_BRANCH=${GIT_BRANCH:-unknown} -if [ "${GIT_BRANCH}" = "HEAD" ]; then - GIT_BRANCH="detached" -fi - -# Ensure USER is set -USER_NAME=${USER:-$(whoami 2>/dev/null || echo "user")} - -# Default container name -DEFAULT_NAME="ck_${USER_NAME}_${GIT_BRANCH}" -CONTAINER_NAME="${CK_CONTAINER_NAME:-${DEFAULT_NAME}}" +# Initialize configuration +PROJECT_ROOT=$(get_project_root "${SCRIPT_DIR}") +CONTAINER_NAME=$(get_container_name "${PROJECT_ROOT}") # Default settings -GRANULARITY="${CK_BUILD_ANALYSIS_GRANULARITY:-500}" +GRANULARITY="${CK_BUILD_ANALYSIS_GRANULARITY:-100}" OUTPUT_FILE="build_time_analysis_report.md" RECONFIGURE=true @@ -38,7 +28,7 @@ Arguments: target Build target to analyze (e.g., example_convnd_fwd_xdl_fp8) Options: - --granularity=N Time trace granularity in microseconds (default: 500) + --granularity=N Time trace granularity in microseconds (default: 100) --output=FILE Output report filename (default: build_time_analysis_report.md) --name=NAME Docker container name (default: ${CONTAINER_NAME}) --no-reconfigure Skip CMake reconfiguration if build exists @@ -50,8 +40,8 @@ Examples: ck-build-analysis test_amdgcn_mma --granularity=100 --output=mma_test_analysis.md Granularity Guide: - 500 (default) - Quick overview, filters 86% of events (~5k instantiations, 3-5 MB) - 100 - Balanced detail (~15k instantiations, 15-20 MB) + 100 (default) - Balanced detail (~15k instantiations, 15-20 MB) + 500 - Quick overview, filters 86% of events (~5k instantiations, 3-5 MB) 1 - Complete analysis (~36k instantiations, 80-100 MB) EOF } @@ -116,17 +106,14 @@ echo "════════════════════════ echo "" # Ensure container is running -if ! 
docker ps --filter "name=^${CONTAINER_NAME}$" --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then - echo "Container not running. Starting with ck-docker..." - "${SCRIPT_DIR}/ck-docker" start "${CONTAINER_NAME}" -fi +ensure_container_running "${CONTAINER_NAME}" "${SCRIPT_DIR}" # Configure CMake with -ftime-trace if needed if [ "$RECONFIGURE" = true ] || ! docker exec "${CONTAINER_NAME}" test -f /workspace/build/build.ninja 2>/dev/null; then echo "" echo "Configuring CMake with -ftime-trace (granularity=${GRANULARITY}µs)..." - GPU_TARGET=$(docker exec "${CONTAINER_NAME}" bash -c "rocminfo 2>/dev/null | grep -oP 'gfx[0-9a-z]+' | head -1 || echo 'gfx950'" | tr -d '\r\n') + GPU_TARGET=$(detect_gpu_target "${CONTAINER_NAME}") docker exec "${CONTAINER_NAME}" bash -c " cd /workspace || exit 1 diff --git a/.claude/skills/ck-build-analysis.md b/.claude/skills/ck-build-analysis.md index 2d80146998d..792c90a01a5 100644 --- a/.claude/skills/ck-build-analysis.md +++ b/.claude/skills/ck-build-analysis.md @@ -30,7 +30,7 @@ Just ask in natural language: ck-build-analysis [options] Options: - --granularity=N Time trace granularity in microseconds (default: 500) + --granularity=N Time trace granularity in microseconds (default: 100) --output=FILE Output report filename (default: build_time_analysis_report.md) --name=NAME Docker container name (default: from CK_CONTAINER_NAME or auto-generated) --no-reconfigure Skip CMake reconfiguration if build exists @@ -58,27 +58,27 @@ Options: ## Environment ```bash -export CK_CONTAINER_NAME=my_build # Override container name -export CK_BUILD_ANALYSIS_GRANULARITY=1 # Default granularity in µs +export CK_CONTAINER_NAME=my_build # Override container name +export CK_BUILD_ANALYSIS_GRANULARITY=100 # Default granularity in µs ``` ## Examples ```bash -# Basic analysis with default granularity (500µs) +# Basic analysis with default granularity (100µs) ck-build-analysis example_convnd_fwd_xdl_fp8 -# High-resolution analysis (1µs granularity, 22x 
larger trace) -ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=1 +# Quick overview (500µs granularity, filters minor events) +ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=500 -# Medium-resolution analysis (100µs granularity, good balance) -ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=100 +# High-resolution analysis (1µs granularity, complete picture) +ck-build-analysis example_convnd_fwd_xdl_fp8 --granularity=1 # Custom output filename ck-build-analysis example_convnd_fwd_xdl_fp8 --output=fp8_conv_analysis.md # Analyze test target -ck-build-analysis test_amdgcn_mma --granularity=1 +ck-build-analysis test_amdgcn_mma # Use existing build (skip reconfigure) ck-build-analysis example_convnd_fwd_xdl_fp8 --no-reconfigure @@ -99,17 +99,17 @@ The report includes: | Granularity | Events | Trace Size | Use Case | |-------------|--------|------------|----------| -| 500µs (default) | ~50k | 3-5 MB | Quick overview, major bottlenecks | -| 100µs | ~150k | 15-20 MB | Balanced detail and performance | +| 500µs | ~50k | 3-5 MB | Quick overview, major bottlenecks only | +| 100µs (default) | ~150k | 15-20 MB | Balanced detail and performance | | 50µs | ~200k | 30-40 MB | Detailed analysis | | 1µs (high-res) | ~300k | 80-100 MB | Complete picture, all instantiations | ## Notes - Lower granularity = more events = larger files = longer analysis -- Default 500µs captures major bottlenecks (filters out 86% of instantiations) +- Default 100µs provides balanced detail for most use cases +- 500µs captures only major bottlenecks (filters out 86% of instantiations) - 1µs granularity reveals all 36,000+ instantiations but takes longer to analyze -- 100µs is a good middle ground for most use cases ## Implementation Details diff --git a/.claude/skills/ck-docker b/.claude/skills/ck-docker index 1217f6ae1bb..b7bafd96c2b 100755 --- a/.claude/skills/ck-docker +++ b/.claude/skills/ck-docker @@ -4,25 +4,13 @@ set -e set -o pipefail -# Find project root (where .git 
directory is) +# Find script directory and load common utilities SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" +source "${SCRIPT_DIR}/common.sh" -# Detect git branch and sanitize for docker naming (replace / and special chars with _) -GIT_BRANCH=$(cd "${PROJECT_ROOT}" && git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '_' | tr -cd 'a-zA-Z0-9_-' || echo "") -# Handle edge cases: detached HEAD, empty branch name -GIT_BRANCH=${GIT_BRANCH:-unknown} -# If branch is just "HEAD" (detached state), make it more descriptive -if [ "${GIT_BRANCH}" = "HEAD" ]; then - GIT_BRANCH="detached" -fi - -# Ensure USER is set -USER_NAME=${USER:-$(whoami 2>/dev/null || echo "user")} - -# Default container name: ck__ -DEFAULT_NAME="ck_${USER_NAME}_${GIT_BRANCH}" -CONTAINER_NAME="${CK_CONTAINER_NAME:-${DEFAULT_NAME}}" +# Initialize configuration +PROJECT_ROOT=$(get_project_root "${SCRIPT_DIR}") +CONTAINER_NAME=$(get_container_name "${PROJECT_ROOT}") # Help message show_help() { @@ -53,28 +41,14 @@ Environment: EOF } -# Detect GPU target -detect_gpu() { - local container=$1 - # Allow override via GPU_TARGET environment variable - if [ -n "${GPU_TARGET:-}" ]; then - echo "${GPU_TARGET}" - return 0 - fi - docker exec "${container}" bash -c " - rocminfo 2>/dev/null | grep -oP 'gfx[0-9a-z]+' | head -1 || echo 'gfx950' - " | tr -d '\r\n' -} - # Start container cmd_start() { local name="${1:-${CONTAINER_NAME}}" - local docker_image="${CK_DOCKER_IMAGE:-rocm/composable_kernel:ck_ub24.04_rocm7.0.1}" + local docker_image=$(get_docker_image) - # Check if container exists (exact match to avoid substring collisions) - if docker ps -a --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then - # Check if container is running - if docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + # Check if container exists and is running + if container_exists "${name}"; then + if 
container_is_running "${name}"; then echo "Container '${name}' is already running" return 0 else @@ -123,8 +97,8 @@ cmd_build() { esac done - # Check if container is running (exact match) - if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + # Check if container is running + if ! container_is_running "${name}"; then echo "Container '${name}' not running. Starting..." cmd_start "${name}" fi @@ -132,7 +106,7 @@ cmd_build() { # Reconfigure CMake if requested or if build.ninja doesn't exist if [ "$reconfigure" = true ] || ! docker exec "${name}" test -f /workspace/build/build.ninja 2>/dev/null; then echo "Detecting GPU target..." - local gpu_target=$(detect_gpu "${name}") + local gpu_target=$(detect_gpu_target "${name}") if [ "$reconfigure" = true ]; then echo "Reconfiguring CMake from scratch for GPU target: ${gpu_target}" @@ -200,8 +174,8 @@ cmd_test() { return 1 fi - # Check if container is running (exact match) - if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + # Check if container is running + if ! container_is_running "${name}"; then echo "Error: Container '${name}' not running" echo "Start it with: ck-docker start --name ${name}" return 1 @@ -226,8 +200,8 @@ cmd_test() { cmd_shell() { local name="${1:-${CONTAINER_NAME}}" - # Check if container is running (exact match) - if ! docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + # Check if container is running + if ! container_is_running "${name}"; then echo "Container '${name}' not running. Starting..." 
cmd_start "${name}" fi @@ -239,7 +213,7 @@ cmd_shell() { # Status cmd_status() { local name="${1:-}" - local docker_image="${CK_DOCKER_IMAGE:-rocm/composable_kernel:ck_ub24.04_rocm7.0.1}" + local docker_image=$(get_docker_image) if [ -z "$name" ]; then echo "Composable Kernel Docker Containers:" @@ -247,14 +221,14 @@ cmd_status() { docker ps -a --filter "ancestor=${docker_image}" \ --format "table {{.Names}}\t{{.Status}}\t{{.CreatedAt}}" || echo "No containers found" else - # Check if container is running (exact match) - if docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + # Check container status + if container_is_running "${name}"; then echo "Container '${name}' is RUNNING" docker ps --filter "name=^${name}$" --format "table {{.Names}}\t{{.Status}}\t{{.Image}}" echo "" echo "GPU Information:" docker exec "${name}" bash -c "rocm-smi --showproductname 2>/dev/null | head -10 || echo 'No GPU detected'" - elif docker ps -a --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + elif container_exists "${name}"; then echo "Container '${name}' exists but is STOPPED" echo "Start with: ck-docker start ${name}" else @@ -268,8 +242,8 @@ cmd_status() { cmd_stop() { local name="${1:-${CONTAINER_NAME}}" - # Check if container exists (exact match) - if docker ps -a --filter "name=^${name}$" --format '{{.Names}}' | grep -q "^${name}$"; then + # Check if container exists + if container_exists "${name}"; then echo "Stopping and removing container '${name}'..." 
#!/bin/bash
# Common utilities for CK Docker skills
# Shared configuration and helper functions
#
# This file is meant to be sourced (by ck-docker and ck-build-analysis), not
# executed. Every helper prints its result on stdout.

# Find project root: two directory levels above the given script directory.
# Runs in a subshell so the caller's working directory is never changed,
# even when the function is called outside of $(...).
get_project_root() {
    local script_dir="$1"
    ( cd "${script_dir}/../.." && pwd )
}

# Detect git branch and sanitize for Docker naming:
# '/' becomes '_', every character outside [a-zA-Z0-9_-] is dropped.
# Prints "unknown" when no branch can be determined and "detached" for a
# detached HEAD.
get_sanitized_branch() {
    local project_root="$1"
    local branch

    # '|| true' keeps callers running under 'set -e'/'pipefail' alive when git
    # fails; the empty result is replaced by "unknown" below.
    branch=$(cd "${project_root}" && git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '_' | tr -cd 'a-zA-Z0-9_-' || true)
    branch=${branch:-unknown}

    # Handle detached HEAD state
    if [ "${branch}" = "HEAD" ]; then
        branch="detached"
    fi

    echo "${branch}"
}

# Get username with fallback: $USER, then whoami, then the literal "user".
get_username() {
    echo "${USER:-$(whoami 2>/dev/null || echo "user")}"
}

# Generate default container name: ck_<user>_<branch>
get_default_container_name() {
    local project_root="$1"
    local user_name
    local git_branch

    user_name=$(get_username)
    git_branch=$(get_sanitized_branch "${project_root}")

    echo "ck_${user_name}_${git_branch}"
}

# Get container name (respects CK_CONTAINER_NAME env var)
get_container_name() {
    local project_root="$1"
    local default_name

    default_name=$(get_default_container_name "${project_root}")
    echo "${CK_CONTAINER_NAME:-${default_name}}"
}

# Get Docker image (respects CK_DOCKER_IMAGE env var)
get_docker_image() {
    echo "${CK_DOCKER_IMAGE:-rocm/composable_kernel:ck_ub24.04_rocm7.0.1}"
}

# Check if container exists (exact match).
# The docker filter narrows the candidates; 'grep -Fx' then compares the whole
# line literally, so characters such as '.' in a name are not treated as
# regular-expression wildcards.
container_exists() {
    local name="$1"
    docker ps -a --filter "name=^${name}$" --format '{{.Names}}' | grep -Fxq -- "${name}"
}

# Check if container is running (exact match, see container_exists)
container_is_running() {
    local name="$1"
    docker ps --filter "name=^${name}$" --format '{{.Names}}' | grep -Fxq -- "${name}"
}

# Detect GPU target in container.
# Prints $GPU_TARGET when set, otherwise the first gfx* name reported by
# rocminfo inside the container, otherwise the default 'gfx950'.
detect_gpu_target() {
    local container="$1"
    local detected

    # Allow override via GPU_TARGET environment variable
    if [ -n "${GPU_TARGET:-}" ]; then
        echo "${GPU_TARGET}"
        return 0
    fi

    # The fallback is applied here rather than with '|| echo' inside the
    # container: the status of 'a | b | head -1' is that of 'head', which
    # succeeds even when rocminfo is missing, so an in-pipeline fallback
    # never ran and the result was an empty string.
    detected=$(docker exec "${container}" bash -c \
        "rocminfo 2>/dev/null | grep -oP 'gfx[0-9a-z]+' | head -1" | tr -d '\r\n') || return
    echo "${detected:-gfx950}"
}

# Ensure container is running, start if needed (via the ck-docker script
# located in the given script directory)
ensure_container_running() {
    local container="$1"
    local script_dir="$2"

    if ! container_is_running "${container}"; then
        echo "Container '${container}' not running. Starting with ck-docker..."
        "${script_dir}/ck-docker" start "${container}"
    fi
}
docker cp "${SCRIPT_DIR}/analyze_build_trace.py" "${CONTAINER_NAME}:/tmp/analyze_build_trace.py" docker cp "${SCRIPT_DIR}/templates" "${CONTAINER_NAME}:/tmp/ck_build_analysis_templates" -# Check if uv is available and use it for PEP 723 dependency management -# Check both PATH and common install locations -if docker exec "${CONTAINER_NAME}" bash -c "command -v uv >/dev/null 2>&1 || test -x \$HOME/.local/bin/uv"; then - echo "Using uv run for automatic dependency management..." - # Ensure uv is in PATH (handles ~/.local/bin installation) - docker exec "${CONTAINER_NAME}" bash -c "export PATH=\"\$HOME/.local/bin:\$PATH\" && uv run --no-project /tmp/analyze_build_trace.py \ - ${TRACE_FILE} \ - /workspace/${OUTPUT_FILE} \ - ${TARGET} \ - ${GRANULARITY} \ - ${BUILD_TIME} \ - /tmp/ck_build_analysis_templates" -else - echo "uv not found, using python3 (requires python3-jinja2 pre-installed)..." - docker exec "${CONTAINER_NAME}" python3 /tmp/analyze_build_trace.py \ - "${TRACE_FILE}" \ - "/workspace/${OUTPUT_FILE}" \ - "${TARGET}" \ - "${GRANULARITY}" \ - "${BUILD_TIME}" \ - "/tmp/ck_build_analysis_templates" +# Check if uv is available, install if needed, and use for PEP 723 dependency management +if ! docker exec "${CONTAINER_NAME}" bash -c "command -v uv >/dev/null 2>&1 || test -x \$HOME/.local/bin/uv"; then + echo "uv not found, installing..." + docker exec "${CONTAINER_NAME}" bash -c "curl -LsSf https://astral.sh/uv/install.sh | sh" >/dev/null 2>&1 + echo "uv installed successfully" fi +echo "Using uv run for automatic dependency management..." 
+# Ensure uv is in PATH (handles ~/.local/bin installation) +docker exec "${CONTAINER_NAME}" bash -c "export PATH=\"\$HOME/.local/bin:\$PATH\" && uv run --no-project /tmp/analyze_build_trace.py \ + ${TRACE_FILE} \ + /workspace/${OUTPUT_FILE} \ + ${TARGET} \ + ${GRANULARITY} \ + ${BUILD_TIME} \ + /tmp/ck_build_analysis_templates" + # Copy report back to host docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPUT_FILE}" diff --git a/.claude/skills/ck-build-analysis.md b/.claude/skills/ck-build-analysis.md index 792c90a01a5..83ff89144d3 100644 --- a/.claude/skills/ck-build-analysis.md +++ b/.claude/skills/ck-build-analysis.md @@ -126,29 +126,18 @@ The analysis script (`analyze_build_trace.py`) is PEP 723 compliant with inline # /// ``` -**The skill automatically uses `uv run` if available**, which provides: +**The skill automatically installs and uses `uv`**, which provides: - ✅ Zero-configuration dependency management - ✅ Automatic installation of jinja2 from PEP 723 metadata - ✅ Isolated dependency environment (no system pollution) - ✅ Fast caching for subsequent runs -### Installation Options +**No manual setup required!** The first time you run the skill, it will: +1. Detect if `uv` is installed in the container +2. If not, automatically install it (takes ~5 seconds) +3. Use `uv run` to execute the analysis with auto-managed dependencies -**Option 1: Install uv (Recommended)** -```bash -# Install uv in the Docker container (one-time setup) -docker exec ck_ bash -c "curl -LsSf https://astral.sh/uv/install.sh | sh" -``` - -After installing `uv`, the skill will automatically use it for dependency management. - -**Option 2: Use system python3 + jinja2** -```bash -# If uv is not available, install jinja2 manually -docker exec ck_ apt-get install -y python3-jinja2 -``` - -The skill automatically detects which method is available and uses the appropriate one. +On subsequent runs, `uv` will already be available and dependencies will be cached. 
### Components @@ -161,12 +150,9 @@ The skill automatically detects which method is available and uses the appropria The Python script can also be run independently: ```bash -# With uv (recommended - auto-installs dependencies) +# With uv (recommended - auto-installs dependencies from PEP 723 metadata) uv run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ -# With pipx (alternative - auto-installs dependencies) +# With pipx (alternative - also auto-installs dependencies) pipx run .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ - -# With python3 (requires jinja2 pre-installed) -python3 .claude/skills/analyze_build_trace.py trace.json report.md target 100 22 templates/ ``` From 28489b05ca646fed722b6de0bba012df48af6cba Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:23:10 -0600 Subject: [PATCH 10/17] Use pipx to install uv instead of piping curl to bash - Install uv via Ubuntu package manager (pipx) for security - Avoids piping curl to bash which is a security concern - More reliable and verifiable installation method - Auto-installs pipx via apt if not already present - Update documentation to reflect package-based installation Co-Authored-By: Claude --- .claude/skills/ck-build-analysis | 11 +++++++++-- .claude/skills/ck-build-analysis.md | 4 +++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis index a52dd1e3bda..9460f3efac9 100755 --- a/.claude/skills/ck-build-analysis +++ b/.claude/skills/ck-build-analysis @@ -168,8 +168,15 @@ docker cp "${SCRIPT_DIR}/templates" "${CONTAINER_NAME}:/tmp/ck_build_analysis_te # Check if uv is available, install if needed, and use for PEP 723 dependency management if ! docker exec "${CONTAINER_NAME}" bash -c "command -v uv >/dev/null 2>&1 || test -x \$HOME/.local/bin/uv"; then - echo "uv not found, installing..." 
- docker exec "${CONTAINER_NAME}" bash -c "curl -LsSf https://astral.sh/uv/install.sh | sh" >/dev/null 2>&1 + echo "uv not found, installing via pipx..." + docker exec "${CONTAINER_NAME}" bash -c " + # Install pipx if not available + if ! command -v pipx >/dev/null 2>&1; then + apt-get update -qq && apt-get install -y -qq pipx >/dev/null 2>&1 + fi + # Install uv via pipx + pipx install uv >/dev/null 2>&1 + " echo "uv installed successfully" fi diff --git a/.claude/skills/ck-build-analysis.md b/.claude/skills/ck-build-analysis.md index 83ff89144d3..15744c9fe8d 100644 --- a/.claude/skills/ck-build-analysis.md +++ b/.claude/skills/ck-build-analysis.md @@ -134,11 +134,13 @@ The analysis script (`analyze_build_trace.py`) is PEP 723 compliant with inline **No manual setup required!** The first time you run the skill, it will: 1. Detect if `uv` is installed in the container -2. If not, automatically install it (takes ~5 seconds) +2. If not, automatically install it with `pipx install uv` (installing `pipx` itself via `apt-get` first if it is missing) 3. Use `uv run` to execute the analysis with auto-managed dependencies On subsequent runs, `uv` will already be available and dependencies will be cached. +Only `pipx` comes from Ubuntu's package manager; `uv` itself is then installed from PyPI by `pipx`, which avoids piping a downloaded install script into a shell. + ### Components - **ck-build-analysis** - Main bash script that orchestrates Docker, CMake, and analysis From cef3e869b0f859b1d572785e0114aec91ee1dc74 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:56:29 -0600 Subject: [PATCH 11/17] Fix command injection and path traversal vulnerabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security fixes: 1. 
Command Injection Prevention - Use docker exec -e flag to pass variables as environment variables - Change bash -c to use single quotes to prevent shell expansion - Properly quote all variables within the single-quoted commands - Affects: CMAKE configuration, ninja build, trace file search, Python analysis 2. Path Traversal Protection for OUTPUT_FILE - Validate OUTPUT_FILE contains no path separators (/) - Validate OUTPUT_FILE contains no parent directory references (..) - Allows file extensions (.md) but blocks directory traversal - Prevents writing files outside project directory Tested: - ✅ Path traversal blocked: --output="../../../tmp/evil.md" - ✅ Double-dot blocked: --output="..evil.md" - ✅ Normal operation: --output="security_test.md" - ✅ Build process works with quoted variables Co-Authored-By: Claude --- .claude/skills/ck-build-analysis | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis index 9460f3efac9..ad1acf730c5 100755 --- a/.claude/skills/ck-build-analysis +++ b/.claude/skills/ck-build-analysis @@ -95,6 +95,13 @@ if [ -z "$TARGET" ]; then exit 1 fi +# Validate OUTPUT_FILE to prevent path traversal +if [[ "$OUTPUT_FILE" =~ / ]] || [[ "$OUTPUT_FILE" =~ \.\. ]]; then + echo "Error: OUTPUT_FILE must be a simple filename (no path separators or .. allowed)" + echo "Invalid: $OUTPUT_FILE" + exit 1 +fi + echo "═══════════════════════════════════════════════════════════════" echo " CK Build Time Analysis" echo "═══════════════════════════════════════════════════════════════" @@ -115,19 +122,19 @@ if [ "$RECONFIGURE" = true ] || ! 
docker exec "${CONTAINER_NAME}" test -f /works GPU_TARGET=$(detect_gpu_target "${CONTAINER_NAME}") - docker exec "${CONTAINER_NAME}" bash -c " + docker exec -e GPU_TARGET="${GPU_TARGET}" -e GRANULARITY="${GRANULARITY}" "${CONTAINER_NAME}" bash -c ' cd /workspace || exit 1 rm -rf /workspace/build mkdir /workspace/build cd /workspace/build || exit 1 cmake .. -GNinja \ - -DGPU_TARGETS=${GPU_TARGET} \ + -DGPU_TARGETS="${GPU_TARGET}" \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ - -DCMAKE_CXX_FLAGS='-ftime-trace -ftime-trace-granularity=${GRANULARITY}' \ - -DCMAKE_HIP_FLAGS='-ftime-trace -ftime-trace-granularity=${GRANULARITY}' \ + -DCMAKE_CXX_FLAGS="-ftime-trace -ftime-trace-granularity=${GRANULARITY}" \ + -DCMAKE_HIP_FLAGS="-ftime-trace -ftime-trace-granularity=${GRANULARITY}" \ -DBUILD_TESTING=ON 2>&1 | tail -20 - " + ' echo "CMake configuration complete" fi @@ -137,7 +144,7 @@ echo "Building target: $TARGET" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" BUILD_START=$(date +%s) -docker exec "${CONTAINER_NAME}" bash -c "cd /workspace/build && time ninja ${TARGET} 2>&1" +docker exec -e TARGET="${TARGET}" "${CONTAINER_NAME}" bash -c 'cd /workspace/build && time ninja "${TARGET}" 2>&1' BUILD_END=$(date +%s) BUILD_TIME=$((BUILD_END - BUILD_START)) @@ -147,7 +154,7 @@ echo "Build completed in ${BUILD_TIME} seconds" # Find the trace JSON file echo "" echo "Locating trace file..." -TRACE_FILE=$(docker exec "${CONTAINER_NAME}" bash -c "find /workspace/build -name '*.cpp.json' -o -name '*.hip.json' 2>/dev/null | grep -i '${TARGET}' | head -1") +TRACE_FILE=$(docker exec -e TARGET="${TARGET}" "${CONTAINER_NAME}" bash -c 'find /workspace/build -name "*.cpp.json" -o -name "*.hip.json" 2>/dev/null | grep -iF "${TARGET}" | head -1') if [ -z "$TRACE_FILE" ]; then echo "Error: Could not find trace file for target ${TARGET}" @@ -182,13 +189,7 @@ fi echo "Using uv run for automatic dependency management..." 
# Ensure uv is in PATH (handles ~/.local/bin installation) -docker exec "${CONTAINER_NAME}" bash -c "export PATH=\"\$HOME/.local/bin:\$PATH\" && uv run --no-project /tmp/analyze_build_trace.py \ - ${TRACE_FILE} \ - /workspace/${OUTPUT_FILE} \ - ${TARGET} \ - ${GRANULARITY} \ - ${BUILD_TIME} \ - /tmp/ck_build_analysis_templates" +docker exec -e TRACE_FILE="${TRACE_FILE}" -e OUTPUT_FILE="${OUTPUT_FILE}" -e TARGET="${TARGET}" -e GRANULARITY="${GRANULARITY}" -e BUILD_TIME="${BUILD_TIME}" "${CONTAINER_NAME}" bash -c 'export PATH="$HOME/.local/bin:$PATH" && uv run --no-project /tmp/analyze_build_trace.py "${TRACE_FILE}" "/workspace/${OUTPUT_FILE}" "${TARGET}" "${GRANULARITY}" "${BUILD_TIME}" /tmp/ck_build_analysis_templates' # Copy report back to host docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPUT_FILE}" From 4b8471b68156d5d70d75424bbc0771d0af5df6af Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 14 Jan 2026 00:04:08 -0600 Subject: [PATCH 12/17] Use integer microseconds instead of float milliseconds Performance and precision improvements: - Parse durations as integers (microseconds) instead of floats (milliseconds) - Accumulate all durations in microseconds for better precision - Use integer division for average calculations - Avoid floating point arithmetic throughout data processing Template updates: - Add us_to_ms and us_to_s Jinja2 filters for display formatting - Convert microseconds to milliseconds/seconds only for display - Update all template fields to use conversion filters - Maintain precision in calculations, format only for output Benefits: - Better precision (no floating point rounding errors) - Faster processing (integer arithmetic) - Matches native trace file format (microseconds) - Cleaner separation of storage vs display formatting Co-Authored-By: Claude --- .claude/skills/analyze_build_trace.py | 25 ++++++++++++------- .../templates/build_analysis_report.md.jinja | 
22 ++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/.claude/skills/analyze_build_trace.py b/.claude/skills/analyze_build_trace.py index f0f7d7fad3c..208e038f228 100755 --- a/.claude/skills/analyze_build_trace.py +++ b/.claude/skills/analyze_build_trace.py @@ -55,13 +55,13 @@ def process_events(data): """Process trace events and extract template instantiation statistics.""" print('Processing events...') - template_stats = defaultdict(lambda: {'count': 0, 'total_dur': 0.0}) - phase_stats = defaultdict(float) + template_stats = defaultdict(lambda: {'count': 0, 'total_dur': 0}) + phase_stats = defaultdict(int) top_individual = [] for event in data.get('traceEvents', []): name = event.get('name', '') - dur = event.get('dur', 0) / 1000.0 # Convert to milliseconds + dur = int(event.get('dur', 0)) # Keep as integer microseconds if name and dur > 0: phase_stats[name] += dur @@ -107,7 +107,7 @@ def prepare_template_data(template_stats, phase_stats, top_individual): templates_by_time.append((name, { 'count': stats['count'], 'total_dur': stats['total_dur'], - 'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0, + 'avg': stats['total_dur'] // stats['count'] if stats['count'] > 0 else 0, 'pct': 100 * stats['total_dur'] / total_template_time if total_template_time > 0 else 0 })) @@ -117,7 +117,7 @@ def prepare_template_data(template_stats, phase_stats, top_individual): templates_by_count.append((name, { 'count': stats['count'], 'total_dur': stats['total_dur'], - 'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0 + 'avg': stats['total_dur'] // stats['count'] if stats['count'] > 0 else 0 })) # Add friendly type names to individual instantiations @@ -165,9 +165,19 @@ def pad(value, length): """Pad string to specified length.""" return f'{value:<{length}}' + def us_to_ms(value): + """Convert microseconds to milliseconds.""" + return value / 1000.0 + + def us_to_s(value): + """Convert microseconds to seconds.""" 
+ return value / 1000000.0 + env.filters['format_number'] = format_number env.filters['truncate'] = truncate env.filters['pad'] = pad + env.filters['us_to_ms'] = us_to_ms + env.filters['us_to_s'] = us_to_s return env @@ -183,9 +193,6 @@ def generate_report(env, data, args, total_events): target=args['target'], granularity=args['granularity'], build_time=args['build_time'], - trace_time_sec=f'{data["total_trace_time"] / 1000:.1f}', - template_time_sec=f'{data["total_template_time"] / 1000:.1f}', - template_pct=f'{100 * data["total_template_time"] / data["total_trace_time"]:.1f}', total_events=total_events, total_instantiations=data['total_inst'], unique_families=data['unique_families'], @@ -196,7 +203,7 @@ def generate_report(env, data, args, total_events): templates_by_time=data['templates_by_time'], templates_by_count=data['templates_by_count'], median_count=data['median_count'], - top10_pct=f'{data["top10_pct"]:.1f}' + top10_pct=data['top10_pct'] ) return report_content diff --git a/.claude/skills/templates/build_analysis_report.md.jinja b/.claude/skills/templates/build_analysis_report.md.jinja index b6c4b2bbf5b..37933fe6074 100644 --- a/.claude/skills/templates/build_analysis_report.md.jinja +++ b/.claude/skills/templates/build_analysis_report.md.jinja @@ -7,8 +7,8 @@ ## Executive Summary - **Wall Clock Time:** {{ build_time }} seconds -- **Trace Time:** {{ trace_time_sec }} seconds -- **Template Instantiation Time:** {{ template_time_sec }} seconds ({{ template_pct }}% of trace) +- **Trace Time:** {{ total_trace_time|us_to_s|round(1) }} seconds +- **Template Instantiation Time:** {{ total_template_time|us_to_s|round(1) }} seconds ({{ (100 * total_template_time / total_trace_time)|round(1) }}% of trace) - **Total Events Captured:** {{ total_events|format_number }} - **Total Template Instantiations:** {{ total_instantiations|format_number }} - **Unique Template Families:** {{ unique_families }} @@ -18,7 +18,7 @@ | Phase | Time (ms) | Time (s) | % of Total | 
|-------|-----------|----------|------------| {% for phase, dur in phases[:20] -%} -| {{ phase|pad(40) }} | {{ "%9.2f"|format(dur) }} | {{ "%8.2f"|format(dur/1000) }} | {{ "%9.1f"|format(100 * dur / total_trace_time) }}% | +| {{ phase|pad(40) }} | {{ "%9.2f"|format(dur|us_to_ms) }} | {{ "%8.2f"|format(dur|us_to_s) }} | {{ "%9.1f"|format(100 * dur / total_trace_time) }}% | {% endfor %} ## Top 30 Most Expensive Individual Instantiations @@ -26,7 +26,7 @@ | Rank | Template | Type | Time (ms) | |------|----------|------|-----------| {% for inst in top_individual[:30] -%} -| {{ "%4d"|format(loop.index) }} | {{ inst.detail|truncate(70) }} | {{ inst.inst_type|pad(5) }} | {{ "%9.2f"|format(inst.dur) }} | +| {{ "%4d"|format(loop.index) }} | {{ inst.detail|truncate(70) }} | {{ inst.inst_type|pad(5) }} | {{ "%9.2f"|format(inst.dur|us_to_ms) }} | {% endfor %} ## Template Families by Total Time (Top 50) @@ -34,7 +34,7 @@ | Rank | Template Family | Count | Total (ms) | Avg (ms) | % of Total | |------|-----------------|-------|------------|----------|------------| {% for name, stats in templates_by_time[:50] -%} -| {{ "%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ "%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur) }} | {{ "%8.2f"|format(stats.avg) }} | {{ "%9.1f"|format(stats.pct) }}% | +| {{ "%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ "%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur|us_to_ms) }} | {{ "%8.2f"|format(stats.avg|us_to_ms) }} | {{ "%9.1f"|format(stats.pct) }}% | {% endfor %} ## Template Families by Instantiation Count (Top 50) @@ -42,23 +42,23 @@ | Rank | Template Family | Count | Total (ms) | Avg (ms) | |------|-----------------|-------|------------|----------| {% for name, stats in templates_by_count[:50] -%} -| {{ "%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ "%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur) }} | {{ "%8.2f"|format(stats.avg) }} | +| {{ 
"%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ "%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur|us_to_ms) }} | {{ "%8.2f"|format(stats.avg|us_to_ms) }} | {% endfor %} ## Key Insights ### 1. Template Instantiation Impact -- Template instantiation accounts for {{ template_pct }}% of total trace time +- Template instantiation accounts for {{ (100 * total_template_time / total_trace_time)|round(1) }}% of total trace time {% if unique_families >= 10 -%} -- Top 10 template families account for {{ top10_pct }}% of instantiation time +- Top 10 template families account for {{ top10_pct|round(1) }}% of instantiation time {% endif %} ### 2. Most Expensive Templates {% if templates_by_time|length > 0 -%} -- **{{ templates_by_time[0][0] }}**: {{ templates_by_time[0][1].count|format_number }} instantiations, {{ "%.2f"|format(templates_by_time[0][1].total_dur/1000) }}s total +- **{{ templates_by_time[0][0] }}**: {{ templates_by_time[0][1].count|format_number }} instantiations, {{ (templates_by_time[0][1].total_dur|us_to_s)|round(2) }}s total {% endif -%} {% if templates_by_time|length > 1 -%} -- **{{ templates_by_time[1][0] }}**: {{ templates_by_time[1][1].count|format_number }} instantiations, {{ "%.2f"|format(templates_by_time[1][1].avg) }}ms average +- **{{ templates_by_time[1][0] }}**: {{ templates_by_time[1][1].count|format_number }} instantiations, {{ (templates_by_time[1][1].avg|us_to_ms)|round(2) }}ms average {% endif %} ## Optimization Recommendations @@ -83,7 +83,7 @@ - **Total Unique Templates:** {{ unique_families }} - **Total Instantiations:** {{ total_instantiations|format_number }} {% if total_instantiations > 0 -%} -- **Average Instantiation Time:** {{ "%.3f"|format(total_template_time/total_instantiations) }}ms +- **Average Instantiation Time:** {{ ((total_template_time // total_instantiations)|us_to_ms)|round(3) }}ms {% endif -%} {% if unique_families > 0 -%} - **Median Template Family Count:** {{ median_count }} From 
8fcf1595a9cba1b236314ebb925ea5a63f796d44 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 14 Jan 2026 00:18:14 -0600 Subject: [PATCH 13/17] Replace hardcoded recommendations with data-driven insights MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of generic boilerplate advice, generate specific actionable recommendations based on the actual analysis data: High-Impact Targets (by total time): - Show top 5 templates with actual times and percentages - Recommend strategy based on patterns: - High count (>100) → Extern templates - High individual cost (>50ms) → Template specialization - Otherwise → Explicit instantiation Frequently Instantiated (>100 times): - Identify templates compiled repeatedly - Recommend PCH or extern templates Most Expensive Individual Instantiations: - Show top 3 specific instantiations to profile - Point to exact templates consuming most time Example before (useless): "Focus on High-Impact Templates: Address top 10 families first" Example after (actionable): "TensorDescriptor - 4.2s total (18.1%) - 2,546 instantiations, 1.65ms average - Strategy: Extern templates - High instantiation count" Co-Authored-By: Claude --- .../templates/build_analysis_report.md.jinja | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/.claude/skills/templates/build_analysis_report.md.jinja b/.claude/skills/templates/build_analysis_report.md.jinja index 37933fe6074..3fce59da108 100644 --- a/.claude/skills/templates/build_analysis_report.md.jinja +++ b/.claude/skills/templates/build_analysis_report.md.jinja @@ -63,20 +63,31 @@ ## Optimization Recommendations -### Short Term -1. **Focus on High-Impact Templates**: Address top 10 families first -2. **Explicit Template Instantiation**: Pre-instantiate common configurations -3. **Extern Templates**: Mark frequently-used templates as extern in headers - -### Medium Term -1. 
**Precompiled Headers**: Include heavy templates in PCH -2. **Template Specialization**: Replace general templates with specialized versions -3. **Template Depth Reduction**: Simplify template hierarchies - -### Long Term -1. **Architectural Review**: Evaluate necessity of deep template metaprogramming -2. **C++20 Concepts**: Earlier constraint checking, fewer instantiations -3. **Build Caching**: Distributed build cache for template instantiations +### High-Impact Targets (by total time) +{% for name, stats in templates_by_time[:5] -%} +**{{ loop.index }}. {{ name }}** - {{ (stats.total_dur|us_to_s)|round(1) }}s total ({{ stats.pct|round(1) }}%) + - {{ stats.count|format_number }} instantiations, {{ (stats.avg|us_to_ms)|round(2) }}ms average + {% if stats.count > 100 -%} + - Strategy: Extern templates - High instantiation count suggests repeated compilation + {% elif stats.avg|us_to_ms > 50 -%} + - Strategy: Template specialization - High individual cost suggests complexity + {% else -%} + - Strategy: Explicit instantiation - Pre-instantiate common configurations + {% endif %} + +{% endfor %} +### Frequently Instantiated (optimization candidates) +{% for name, stats in templates_by_count[:5] if stats.count > 100 -%} +**{{ name }}** - {{ stats.count|format_number }} times ({{ (stats.total_dur|us_to_s)|round(2) }}s total) + - Consider: Precompiled headers or extern templates to avoid recompilation + +{% endfor %} +### Most Expensive Individual Instantiations +{% for inst in top_individual[:3] -%} +**{{ loop.index }}. 
{{ inst.detail|truncate(60) }}** - {{ (inst.dur|us_to_ms)|round(1) }}ms + - Strategy: Profile and simplify this specific instantiation + +{% endfor %} ## Detailed Statistics From 6c187f54f2fe87838b5211ef4086bdc92879f433 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 14 Jan 2026 00:27:42 -0600 Subject: [PATCH 14/17] Add copyright header and format with ruff - Add AMD copyright header and MIT license identifier - Format code with ruff for consistent style - Remove unused pathlib.Path import - Convert single quotes to double quotes - Fix line wrapping and indentation per ruff style All ruff checks now pass without errors. Co-Authored-By: Claude --- .claude/skills/analyze_build_trace.py | 200 +++++++++++++++----------- 1 file changed, 113 insertions(+), 87 deletions(-) diff --git a/.claude/skills/analyze_build_trace.py b/.claude/skills/analyze_build_trace.py index 208e038f228..abe8c23fa0c 100755 --- a/.claude/skills/analyze_build_trace.py +++ b/.claude/skills/analyze_build_trace.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + # /// script # requires-python = ">=3.8" # dependencies = [ @@ -17,7 +20,6 @@ import sys from collections import defaultdict from datetime import datetime -from pathlib import Path try: from jinja2 import Environment, FileSystemLoader @@ -31,119 +33,143 @@ def parse_arguments(): """Parse command-line arguments.""" if len(sys.argv) < 7: - print("Usage: analyze_build_trace.py ") + print( + "Usage: analyze_build_trace.py " + ) sys.exit(1) return { - 'trace_file': sys.argv[1], - 'output_file': sys.argv[2], - 'target': sys.argv[3], - 'granularity': sys.argv[4], - 'build_time': sys.argv[5], - 'template_dir': sys.argv[6], + "trace_file": sys.argv[1], + "output_file": sys.argv[2], + "target": sys.argv[3], + "granularity": sys.argv[4], + "build_time": sys.argv[5], + "template_dir": sys.argv[6], } def load_trace_data(trace_file): """Load and parse the trace JSON file.""" - print(f'Loading trace file: {trace_file}') - with open(trace_file, 'r') as f: + print(f"Loading trace file: {trace_file}") + with open(trace_file, "r") as f: return json.load(f) def process_events(data): """Process trace events and extract template instantiation statistics.""" - print('Processing events...') + print("Processing events...") - template_stats = defaultdict(lambda: {'count': 0, 'total_dur': 0}) + template_stats = defaultdict(lambda: {"count": 0, "total_dur": 0}) phase_stats = defaultdict(int) top_individual = [] - for event in data.get('traceEvents', []): - name = event.get('name', '') - dur = int(event.get('dur', 0)) # Keep as integer microseconds + for event in data.get("traceEvents", []): + name = event.get("name", "") + dur = int(event.get("dur", 0)) # Keep as integer microseconds if name and dur > 0: phase_stats[name] += dur - if name in ['InstantiateFunction', 'InstantiateClass']: - detail = event.get('args', {}).get('detail', '') - top_individual.append({ - 'detail': detail, - 'dur': dur, - 'type': name - }) + if name in ["InstantiateFunction", 
"InstantiateClass"]: + detail = event.get("args", {}).get("detail", "") + top_individual.append({"detail": detail, "dur": dur, "type": name}) # Extract template name (everything before '<' or '(') - match = re.match(r'^([^<(]+)', detail) + match = re.match(r"^([^<(]+)", detail) if match: template_name = match.group(1).strip() # Normalize template names - template_name = re.sub(r'^ck::', '', template_name) - template_name = re.sub(r'^std::', 'std::', template_name) + template_name = re.sub(r"^ck::", "", template_name) + template_name = re.sub(r"^std::", "std::", template_name) - template_stats[template_name]['count'] += 1 - template_stats[template_name]['total_dur'] += dur + template_stats[template_name]["count"] += 1 + template_stats[template_name]["total_dur"] += dur return template_stats, phase_stats, top_individual def prepare_template_data(template_stats, phase_stats, top_individual): """Prepare and calculate derived statistics for template rendering.""" - print('Sorting data...') + print("Sorting data...") # Sort data sorted_phases = sorted(phase_stats.items(), key=lambda x: x[1], reverse=True) - top_individual.sort(key=lambda x: x['dur'], reverse=True) + top_individual.sort(key=lambda x: x["dur"], reverse=True) # Calculate totals - total_template_time = sum(s['total_dur'] for s in template_stats.values()) + total_template_time = sum(s["total_dur"] for s in template_stats.values()) total_trace_time = sum(phase_stats.values()) - total_inst = sum(s['count'] for s in template_stats.values()) + total_inst = sum(s["count"] for s in template_stats.values()) # Prepare templates by time with calculated fields templates_by_time = [] - for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['total_dur'], reverse=True): - templates_by_time.append((name, { - 'count': stats['count'], - 'total_dur': stats['total_dur'], - 'avg': stats['total_dur'] // stats['count'] if stats['count'] > 0 else 0, - 'pct': 100 * stats['total_dur'] / total_template_time if 
total_template_time > 0 else 0 - })) + for name, stats in sorted( + template_stats.items(), key=lambda x: x[1]["total_dur"], reverse=True + ): + templates_by_time.append( + ( + name, + { + "count": stats["count"], + "total_dur": stats["total_dur"], + "avg": stats["total_dur"] // stats["count"] + if stats["count"] > 0 + else 0, + "pct": 100 * stats["total_dur"] / total_template_time + if total_template_time > 0 + else 0, + }, + ) + ) # Prepare templates by count templates_by_count = [] - for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['count'], reverse=True): - templates_by_count.append((name, { - 'count': stats['count'], - 'total_dur': stats['total_dur'], - 'avg': stats['total_dur'] // stats['count'] if stats['count'] > 0 else 0 - })) + for name, stats in sorted( + template_stats.items(), key=lambda x: x[1]["count"], reverse=True + ): + templates_by_count.append( + ( + name, + { + "count": stats["count"], + "total_dur": stats["total_dur"], + "avg": stats["total_dur"] // stats["count"] + if stats["count"] > 0 + else 0, + }, + ) + ) # Add friendly type names to individual instantiations for inst in top_individual: - inst['inst_type'] = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class' + inst["inst_type"] = "Func" if inst["type"] == "InstantiateFunction" else "Class" # Calculate additional metrics median_count = 0 if len(template_stats) > 0: - median_count = sorted([s["count"] for s in template_stats.values()])[len(template_stats) // 2] + median_count = sorted([s["count"] for s in template_stats.values()])[ + len(template_stats) // 2 + ] top10_pct = 0 if len(templates_by_time) >= 10: - top10_pct = 100 * sum(s[1]["total_dur"] for s in templates_by_time[:10]) / total_template_time + top10_pct = ( + 100 + * sum(s[1]["total_dur"] for s in templates_by_time[:10]) + / total_template_time + ) return { - 'sorted_phases': sorted_phases, - 'top_individual': top_individual, - 'templates_by_time': templates_by_time, - 'templates_by_count': 
templates_by_count, - 'total_template_time': total_template_time, - 'total_trace_time': total_trace_time, - 'total_inst': total_inst, - 'median_count': median_count, - 'top10_pct': top10_pct, - 'unique_families': len(template_stats), + "sorted_phases": sorted_phases, + "top_individual": top_individual, + "templates_by_time": templates_by_time, + "templates_by_count": templates_by_count, + "total_template_time": total_template_time, + "total_trace_time": total_trace_time, + "total_inst": total_inst, + "median_count": median_count, + "top10_pct": top10_pct, + "unique_families": len(template_stats), } @@ -153,17 +179,17 @@ def setup_jinja_environment(template_dir): def format_number(value): """Format number with thousand separators.""" - return f'{value:,}' + return f"{value:,}" def truncate(value, length): """Truncate string to length with ellipsis.""" if len(value) > length: - return value[:length - 3] + '...' + return value[: length - 3] + "..." return value def pad(value, length): """Pad string to specified length.""" - return f'{value:<{length}}' + return f"{value:<{length}}" def us_to_ms(value): """Convert microseconds to milliseconds.""" @@ -173,37 +199,37 @@ def us_to_s(value): """Convert microseconds to seconds.""" return value / 1000000.0 - env.filters['format_number'] = format_number - env.filters['truncate'] = truncate - env.filters['pad'] = pad - env.filters['us_to_ms'] = us_to_ms - env.filters['us_to_s'] = us_to_s + env.filters["format_number"] = format_number + env.filters["truncate"] = truncate + env.filters["pad"] = pad + env.filters["us_to_ms"] = us_to_ms + env.filters["us_to_s"] = us_to_s return env def generate_report(env, data, args, total_events): """Generate the final report using Jinja2 template.""" - print('Rendering report with Jinja2...') + print("Rendering report with Jinja2...") - template = env.get_template('build_analysis_report.md.jinja') + template = env.get_template("build_analysis_report.md.jinja") report_content = template.render( 
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - target=args['target'], - granularity=args['granularity'], - build_time=args['build_time'], + target=args["target"], + granularity=args["granularity"], + build_time=args["build_time"], total_events=total_events, - total_instantiations=data['total_inst'], - unique_families=data['unique_families'], - total_trace_time=data['total_trace_time'], - total_template_time=data['total_template_time'], - phases=data['sorted_phases'], - top_individual=data['top_individual'], - templates_by_time=data['templates_by_time'], - templates_by_count=data['templates_by_count'], - median_count=data['median_count'], - top10_pct=data['top10_pct'] + total_instantiations=data["total_inst"], + unique_families=data["unique_families"], + total_trace_time=data["total_trace_time"], + total_template_time=data["total_template_time"], + phases=data["sorted_phases"], + top_individual=data["top_individual"], + templates_by_time=data["templates_by_time"], + templates_by_count=data["templates_by_count"], + median_count=data["median_count"], + top10_pct=data["top10_pct"], ) return report_content @@ -214,8 +240,8 @@ def main(): args = parse_arguments() # Load trace data - trace_data = load_trace_data(args['trace_file']) - total_events = len(trace_data.get('traceEvents', [])) + trace_data = load_trace_data(args["trace_file"]) + total_events = len(trace_data.get("traceEvents", [])) # Process events template_stats, phase_stats, top_individual = process_events(trace_data) @@ -224,18 +250,18 @@ def main(): data = prepare_template_data(template_stats, phase_stats, top_individual) # Setup Jinja2 environment - env = setup_jinja_environment(args['template_dir']) + env = setup_jinja_environment(args["template_dir"]) # Generate report report_content = generate_report(env, data, args, total_events) # Write output - with open(args['output_file'], 'w') as f: + with open(args["output_file"], "w") as f: f.write(report_content) - print(f'Report generated: 
{args["output_file"]}') - print(f'Report size: {len(report_content)} bytes') + print(f"Report generated: {args['output_file']}") + print(f"Report size: {len(report_content)} bytes") -if __name__ == '__main__': +if __name__ == "__main__": main() From 23ea6ed4c69661317850de3b50d9c7f517c013da Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 14 Jan 2026 00:59:57 -0600 Subject: [PATCH 15/17] Add copyright headers to all shell scripts Add AMD copyright and MIT license identifier to: - common.sh - ck-build-analysis - ck-docker Matches the copyright header format used throughout the codebase. Co-Authored-By: Claude --- .claude/skills/ck-build-analysis | 3 +++ .claude/skills/ck-docker | 3 +++ .claude/skills/common.sh | 3 +++ 3 files changed, 9 insertions(+) diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis index ad1acf730c5..5d012bfd12a 100755 --- a/.claude/skills/ck-build-analysis +++ b/.claude/skills/ck-build-analysis @@ -1,4 +1,7 @@ #!/bin/bash +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + # CK Build Analysis Skill - Analyze build times using -ftime-trace set -e diff --git a/.claude/skills/ck-docker b/.claude/skills/ck-docker index b7bafd96c2b..f6115343dc7 100755 --- a/.claude/skills/ck-docker +++ b/.claude/skills/ck-docker @@ -1,4 +1,7 @@ #!/bin/bash +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + # CK Docker Skill - Build and test composable_kernel in Docker with ROCm support set -e diff --git a/.claude/skills/common.sh b/.claude/skills/common.sh index 1da7675705e..cdb20ac8c39 100644 --- a/.claude/skills/common.sh +++ b/.claude/skills/common.sh @@ -1,4 +1,7 @@ #!/bin/bash +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + # Common utilities for CK Docker skills # Shared configuration and helper functions From 9f4f9ce6a51030e954731bd6cf8c670709d12643 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 14 Jan 2026 11:49:03 -0600 Subject: [PATCH 16/17] Add multi-file support to build analysis tool Enhanced analyze_build_trace.py to aggregate statistics across multiple trace files instead of analyzing only one. The tool now accepts a build directory, comma-separated file list, or single file. Key improvements: - Process all .cpp.json and .hip.json files in build directory - Track per-file statistics and template time contributions - Show which file each expensive instantiation originated from - Maintain backward compatibility with single-file analysis Updated ck-build-analysis skill to pass build directory instead of searching for a single trace file. Validated against ClangBuildAnalyzer with <0.31% variance on individual instantiation times across 392,168 aggregated events. 
Co-Authored-By: Claude --- .claude/skills/analyze_build_trace.py | 156 +++++++++++++----- .claude/skills/ck-build-analysis | 26 ++- .../templates/build_analysis_report.md.jinja | 23 ++- 3 files changed, 156 insertions(+), 49 deletions(-) diff --git a/.claude/skills/analyze_build_trace.py b/.claude/skills/analyze_build_trace.py index abe8c23fa0c..c921cbe5d21 100755 --- a/.claude/skills/analyze_build_trace.py +++ b/.claude/skills/analyze_build_trace.py @@ -16,10 +16,12 @@ """ import json +import os import re import sys from collections import defaultdict from datetime import datetime +from pathlib import Path try: from jinja2 import Environment, FileSystemLoader @@ -34,12 +36,13 @@ def parse_arguments(): """Parse command-line arguments.""" if len(sys.argv) < 7: print( - "Usage: analyze_build_trace.py " + "Usage: analyze_build_trace.py " ) + print(" trace_files_or_dir: Comma-separated list of trace files OR directory containing .json files") sys.exit(1) return { - "trace_file": sys.argv[1], + "trace_input": sys.argv[1], "output_file": sys.argv[2], "target": sys.argv[3], "granularity": sys.argv[4], @@ -48,53 +51,126 @@ def parse_arguments(): } -def load_trace_data(trace_file): - """Load and parse the trace JSON file.""" - print(f"Loading trace file: {trace_file}") - with open(trace_file, "r") as f: - return json.load(f) +def find_trace_files(trace_input): + """Find all trace files from input (file list, single file, or directory).""" + trace_files = [] + + # Check if it's a directory + if os.path.isdir(trace_input): + print(f"Scanning directory: {trace_input}") + for root, dirs, files in os.walk(trace_input): + for file in files: + # Include .cpp.json and .hip.json, exclude compile_commands.json and CMake files + if file.endswith(('.cpp.json', '.hip.json')) and 'CMakeFiles' in root: + trace_files.append(os.path.join(root, file)) + trace_files.sort() + # Check if it's a comma-separated list + elif ',' in trace_input: + trace_files = [f.strip() for f in 
trace_input.split(',')] + # Single file + else: + trace_files = [trace_input] + + # Filter out non-existent files + valid_files = [f for f in trace_files if os.path.isfile(f)] + + if not valid_files: + print(f"Error: No valid trace files found in: {trace_input}", file=sys.stderr) + sys.exit(1) + + print(f"Found {len(valid_files)} trace file(s)") + return valid_files + + +def load_trace_data(trace_files): + """Load and parse multiple trace JSON files.""" + all_data = [] + + for trace_file in trace_files: + print(f" Loading: {trace_file}") + try: + with open(trace_file, "r") as f: + data = json.load(f) + # Get file basename for tracking + file_name = os.path.basename(trace_file) + all_data.append({ + 'file': file_name, + 'path': trace_file, + 'data': data + }) + except Exception as e: + print(f" Warning: Failed to load {trace_file}: {e}", file=sys.stderr) + + return all_data -def process_events(data): - """Process trace events and extract template instantiation statistics.""" - print("Processing events...") +def process_events(all_trace_data): + """Process trace events from multiple files and extract statistics.""" + print("Processing events from all files...") template_stats = defaultdict(lambda: {"count": 0, "total_dur": 0}) phase_stats = defaultdict(int) top_individual = [] + file_stats = [] + total_events = 0 + + for trace_info in all_trace_data: + file_name = trace_info['file'] + data = trace_info['data'] + events = data.get("traceEvents", []) + + file_template_time = 0 + file_event_count = len(events) + total_events += file_event_count + + print(f" Processing {file_name}: {file_event_count:,} events") + + for event in events: + name = event.get("name", "") + dur = int(event.get("dur", 0)) # Keep as integer microseconds + + if name and dur > 0: + phase_stats[name] += dur - for event in data.get("traceEvents", []): - name = event.get("name", "") - dur = int(event.get("dur", 0)) # Keep as integer microseconds + if name in ["InstantiateFunction", 
"InstantiateClass"]: + detail = event.get("args", {}).get("detail", "") + top_individual.append({ + "detail": detail, + "dur": dur, + "type": name, + "file": file_name + }) - if name and dur > 0: - phase_stats[name] += dur + file_template_time += dur - if name in ["InstantiateFunction", "InstantiateClass"]: - detail = event.get("args", {}).get("detail", "") - top_individual.append({"detail": detail, "dur": dur, "type": name}) + # Extract template name (everything before '<' or '(') + match = re.match(r"^([^<(]+)", detail) + if match: + template_name = match.group(1).strip() + # Normalize template names + template_name = re.sub(r"^ck::", "", template_name) + template_name = re.sub(r"^std::", "std::", template_name) - # Extract template name (everything before '<' or '(') - match = re.match(r"^([^<(]+)", detail) - if match: - template_name = match.group(1).strip() - # Normalize template names - template_name = re.sub(r"^ck::", "", template_name) - template_name = re.sub(r"^std::", "std::", template_name) + template_stats[template_name]["count"] += 1 + template_stats[template_name]["total_dur"] += dur - template_stats[template_name]["count"] += 1 - template_stats[template_name]["total_dur"] += dur + file_stats.append({ + 'name': file_name, + 'events': file_event_count, + 'template_time': file_template_time + }) - return template_stats, phase_stats, top_individual + return template_stats, phase_stats, top_individual, file_stats, total_events -def prepare_template_data(template_stats, phase_stats, top_individual): +def prepare_template_data(template_stats, phase_stats, top_individual, file_stats): """Prepare and calculate derived statistics for template rendering.""" print("Sorting data...") # Sort data sorted_phases = sorted(phase_stats.items(), key=lambda x: x[1], reverse=True) top_individual.sort(key=lambda x: x["dur"], reverse=True) + file_stats.sort(key=lambda x: x["template_time"], reverse=True) # Calculate totals total_template_time = sum(s["total_dur"] for s in 
template_stats.values()) @@ -170,6 +246,7 @@ def prepare_template_data(template_stats, phase_stats, top_individual): "median_count": median_count, "top10_pct": top10_pct, "unique_families": len(template_stats), + "file_stats": file_stats, } @@ -208,7 +285,7 @@ def us_to_s(value): return env -def generate_report(env, data, args, total_events): +def generate_report(env, data, args, total_events, num_files): """Generate the final report using Jinja2 template.""" print("Rendering report with Jinja2...") @@ -220,6 +297,7 @@ def generate_report(env, data, args, total_events): granularity=args["granularity"], build_time=args["build_time"], total_events=total_events, + num_files=num_files, total_instantiations=data["total_inst"], unique_families=data["unique_families"], total_trace_time=data["total_trace_time"], @@ -230,6 +308,7 @@ def generate_report(env, data, args, total_events): templates_by_count=data["templates_by_count"], median_count=data["median_count"], top10_pct=data["top10_pct"], + file_stats=data["file_stats"], ) return report_content @@ -239,28 +318,29 @@ def main(): """Main entry point for the analysis tool.""" args = parse_arguments() - # Load trace data - trace_data = load_trace_data(args["trace_file"]) - total_events = len(trace_data.get("traceEvents", [])) + # Find and load trace files + trace_files = find_trace_files(args["trace_input"]) + all_trace_data = load_trace_data(trace_files) - # Process events - template_stats, phase_stats, top_individual = process_events(trace_data) + # Process events from all files + template_stats, phase_stats, top_individual, file_stats, total_events = process_events(all_trace_data) # Prepare template data - data = prepare_template_data(template_stats, phase_stats, top_individual) + data = prepare_template_data(template_stats, phase_stats, top_individual, file_stats) # Setup Jinja2 environment env = setup_jinja_environment(args["template_dir"]) # Generate report - report_content = generate_report(env, data, args, 
total_events) + report_content = generate_report(env, data, args, total_events, len(all_trace_data)) # Write output with open(args["output_file"], "w") as f: f.write(report_content) print(f"Report generated: {args['output_file']}") - print(f"Report size: {len(report_content)} bytes") + print(f"Report size: {len(report_content):,} bytes") + print(f"Analyzed {len(all_trace_data)} file(s) with {total_events:,} total events") if __name__ == "__main__": diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis index 5d012bfd12a..8131f6d6a6a 100755 --- a/.claude/skills/ck-build-analysis +++ b/.claude/skills/ck-build-analysis @@ -154,19 +154,26 @@ BUILD_TIME=$((BUILD_END - BUILD_START)) echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" echo "Build completed in ${BUILD_TIME} seconds" -# Find the trace JSON file +# Find all trace JSON files for the target echo "" -echo "Locating trace file..." -TRACE_FILE=$(docker exec -e TARGET="${TARGET}" "${CONTAINER_NAME}" bash -c 'find /workspace/build -name "*.cpp.json" -o -name "*.hip.json" 2>/dev/null | grep -iF "${TARGET}" | head -1') +echo "Locating trace files..." 
-if [ -z "$TRACE_FILE" ]; then - echo "Error: Could not find trace file for target ${TARGET}" - echo "Expected pattern: build/**/${TARGET}*.json" +# Count trace files +TRACE_COUNT=$(docker exec -e TARGET="${TARGET}" "${CONTAINER_NAME}" bash -c ' + find /workspace/build -type f \( -name "*.cpp.json" -o -name "*.hip.json" \) 2>/dev/null | \ + grep -vF "compile_commands.json" | wc -l +') + +if [ "$TRACE_COUNT" -eq 0 ]; then + echo "Error: Could not find any trace files in /workspace/build" + echo "Expected .cpp.json or .hip.json files from -ftime-trace compilation" exit 1 fi -TRACE_SIZE=$(docker exec "${CONTAINER_NAME}" bash -c "ls -lh ${TRACE_FILE} | awk '{print \$5}'") -echo "Found trace file: ${TRACE_FILE} (${TRACE_SIZE})" +echo "Found ${TRACE_COUNT} trace file(s) in build directory" + +# We'll pass the build directory to the Python script +BUILD_DIR="/workspace/build" # Generate analysis report echo "" @@ -192,7 +199,8 @@ fi echo "Using uv run for automatic dependency management..." # Ensure uv is in PATH (handles ~/.local/bin installation) -docker exec -e TRACE_FILE="${TRACE_FILE}" -e OUTPUT_FILE="${OUTPUT_FILE}" -e TARGET="${TARGET}" -e GRANULARITY="${GRANULARITY}" -e BUILD_TIME="${BUILD_TIME}" "${CONTAINER_NAME}" bash -c 'export PATH="$HOME/.local/bin:$PATH" && uv run --no-project /tmp/analyze_build_trace.py "${TRACE_FILE}" "/workspace/${OUTPUT_FILE}" "${TARGET}" "${GRANULARITY}" "${BUILD_TIME}" /tmp/ck_build_analysis_templates' +# Pass build directory instead of single file +docker exec -e BUILD_DIR="${BUILD_DIR}" -e OUTPUT_FILE="${OUTPUT_FILE}" -e TARGET="${TARGET}" -e GRANULARITY="${GRANULARITY}" -e BUILD_TIME="${BUILD_TIME}" "${CONTAINER_NAME}" bash -c 'export PATH="$HOME/.local/bin:$PATH" && uv run --no-project /tmp/analyze_build_trace.py "${BUILD_DIR}" "/workspace/${OUTPUT_FILE}" "${TARGET}" "${GRANULARITY}" "${BUILD_TIME}" /tmp/ck_build_analysis_templates' # Copy report back to host docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" 
"${PROJECT_ROOT}/${OUTPUT_FILE}" diff --git a/.claude/skills/templates/build_analysis_report.md.jinja b/.claude/skills/templates/build_analysis_report.md.jinja index 3fce59da108..f91dce14a93 100644 --- a/.claude/skills/templates/build_analysis_report.md.jinja +++ b/.claude/skills/templates/build_analysis_report.md.jinja @@ -3,16 +3,27 @@ **Generated:** {{ timestamp }} **Target:** {{ target }} **Granularity:** {{ granularity }}µs +**Files Analyzed:** {{ num_files }} ## Executive Summary - **Wall Clock Time:** {{ build_time }} seconds - **Trace Time:** {{ total_trace_time|us_to_s|round(1) }} seconds - **Template Instantiation Time:** {{ total_template_time|us_to_s|round(1) }} seconds ({{ (100 * total_template_time / total_trace_time)|round(1) }}% of trace) -- **Total Events Captured:** {{ total_events|format_number }} +- **Total Events Captured:** {{ total_events|format_number }} (across {{ num_files }} file{{ 's' if num_files != 1 else '' }}) - **Total Template Instantiations:** {{ total_instantiations|format_number }} - **Unique Template Families:** {{ unique_families }} +{% if num_files > 1 -%} +## Per-File Analysis + +| File | Events | Template Time (ms) | % of Total | +|------|--------|-------------------|------------| +{% for file in file_stats[:20] -%} +| {{ file.name|truncate(50)|pad(50) }} | {{ "%7d"|format(file.events) }} | {{ "%17.2f"|format(file.template_time|us_to_ms) }} | {{ "%9.1f"|format(100 * file.template_time / total_template_time if total_template_time > 0 else 0) }}% | +{% endfor %} + +{% endif -%} ## Compilation Phase Breakdown | Phase | Time (ms) | Time (s) | % of Total | @@ -23,11 +34,19 @@ ## Top 30 Most Expensive Individual Instantiations +{% if num_files > 1 -%} +| Rank | Template | Type | Time (ms) | File | +|------|----------|------|-----------|------| +{% for inst in top_individual[:30] -%} +| {{ "%4d"|format(loop.index) }} | {{ inst.detail|truncate(50) }} | {{ inst.inst_type|pad(5) }} | {{ "%9.2f"|format(inst.dur|us_to_ms) }} | {{ 
inst.file|truncate(20) }} | +{% endfor -%} +{% else -%} | Rank | Template | Type | Time (ms) | |------|----------|------|-----------| {% for inst in top_individual[:30] -%} | {{ "%4d"|format(loop.index) }} | {{ inst.detail|truncate(70) }} | {{ inst.inst_type|pad(5) }} | {{ "%9.2f"|format(inst.dur|us_to_ms) }} | -{% endfor %} +{% endfor -%} +{% endif %} ## Template Families by Total Time (Top 50) From 911a6b72826a9aeaefd012a7e8077045e9bf8aed Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 14 Jan 2026 11:55:24 -0600 Subject: [PATCH 17/17] Format build analysis script with ruff - Remove unused pathlib import - Normalize to double quotes - Format long lines and data structures - Auto-fix linting issues Co-Authored-By: Claude --- .claude/skills/analyze_build_trace.py | 50 +++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/.claude/skills/analyze_build_trace.py b/.claude/skills/analyze_build_trace.py index c921cbe5d21..3597132f323 100755 --- a/.claude/skills/analyze_build_trace.py +++ b/.claude/skills/analyze_build_trace.py @@ -21,7 +21,6 @@ import sys from collections import defaultdict from datetime import datetime -from pathlib import Path try: from jinja2 import Environment, FileSystemLoader @@ -38,7 +37,9 @@ def parse_arguments(): print( "Usage: analyze_build_trace.py " ) - print(" trace_files_or_dir: Comma-separated list of trace files OR directory containing .json files") + print( + " trace_files_or_dir: Comma-separated list of trace files OR directory containing .json files" + ) sys.exit(1) return { @@ -61,12 +62,12 @@ def find_trace_files(trace_input): for root, dirs, files in os.walk(trace_input): for file in files: # Include .cpp.json and .hip.json, exclude compile_commands.json and CMake files - if file.endswith(('.cpp.json', '.hip.json')) and 'CMakeFiles' in root: + if file.endswith((".cpp.json", ".hip.json")) and "CMakeFiles" in root: 
trace_files.append(os.path.join(root, file)) trace_files.sort() # Check if it's a comma-separated list - elif ',' in trace_input: - trace_files = [f.strip() for f in trace_input.split(',')] + elif "," in trace_input: + trace_files = [f.strip() for f in trace_input.split(",")] # Single file else: trace_files = [trace_input] @@ -93,11 +94,7 @@ def load_trace_data(trace_files): data = json.load(f) # Get file basename for tracking file_name = os.path.basename(trace_file) - all_data.append({ - 'file': file_name, - 'path': trace_file, - 'data': data - }) + all_data.append({"file": file_name, "path": trace_file, "data": data}) except Exception as e: print(f" Warning: Failed to load {trace_file}: {e}", file=sys.stderr) @@ -115,8 +112,8 @@ def process_events(all_trace_data): total_events = 0 for trace_info in all_trace_data: - file_name = trace_info['file'] - data = trace_info['data'] + file_name = trace_info["file"] + data = trace_info["data"] events = data.get("traceEvents", []) file_template_time = 0 @@ -134,12 +131,9 @@ def process_events(all_trace_data): if name in ["InstantiateFunction", "InstantiateClass"]: detail = event.get("args", {}).get("detail", "") - top_individual.append({ - "detail": detail, - "dur": dur, - "type": name, - "file": file_name - }) + top_individual.append( + {"detail": detail, "dur": dur, "type": name, "file": file_name} + ) file_template_time += dur @@ -154,11 +148,13 @@ def process_events(all_trace_data): template_stats[template_name]["count"] += 1 template_stats[template_name]["total_dur"] += dur - file_stats.append({ - 'name': file_name, - 'events': file_event_count, - 'template_time': file_template_time - }) + file_stats.append( + { + "name": file_name, + "events": file_event_count, + "template_time": file_template_time, + } + ) return template_stats, phase_stats, top_individual, file_stats, total_events @@ -323,10 +319,14 @@ def main(): all_trace_data = load_trace_data(trace_files) # Process events from all files - template_stats, 
phase_stats, top_individual, file_stats, total_events = process_events(all_trace_data) + template_stats, phase_stats, top_individual, file_stats, total_events = ( + process_events(all_trace_data) + ) # Prepare template data - data = prepare_template_data(template_stats, phase_stats, top_individual, file_stats) + data = prepare_template_data( + template_stats, phase_stats, top_individual, file_stats + ) # Setup Jinja2 environment env = setup_jinja_environment(args["template_dir"])