diff --git a/.gitignore b/.gitignore index 00a5516d..28d7588b 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,18 @@ Mkfile.old dkms.conf __pycache__/ *.pyc + +# Profiling artifacts +*.pftrace +rocprof_*/ +pytorch_profiles/ + +# Downloaded datasets +MLExamples/PyTorch_Profiling/data/ + +# Generated profiling traces +MLExamples/PyTorch_Profiling/rocprofv3/single_process/ +MLExamples/TinyTransformer/*/counters/ +MLExamples/TinyTransformer/*/traces/ +MLExamples/TinyTransformer/*/github_issue_test/ +MLExamples/inference_benchmark/profiling_results/ diff --git a/MLExamples/TinyTransformer/README.md b/MLExamples/TinyTransformer/README.md index bb1bd657..431099ae 100644 --- a/MLExamples/TinyTransformer/README.md +++ b/MLExamples/TinyTransformer/README.md @@ -1,354 +1,63 @@ +# ML Example: TinyTransformer Profiling Progression +This example keeps the same small decoder-only transformer and changes the implementation one step at a time. The point is not only to make the model faster. It is to see how the profiler output changes when the kernel mix, memory traffic, and framework path change. -# AI Workshop: ROCm Tools for PyTorch AI Workload Profiling +[`version1_pytorch_baseline`](version1_pytorch_baseline) is the main hands-on tutorial. Versions 2 through 4 are comparison points built on the same workload. 
-README.md from `HPCTrainingExamples/MLExamples/TinyTransformer` in the Training Examples repository +## Version map -## Workshop Overview +- [`version1_pytorch_baseline`](version1_pytorch_baseline): plain PyTorch reference implementation and the main tutorial entry point +- [`version2_pytorch_fused`](version2_pytorch_fused): framework-level fusion path; useful for checking whether the stack actually enables the intended fused kernels +- [`version3_triton`](version3_triton): custom Triton kernels for the main transformer building blocks +- [`version4_pytorch_sdpa`](version4_pytorch_sdpa): SDPA-based attention path with the later fused structure kept in place -This hands-on workshop provides a comprehensive guide to profiling AI workloads using AMD ROCm tools and PyTorch. Through progressive optimization of a Tiny LLaMA transformer implementation, participants will master the complete profiling ecosystem from framework-level tools to hardware-specific profilers. +## Recommended order -## Learning Objectives +1. Start with [`version1_pytorch_baseline`](version1_pytorch_baseline) and record the baseline speed, batch time, memory use, hotspot list, and trace structure. +2. Move to [`version2_pytorch_fused`](version2_pytorch_fused) and check whether framework-level fusion changes the kernel mix on your software stack. +3. Use [`version3_triton`](version3_triton) to study the first large change in dispatch count and memory footprint. +4. Use [`version4_pytorch_sdpa`](version4_pytorch_sdpa) to compare a framework attention path against the custom Triton path in version 3. 
-By the end of this workshop, participants will be able to: -- Configure deterministic execution environments for reproducible profiling -- Use PyTorch native profiling tools for performance characterization -- Integrate DeepSpeed FLOPS profiler for computational intensity analysis -- Apply ROCm profiling tools (rocprofv3, rocprof-sys, rocprof-compute) for kernel-level optimization -- Implement progressive optimization techniques from kernel fusion to custom GPU programming -- Perform roofline analysis and bottleneck identification for production AI workloads +## Example measurements -## Workshop Structure +The table below shows one validated set of runs collected in the ROCm 6.4 training container on March 22, 2026. Treat these as example measurements, not as target numbers for every system. -This workshop follows a progressive optimization methodology with four implementation versions, each building upon the previous with enhanced profiling capabilities and performance improvements. +| Version | Avg training speed | Avg batch time | Peak memory | Main observation | +|---------|--------------------|----------------|-------------|------------------| +| V1 baseline | 291.3 samples/sec | 27.5 ms | 434.3 MB | Reference PyTorch path | +| V2 fused | 259.0 samples/sec | 30.9 ms | 434.3 MB | Fused features were not active on this stack | +| V3 Triton | 829.9 samples/sec | 9.6 ms | 193.8 MB | Custom kernels changed both speed and memory use | +| V4 SDPA | 830.7 samples/sec | 9.6 ms | 193.9 MB | SDPA path landed close to V3 on this workload | -### Version Progression +The stable point is the methodology: keep the model fixed, change one implementation layer at a time, and compare the traces, hotspot lists, and memory behavior. -### Small Configuration (Quick Start) -**Config:** Hidden=512, Layers=8, SeqLen=128, Batch=8 +The plot below was generated from the validated container runs with `generate_example_plots.py`. 
-| Version | Speed (samples/sec) | Batch Time (ms) | Forward (ms) | Backward (ms) | Memory (MB) | Speedup | -|---------|---------------------|-----------------|--------------|---------------|-------------|---------| -| **V1 Baseline** | 372.9 | 21.7 | 10.8 | 9.2 | 522.3 | 1.0x | -| **V3 Triton** | 2,065.0 | 3.9 | 3.2 | 0.3 | 281.8 | **5.5x** | +![TinyTransformer example measurements from validated container runs](images/tinytransformer_baseline_comparison.png) -### Medium Configuration (Production Scale) -**Config:** Hidden=1024, Layers=12, SeqLen=512, Batch=16 +## Common profiling tools -| Version | Throughput (tok/s) | Batch (ms) | Forward (ms) | Backward (ms) | Optimizer (ms) | Memory (MB) | Speedup | -|---------|-------------------|------------|--------------|---------------|----------------|-------------|---------| -| **V1 Baseline** | 50,017 | 163.8 | 50.3 | 107.4 | 6.1 | 2,358.7 | 1.0x | -| **V2 Fused** | 60,192 | 136.1 | 44.8 | 85.6 | 5.8 | 2,358.9 | 1.20x | -| **V3 Triton** | 156,652 | 52.3 | 51.3 | 0.6 | 0.4 | 916.2 | **3.13x** | -| **V4 Ultra** | 157,169 | 52.1 | 51.1 | 0.6 | 0.4 | 916.5 | **3.14x** | +All version directories provide the same ROCm profiling workflow: -**See [PERFORMANCE_RESULTS.md](PERFORMANCE_RESULTS.md) for complete analysis** +- `./get_hotspots.sh`: quick kernel ranking from `rocprofv3 --kernel-trace --stats` +- `./get_trace.sh`: runtime trace and Perfetto output +- `./get_counters.sh`: full kernel trace output +- `./get_rocprof_compute.sh`: hardware metrics when `rocprof-compute` is supported on the current GPU +- `./get_rocprof_sys.sh`: system trace; this script uses a smaller default step count to keep the run practical -### Profiling Tools Progression +The scripts also accept shared environment overrides through `profile_common.sh`. For example: -Each version introduces additional profiling capabilities: - -1. **PyTorch Profiler**: Framework-level performance analysis -2. 
**DeepSpeed FLOPS Profiler**: Computational efficiency metrics -3. **rocprofv3**: GPU hotspots, device activity tracing and hardware counter collection -4. **rocprof-sys**: System-level performance monitoring -5. **rocprof-compute**: Advanced kernel-level analysis and optimization - -## Prerequisites - -### Hardware Requirements -- AMD GPU with ROCm support (MI100, MI200, MI300 series, or RX 6000/7000 series) -- Minimum 16GB system memory -- ROCm 6.0+ installed and configured - -### Software Requirements -- Python 3.10+ -- PyTorch with ROCm support -- ROCm profiling tools suite -- DeepSpeed (for FLOPS profiler) -- Triton (for advanced versions) - -## Quick Start - -### 0. Set up environment -On the training cluster's compute node, the required environment may be set up using the following -commands: - -```bash -module load rocm pytorch openmpi rocprofiler-compute rocprofiler-systems/develop -``` - -### 1. Verify Environment -```bash -# Check ROCm installation -rocminfo - -# Verify GPU is detected -rocm-smi - -# Check PyTorch + ROCm -python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"N/A\"}')" -``` - -### 2. Run Version 1 (Baseline) - 5 minutes -```bash -cd version1_pytorch_baseline/ -python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Expected output: -# Loss: ~7.0 -# Speed: ~373 samples/sec -# Memory: ~522 MB -``` - -For a deeper analysis with the PyTorch profiler, and visualizing the output in TensorBoard, -please follow the workshop exercises in -[version1_pytorch_baseline/README.md](https://github.com/amd/HPCTrainingExamples/tree/main/MLExamples/TinyTransformer/version1_pytorch_baseline#workshop-exercises). - -### 3. 
Run Version 2 (Fused) - 5 minutes -```bash -cd version2_pytorch_fused -python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 - -# Expected output: -# Loss: 6.9310 -# Speed: 187.6 samples/sec (2x faster) -# Memory: 370.4 MB -``` - -To compare the baseline version1 to the fused version2 performance, -follow instructions in [version2_pytorch_fused/README.md](https://github.com/amd/HPCTrainingExamples/tree/main/MLExamples/TinyTransformer/version2_pytorch_fused#step-1-baseline-comparison). - -Try profiling this workload with ROCm profilers using commands listed in -[version2_pytorch_fused/README.md](https://github.com/amd/HPCTrainingExamples/tree/main/MLExamples/TinyTransformer/version2_pytorch_fused#exercise-3-rocm-tools-deep-dive). -An example of using rocprofv3 on this example is provided below: - -```bash -rocprofv3 --kernel-trace --stats --truncate-kernels -- python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 -``` -The above command produces a hotspot list of GPU kernels. The `--truncate-kernels` option helps remove arguments -from the kernel name for better readability. - -### 3. Run Version 3 (Optimized) - 5 minutes -```bash -cd version3_triton/ -python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Expected output: -# Loss: ~7.0 (same correctness!) -# Speed: ~2065 samples/sec (5.5x faster!) -# Memory: ~282 MB (46% less!) -``` - -An exercise similar to the one you did for version2 is recommended for -this version as well using ROCm profiling tools. As an example, you can -collect a comprehensive timeline trace with host and device activity -with `rocprof-sys` using the command below: - -```bash -rocprof-sys-run --profile --trace -- python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 30 -``` -View the trace at [https://ui.perfetto.dev](https://ui.perfetto.dev). - -### 4. 
Run Version 4 (Ultra optimized) - 5 minutes -```bash -cd version4_pytorch_sdpa/ -python3 tiny_llama_v4.py -``` - - - -## Directory Structure - -``` -ai-workshop-training/ - README.md # This overview - setup/ # Environment and prerequisites - environment_setup.md # Detailed setup instructions - environment_setup.sh # Automated setup script - requirements.txt # Python dependencies - validation_scripts/ # Environment validation - test_environment.py # Comprehensive environment test - test_rocm_installation.py # ROCm stack validation - test_profiling_tools.py # Profiling tools validation - version1_pytorch_baseline/ # Standard PyTorch implementation - README.md # Detailed guided instructions - tiny_llama_v1.py # Enhanced baseline implementation - run_pytorch_profiler.py # PyTorch profiler integration - run_deepspeed_flops.py # DeepSpeed FLOPS profiler - run_all_profilers.sh # Orchestrated profiling script - exercises/ # Hands-on exercises and analysis - exercise_1_baseline_analysis.md - exercise_2_memory_analysis.md - exercise_3_bottleneck_identification.md - version2_pytorch_fused/ # Fused operations optimization - README.md # Fusion optimization guide - tiny_llama_v2.py # Fused implementation - run_pytorch_profiler.py # Enhanced PyTorch profiling - run_deepspeed_flops.py # FLOPS analysis - run_rocprofv3.sh # rocprofv3 integration - run_rocprof_sys.sh # System profiling - run_rocprof_compute.sh # Kernel-level profiling - run_all_profilers.sh # Complete profiling suite - exercises/ # Advanced profiling exercises - exercise_1_fusion_analysis.md - exercise_2_flash_attention.md - exercise_3_rocm_tools_intro.md - version3_triton/ # Triton kernel integration - README.md # Triton optimization guide - tiny_llama_v3.py # Triton-enhanced implementation - triton_kernels.py # Custom Triton kernels - run_pytorch_profiler.py # Framework profiling - run_deepspeed_flops.py # Computational analysis - run_rocprofv3.sh # Legacy profiling - run_rocprof_sys.sh # System monitoring - 
run_rocprof_compute.sh # Advanced kernel analysis - run_all_profilers.sh # Complete profiling - exercises/ # Triton development exercises - exercise_1_triton_basics.md - exercise_2_custom_kernels.md - exercise_3_performance_tuning.md - version4_pytorch_sdpa/ # Ultra-fused implementation - README.md # Ultra-optimization guide - tiny_llama_v4.py # Ultra-fused implementation - triton_ultra_kernels.py # Ultra-fused kernels - [profiling scripts] # Complete profiling suite - exercises/ # Advanced optimization - exercise_1_ultra_fusion.md - exercise_2_register_optimization.md - exercise_3_production_deployment.md - analysis_tools/ # Performance analysis utilities - compare_versions.py # Cross-version performance comparison - roofline_analysis.py # Roofline model implementation - performance_dashboard.py # Interactive performance dashboard - regression_tester.py # Automated regression testing - report_generator.py # Comprehensive report generation - slides/ # Presentation materials - luka_presentation_materials/ # AI workshop slides - workshop_overview.pptx - profiling_methodology.pptx - optimization_techniques.pptx - results_analysis.pptx -``` - -## Workshop Execution Timeline - -### Session 1: Foundation (45 minutes) -- Environment setup and validation -- Version 1 baseline profiling -- PyTorch profiler introduction -- Performance characterization methodology - -### Session 2: Optimization (60 minutes) -- Version 2 kernel fusion techniques -- ROCm tools introduction -- Memory optimization analysis -- Comparative performance analysis - -### Session 3: Advanced Techniques (60 minutes) -- Version 3 Triton kernel development -- Custom GPU programming -- Advanced profiling techniques -- Production optimization strategies - -### Session 4: Mastery (45 minutes) -- Version 4 ultra-fusion implementation -- Complete profiling suite utilization -- Roofline analysis and bottleneck resolution -- Workshop wrap-up and next steps - -## Key Performance Insights - -### Actual Performance 
Results (AMD MI325X, ROCm 6.4.4, PyTorch 2.7.1) - -**Test Configuration:** Batch=8, SeqLen=128, Hidden=512, Layers=8, Heads=8 - -| Metric | V1 Baseline | V3 Optimized | Improvement | -|--------|-------------|--------------|-------------| -| **Training Speed** | 372.9 samples/sec | 2065.0 samples/sec | **5.5x faster** | -| **Batch Time** | 21.7 ms | 3.9 ms | **5.6x faster** | -| **Forward Pass** | 10.8 ms | 3.2 ms | **3.4x faster** | -| **Memory Usage** | 522.3 MB | 281.8 MB | **46% reduction** | -| **Throughput** | 47,735 tokens/sec | 264,320 tokens/sec | **5.5x faster** | - -### Key Optimization Techniques Applied - -1. **Flash Attention** (Memory-Efficient Attention) - - **V3**: Custom Triton Flash Attention kernel - - **V4**: PyTorch SDPA (hardware-accelerated) - - Both achieve ~3.1x speedup through memory-efficient attention - - Result: 46% memory reduction, 61% less memory bandwidth - -2. **Tensor Contiguity** (`.contiguous()` after GQA operations) - - Ensures optimal memory layout for Triton kernels - - Fixes stride-related performance issues - - Result: 20x speedup over non-contiguous version - -3. **Hybrid Kernel Strategy** - - Use Triton for: RMSNorm, Flash Attention (memory-bound ops) - - Use PyTorch/rocBLAS for: Matrix multiplies (compute-bound ops) - - Don't write custom Triton kernels for matmuls - rocBLAS is already optimal - - Result: 3.1x overall speedup - -4. 
**Proper Weight Initialization** (`std=0.02`) - - Critical for correct logits scale - - Prevents exploding/vanishing gradients - - Result: Loss goes from 942 → 7.0 - -### V3 vs V4: Two Paths to the Same Performance - -- **V3 (Triton Custom Kernels)**: Custom Triton RMSNorm + Triton Flash Attention -- **V4 (PyTorch Optimized)**: PyTorch ops + PyTorch SDPA -- **Both achieve 3.1x speedup** - demonstrates that highly-optimized PyTorch operations can match custom kernels - -### Profiling Tool Capabilities - -- **PyTorch Profiler**: Framework overhead, operator timing, memory tracking -- **rocprofv3**: Kernel execution stats, device activity and runtime API timeline tracing, hardware counter collection -- **Manual Timing**: CUDA synchronization for accurate GPU timing - -## Contributing - -This workshop is designed for continuous improvement. Contributions are welcome: - -- Additional optimization techniques -- Enhanced profiling methodologies -- Extended GPU architecture support -- Advanced analysis tools - -## Support and Resources - -- **Workshop Issues**: Submit GitHub issues for technical problems -- **AMD ROCm Documentation**: [ROCm Developer Portal](https://rocm.docs.amd.com/) -- **rocprofv3 tool usage**: [Using rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html#using-rocprofv3) -- **rocprof-sys Guide**: [rocprof-sys documentation](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/index.html#rocm-systems-profiler-documentation) -- **rocprof-compute Guide**: [rocprof-compute Documentation](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/#rocm-compute-profiler-documentation) -- **PyTorch ROCm Support**: [PyTorch ROCm Installation](https://pytorch.org/get-started/locally/) - -## Authors and Acknowledgments - -Developed for the CASTIEL AI Workshop (October 16, 2024) by HPC/AI performance engineers with extensive experience optimizing production ML workloads on AMD GPU infrastructure. 
- -## License - -MIT License - See LICENSE file for details - ---- - -**Ready to start profiling? Begin with the [Environment Setup Guide](setup/environment_setup.md)** +## Additional material +- [`version1_pytorch_baseline/README.md`](version1_pytorch_baseline/README.md): primary tutorial for the progression +- [`generate_example_plots.py`](generate_example_plots.py): regenerates the example plots from validation logs +- [`VERSION_COMPARISON.md`](VERSION_COMPARISON.md): side-by-side comparison notes across versions +- [`TINY_LLAMA_ARCHITECTURE.md`](TINY_LLAMA_ARCHITECTURE.md): model structure and implementation notes +- [`TECHNICAL_APPENDICES.md`](TECHNICAL_APPENDICES.md): supplementary technical discussion diff --git a/MLExamples/TinyTransformer/VERSION_COMPARISON.md b/MLExamples/TinyTransformer/VERSION_COMPARISON.md new file mode 100644 index 00000000..bb6cfc4d --- /dev/null +++ b/MLExamples/TinyTransformer/VERSION_COMPARISON.md @@ -0,0 +1,227 @@ +# Version 1 vs Version 2 vs Version 3 vs Version 4 Profiling Comparison + +## Executive Summary + +All four versions successfully profile with rocprofv3. The GitHub issue #1386 "no device activity" does not reproduce with ROCm 6.4.4 on RX 7900 XTX. + +**Key Finding**: Both version3 (Triton custom kernels) and version4 (PyTorch SDPA + Triton) achieve **4.4x speedup** over version1 baseline, with similar performance characteristics. Version2 (PyTorch fusion) provides minimal gains. 
+ +## Test Configuration + +- **GPU**: AMD Radeon RX 7900 XTX (gfx1100) +- **ROCm**: 6.4.4 +- **Profiler**: rocprofv3 +- **Test parameters**: batch-size 8, seq-len 128, num-steps 10 + +## Profiling Results Comparison + +### Trace File Sizes (Runtime Trace) + +| Version | Trace Size | Result | +|---------|-----------|---------| +| Version 1 | 44 MB | Success - full device activity captured | +| Version 2 | 41 MB | Success - full device activity captured | +| Version 3 | Not tested | Kernel trace tested instead (3.0 MB) | +| Version 4 | 9.7 MB | Success - full device activity captured | + +### Kernel Trace Analysis + +| Metric | Version 1 | Version 2 | Version 3 | Version 4 | V3/V4 vs V1 | +|--------|-----------|-----------|-----------|-----------|-------------| +| Total kernel dispatches | 22,284 | 22,479 | 4,727 | 5,493 | -76.3% to -78.8% | +| Unique kernel types | 64 | 55 | 32 | 33 | -48.4% to -50.0% | +| Total GPU time | 346.21 ms | 378.06 ms | 104.49 ms | 103.36 ms | -70.1% to -69.8% | + +### Top 3 Kernels by GPU Time + +#### Version 1 (PyTorch Baseline) + +1. **GEMM kernel** (Cijk_Alik_Bljk...): 30,658 us (127.74 us avg) - 240 calls +2. **GEMM kernel** (Cijk_Ailk_Bljk...): 29,954 us (124.81 us avg) - 240 calls +3. **GEMM kernel** (Cijk_Alik_Bljk...): 26,641 us (74.00 us avg) - 360 calls + +**Total top 3**: 87,253 us (25.2% of total GPU time) + +#### Version 2 (PyTorch Fused) + +1. **GEMM kernel** (Cijk_Ailk_Bljk...): 54,678 us (455.65 us avg) - 120 calls +2. **GEMM kernel** (Cijk_Alik_Bljk...): 25,482 us (212.35 us avg) - 120 calls +3. **bwd_kernel_fuse**: 24,814 us (206.78 us avg) - 120 calls + +**Total top 3**: 104,974 us (27.8% of total GPU time) + +#### Version 3 (Triton Custom Kernels) + +1. **GEMM kernel** (Cijk_Alik_Bljk...): 29,710 us (123.79 us avg) - 240 calls +2. **GEMM kernel** (Cijk_Alik_Bljk...): 28,442 us (79.01 us avg) - 360 calls +3. 
**flash_attention_kernel**: 15,557 us (129.64 us avg) - 120 calls + +**Total top 3**: 73,709 us (70.5% of total GPU time) + +**Note**: Version3's top 3 kernels account for 70.5% of GPU time vs 25-28% in V1/V2, showing much better kernel concentration. + +#### Version 4 (PyTorch SDPA + Triton) + +1. **GEMM kernel** (Cijk_Alik_Bljk...): 29,641 us (123.50 us avg) - 240 calls +2. **GEMM kernel** (Cijk_Alik_Bljk...): 28,320 us (78.67 us avg) - 360 calls +3. **attn_fwd** (PyTorch SDPA): 13,045 us (108.71 us avg) - 120 calls + +**Total top 3**: 71,006 us (68.7% of total GPU time) + +**Note**: Version4 uses PyTorch SDPA (`attn_fwd`) instead of custom flash attention, but achieves similar performance to version3. + +### Key Observations + +1. **Version3 and Version4 achieve similar performance through different approaches**: + - **Version3**: Custom Triton kernels (`flash_attention_kernel`, `rmsnorm_kernel`) + - **Version4**: PyTorch SDPA (`attn_fwd`) with Triton fallbacks + - Both: 78-76% fewer kernel dispatches than version1 + - Both: ~50% fewer unique kernel types than version1 + - V3 flash attention: 15,557 us (129.64 us avg) + - V4 SDPA attention: 13,045 us (108.71 us avg) - slightly faster! + +2. **Version2 fused kernels**: + - `bwd_kernel_fuse` (24,814 us total) - backward pass fusion + - `attn_fwd` (12,639 us total) - attention forward fusion + - These are custom fused operations not present in version1 + - 14.1% fewer unique kernel types than version1 + - Marginal performance impact (slightly slower) + +3. **Performance progression**: + - Version1: Many small kernels, high launch overhead + - Version2: Some fusion, but still many PyTorch framework kernels + - Version3: Aggressive fusion with custom Triton kernels + - 69.8% reduction in GPU time vs version1 + - 72.4% reduction in GPU time vs version2 + - 78.8% fewer kernel launches vs version1 + +4. 
**Memory efficiency**:
+   - Version1: 434.3 MB peak memory
+   - Version2: 434.3 MB peak memory
+   - Version3: 193.8 MB peak memory (55.4% reduction)
+   - Triton kernels use significantly less memory
+
+5. **Profiler functionality**:
+   - rocprofv3 successfully captures all GPU activity on all four versions
+   - No "no device activity" issue observed
+   - GitHub issue #1386 likely fixed in ROCm 6.4.4
+
+## Performance Comparison
+
+### Throughput
+
+| Version | Samples/sec | Tokens/sec | Speedup vs V1 |
+|---------|-------------|------------|---------------|
+| Version 1 | 240.6 | 30,803 | 1.00x (baseline) |
+| Version 2 | 247.4 | 31,672 | 1.03x |
+| Version 3 | 1,054.8 | 135,014 | **4.38x** |
+| Version 4 | 1,054.5 | 134,972 | **4.38x** |
+
+Version3 and Version4 both achieve **4.38x speedup** over version1 and **4.26x speedup** over version2.
+
+### Batch Processing Time
+
+| Version | Average Batch Time | Speedup vs V1 |
+|---------|-------------------|---------------|
+| Version 1 | 33.3 ms | 1.00x (baseline) |
+| Version 2 | 32.3 ms | 1.03x |
+| Version 3 | 7.5 ms | **4.44x** |
+| Version 4 | 7.6 ms | **4.38x** |
+
+### Memory Usage
+
+| Version | Peak Memory | Reduction vs V1 |
+|---------|-------------|-----------------|
+| Version 1 | 434.3 MB | baseline |
+| Version 2 | 434.3 MB | 0% |
+| Version 3 | 193.8 MB | **55.4%** |
+| Version 4 | 193.9 MB | **55.3%** |
+
+Version3 and Version4 both use less than half the memory of version1/version2.
+
+## Fusion Impact Analysis
+
+### Version2 (PyTorch Fused)
+
+Version2 reports these fusion optimizations available:
+- QKV Fusion: Available but not active in this run
+- Flash Attention: Available but not active in this run
+- SwiGLU Fusion: Available but not active in this run
+- Torch Compile: Available but failed to activate
+
+The fused kernels observed (`bwd_kernel_fuse`, `attn_fwd`) suggest some fusion is occurring despite the "not active" status. This may be a reporting issue in the code. 
+ +**Verdict**: Version2 fusion provides minimal benefit (3% speedup) and may have reporting issues. + +### Version3 (Triton Custom Kernels) + +Version3 reports active Triton optimizations: +- RMSNorm Kernel: ACTIVE - Fused variance + normalization (1,167 us total, 4.58 us avg) +- Flash Attention Kernel: ACTIVE - Memory-efficient attention (15,557 us total, 129.64 us avg) +- SwiGLU Kernel: ACTIVE (not visible in top kernels, likely very fast) + +**Verdict**: Version3 Triton kernels deliver massive performance gains (4.38x speedup) with proper kernel fusion and optimization. + +### Version4 (PyTorch SDPA + Triton) + +Version4 uses PyTorch's Scaled Dot Product Attention (SDPA) with Triton fallbacks: +- **attn_fwd** (PyTorch SDPA): 13,045 us total, 108.71 us avg + - Slightly faster than V3's custom flash attention (15,557 us) + - Leverages PyTorch's optimized SDPA implementation +- Custom Triton kernels for other operations (RMSNorm, SwiGLU likely present but not in top kernels) +- 16% more kernel dispatches than V3 (5,493 vs 4,727) +- One additional unique kernel type (33 vs 32) + +**Verdict**: Version4 achieves identical performance to version3 (4.38x speedup) using PyTorch SDPA instead of custom flash attention. PyTorch SDPA is actually slightly more efficient for attention, but V4 has slightly more overhead elsewhere. + +## Conclusion + +1. **rocprofv3 works correctly** on all four versions with ROCm 6.4.4 +2. **No reproduction of GitHub issue #1386** - all versions show full device activity + +3. **Version3 and Version4 are equivalent winners**: + - Both: **4.38x faster** than version1 baseline + - Both: **4.26x faster** than version2 + - Both: **~55% less memory** usage + - Both: **~77-79% fewer** kernel dispatches + - Both: **~70% reduction** in GPU time + - V3 uses custom flash attention, V4 uses PyTorch SDPA + - V4's SDPA is slightly faster (13.0 ms vs 15.6 ms) but has slightly more overhead elsewhere + +4. 
**Version2 provides minimal gains**: + - Only 3% faster than version1 + - Same memory usage as version1 + - Some fusion, but not well optimized + - May have reporting issues with fusion flags + +5. **Performance progression summary**: + - V1 baseline: 240.6 samples/sec, 346 ms GPU time, 434 MB memory + - V2 fused: 247.4 samples/sec, 378 ms GPU time, 434 MB memory (marginal improvement) + - V3 custom Triton: 1,054.8 samples/sec, 104 ms GPU time, 194 MB memory (massive improvement) + - V4 PyTorch SDPA: 1,054.5 samples/sec, 103 ms GPU time, 194 MB memory (equivalent to V3) + +6. **Key takeaways**: + - Custom Triton kernels (V3) deliver transformational performance that PyTorch-level fusion (V2) cannot match + - PyTorch SDPA (V4) provides a practical alternative to custom flash attention without sacrificing performance + - For production use, V4 may be preferable due to reliance on PyTorch's maintained SDPA implementation + - For maximum control and customization, V3's fully custom Triton approach is ideal + +## Files Generated + +### Version 1 +- Runtime trace: `version1_pytorch_baseline/traces/trace_*/` +- Kernel trace: `version1_pytorch_baseline/counters/counter_20251028_164804/1f81e102abe6/9544_kernel_trace.csv` (11.6 MB) + +### Version 2 +- Runtime trace: `version2_pytorch_fused/traces/trace_20251028_170752/` (41 MB) +- Runtime trace (50 steps): `version2_pytorch_fused/github_issue_test/test_20251028_172311/` (149 MB) +- Kernel trace: `version2_pytorch_fused/counters/counter_20251028_172429/1f81e102abe6/17496_kernel_trace.csv` (10.8 MB) + +### Version 3 +- Kernel trace: `version3_triton/counters/counter_20251028_173451/1f81e102abe6/20129_kernel_trace.csv` (3.0 MB) +- Much smaller trace file due to 78.8% fewer kernel dispatches + +### Version 4 +- Runtime trace: `version4_pytorch_sdpa/traces/trace_20251028_174853/` (9.7 MB) +- Kernel trace: `version4_pytorch_sdpa/counters/counter_20251028_174948/1f81e102abe6/23175_kernel_trace.csv` (3.3 MB) +- Similar trace sizes 
to version3 diff --git a/MLExamples/TinyTransformer/generate_example_plots.py b/MLExamples/TinyTransformer/generate_example_plots.py new file mode 100644 index 00000000..0adb4f23 --- /dev/null +++ b/MLExamples/TinyTransformer/generate_example_plots.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +"""Generate example tutorial plots from validated TinyTransformer runs.""" + +from __future__ import annotations + +import argparse +import os +import re +from pathlib import Path + +os.environ.setdefault("MPLCONFIGDIR", "/tmp/matplotlib") + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt +import pandas as pd + + +VERSION_ORDER = [ + ("version1_pytorch_baseline", "V1"), + ("version2_pytorch_fused", "V2"), + ("version3_triton", "V3"), + ("version4_pytorch_sdpa", "V4"), +] + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate example plots from TinyTransformer validation logs." 
+ ) + parser.add_argument( + "--log-dir", + type=Path, + default=Path("/tmp/tinytransformer_validation_20260322"), + help="Directory containing validation logs", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("MLExamples/TinyTransformer/images"), + help="Directory where plot images will be written", + ) + return parser.parse_args() + + +def require_match(pattern: str, text: str, context: str) -> str: + match = re.search(pattern, text) + if not match: + raise ValueError(f"Could not find pattern {pattern!r} in {context}") + return match.group(1) + + +def parse_baseline_metrics(log_dir: Path) -> pd.DataFrame: + rows = [] + for version_dir, label in VERSION_ORDER: + log_path = log_dir / f"{version_dir}__baseline.log" + text = log_path.read_text() + rows.append( + { + "version_dir": version_dir, + "label": label, + "avg_training_speed": float( + require_match(r"Average training speed:\s+([0-9.]+)", text, str(log_path)) + ), + "avg_batch_time_ms": float( + require_match(r"Average batch time:\s+([0-9.]+)\s+ms", text, str(log_path)) + ), + "peak_memory_mb": float( + require_match(r"Peak memory usage:\s+([0-9.]+)\s+MB", text, str(log_path)) + ), + } + ) + return pd.DataFrame(rows) + + +def shorten_kernel_name(name: str) -> str: + if name.startswith("Cijk_"): + short = name.split("_SN_")[0] + elif name.startswith("void at::native::"): + short = "ATen kernel: " + name.split("(", 1)[0].replace("void ", "") + else: + short = name + + if len(short) > 64: + short = short[:61] + "..." 
+ return short + + +def parse_hotspots(log_dir: Path, version_dir: str, top_n: int = 8) -> pd.DataFrame: + log_path = log_dir / f"{version_dir}__hotspots.log" + text = log_path.read_text() + csv_path = resolve_artifact_path( + require_match(r"Top rows from (.+_kernel_stats\.csv):", text, str(log_path)) + ) + df = pd.read_csv(csv_path) + top = df.sort_values("TotalDurationNs", ascending=False).head(top_n).copy() + top["TotalDurationMs"] = top["TotalDurationNs"] / 1e6 + top["ShortName"] = top["Name"].map(shorten_kernel_name) + return top + + +def resolve_artifact_path(path_text: str) -> Path: + path = Path(path_text) + if path.exists(): + return path + if path_text.startswith("/workspace/"): + translated = REPO_ROOT / path.relative_to("/workspace") + if translated.exists(): + return translated + raise FileNotFoundError(f"Could not resolve artifact path: {path_text}") + + +def add_bar_labels(ax: plt.Axes, values: pd.Series, fmt: str) -> None: + for idx, value in enumerate(values): + ax.text(idx, value, fmt.format(value), ha="center", va="bottom", fontsize=9) + + +def plot_comparison(df: pd.DataFrame, output_path: Path) -> None: + colors = ["#1f3c88", "#4f772d", "#c97b24", "#7a3e9d"] + fig, axes = plt.subplots(1, 3, figsize=(14, 4.8), constrained_layout=True) + + metrics = [ + ("avg_training_speed", "Average training speed", "samples/sec", "{:.1f}"), + ("avg_batch_time_ms", "Average batch time", "ms", "{:.1f}"), + ("peak_memory_mb", "Peak memory", "MB", "{:.1f}"), + ] + + for ax, (column, title, ylabel, fmt) in zip(axes, metrics): + ax.bar(df["label"], df[column], color=colors) + ax.set_title(title) + ax.set_ylabel(ylabel) + ax.grid(axis="y", alpha=0.2) + add_bar_labels(ax, df[column], fmt) + + fig.suptitle( + "TinyTransformer example measurements from validated container runs", + fontsize=14, + fontweight="bold", + ) + fig.savefig(output_path, dpi=180, bbox_inches="tight") + plt.close(fig) + + +def plot_hotspots(top: pd.DataFrame, title: str, output_path: Path, color: 
str) -> None: + plot_df = top.sort_values("TotalDurationMs", ascending=True) + fig, ax = plt.subplots(figsize=(10.5, 5.5), constrained_layout=True) + ax.barh(plot_df["ShortName"], plot_df["TotalDurationMs"], color=color) + ax.set_xlabel("Total duration (ms)") + ax.set_title(title) + ax.grid(axis="x", alpha=0.2) + + for y, value in enumerate(plot_df["TotalDurationMs"]): + ax.text(value, y, f" {value:.2f}", va="center", ha="left", fontsize=9) + + fig.savefig(output_path, dpi=180, bbox_inches="tight") + plt.close(fig) + + +def main() -> None: + args = parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) + + baseline_df = parse_baseline_metrics(args.log_dir) + plot_comparison( + baseline_df, + args.output_dir / "tinytransformer_baseline_comparison.png", + ) + + v1_hotspots = parse_hotspots(args.log_dir, "version1_pytorch_baseline") + plot_hotspots( + v1_hotspots, + "TinyTransformer V1 hotspot summary from validated container run", + args.output_dir / "tinytransformer_version1_hotspots.png", + "#1f3c88", + ) + + v3_hotspots = parse_hotspots(args.log_dir, "version3_triton") + plot_hotspots( + v3_hotspots, + "TinyTransformer V3 hotspot summary from validated container run", + args.output_dir / "tinytransformer_version3_hotspots.png", + "#c97b24", + ) + + print(f"Wrote plots to {args.output_dir}") + + +if __name__ == "__main__": + main() diff --git a/MLExamples/TinyTransformer/images/tinytransformer_baseline_comparison.png b/MLExamples/TinyTransformer/images/tinytransformer_baseline_comparison.png new file mode 100644 index 00000000..3be25395 Binary files /dev/null and b/MLExamples/TinyTransformer/images/tinytransformer_baseline_comparison.png differ diff --git a/MLExamples/TinyTransformer/images/tinytransformer_version1_hotspots.png b/MLExamples/TinyTransformer/images/tinytransformer_version1_hotspots.png new file mode 100644 index 00000000..c379d421 Binary files /dev/null and b/MLExamples/TinyTransformer/images/tinytransformer_version1_hotspots.png differ 
diff --git a/MLExamples/TinyTransformer/images/tinytransformer_version3_hotspots.png b/MLExamples/TinyTransformer/images/tinytransformer_version3_hotspots.png new file mode 100644 index 00000000..3e3ae1d2 Binary files /dev/null and b/MLExamples/TinyTransformer/images/tinytransformer_version3_hotspots.png differ diff --git a/MLExamples/TinyTransformer/profile_common.sh b/MLExamples/TinyTransformer/profile_common.sh new file mode 100644 index 00000000..3490833e --- /dev/null +++ b/MLExamples/TinyTransformer/profile_common.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Shared helpers for the TinyTransformer profiling scripts. + +SCRIPT_DIR="${TINYTRANSFORMER_SCRIPT_DIR:-}" +MODEL_SCRIPT_NAME="${TINYTRANSFORMER_MODEL_SCRIPT:-}" +WORKLOAD_NAME="${TINYTRANSFORMER_WORKLOAD_NAME:-${MODEL_SCRIPT_NAME%.py}}" + +if [ -z "$SCRIPT_DIR" ] || [ -z "$MODEL_SCRIPT_NAME" ]; then + echo "Error: set TINYTRANSFORMER_SCRIPT_DIR and TINYTRANSFORMER_MODEL_SCRIPT before sourcing profile_common.sh." >&2 + return 1 2>/dev/null || exit 1 +fi + +BENCHMARK_SCRIPT="$SCRIPT_DIR/$MODEL_SCRIPT_NAME" +OUTPUT_ROOT="${TINYTRANSFORMER_OUTPUT_ROOT:-$SCRIPT_DIR/profiling_results}" +DEFAULT_BATCH_SIZE="${TINYTRANSFORMER_DEFAULT_BATCH_SIZE:-8}" +DEFAULT_SEQ_LEN="${TINYTRANSFORMER_DEFAULT_SEQ_LEN:-128}" +DEFAULT_NUM_STEPS="${TINYTRANSFORMER_DEFAULT_NUM_STEPS:-10}" +BATCH_SIZE="${TINYTRANSFORMER_BATCH_SIZE:-$DEFAULT_BATCH_SIZE}" +SEQ_LEN="${TINYTRANSFORMER_SEQ_LEN:-$DEFAULT_SEQ_LEN}" +NUM_STEPS="${TINYTRANSFORMER_NUM_STEPS:-$DEFAULT_NUM_STEPS}" +EXTRA_BENCHMARK_ARGS_RAW="${TINYTRANSFORMER_EXTRA_ARGS:-}" +EXTRA_BENCHMARK_ARGS=() + +if [ -n "$EXTRA_BENCHMARK_ARGS_RAW" ]; then + read -r -a EXTRA_BENCHMARK_ARGS <<< "$EXTRA_BENCHMARK_ARGS_RAW" +fi + +if [ -n "${TINYTRANSFORMER_PYTHON:-}" ]; then + PYTHON_BIN="$TINYTRANSFORMER_PYTHON" +elif command -v python >/dev/null 2>&1; then + PYTHON_BIN="python" +else + PYTHON_BIN="python3" +fi + +require_cmd() { + local cmd="$1" + if ! 
command -v "$cmd" >/dev/null 2>&1; then + echo "Error: required command '$cmd' was not found in PATH." >&2 + exit 1 + fi +} + +ensure_benchmark_script() { + if [ ! -f "$BENCHMARK_SCRIPT" ]; then + echo "Error: benchmark script not found at '$BENCHMARK_SCRIPT'." >&2 + exit 1 + fi +} + +detect_rocm_version() { + local version="" + local hip_version="" + + if command -v rocminfo >/dev/null 2>&1; then + version=$(rocminfo 2>/dev/null | awk '/ROCm Version/ {print $3; exit}') + fi + + if [ -z "$version" ] && [ -n "${ROCM_PATH:-}" ] && [ -f "$ROCM_PATH/.info/version" ]; then + version="$(cat "$ROCM_PATH/.info/version")" + fi + + if [ -z "$version" ] && command -v hipcc >/dev/null 2>&1; then + hip_version=$(hipcc --version 2>/dev/null | awk '/HIP version/ {print $3; exit}') + if [ -n "$hip_version" ]; then + version="$hip_version" + fi + fi + + printf '%s\n' "$version" +} + +rocm_major_from_version() { + local version="$1" + if [ -n "$version" ]; then + printf '%s\n' "${version%%.*}" + else + printf '%s\n' "" + fi +} + +detect_gpu_arch() { + if command -v rocminfo >/dev/null 2>&1; then + rocminfo 2>/dev/null | awk '/^[[:space:]]+Name:[[:space:]]+gfx/ {print $2; exit}' + fi +} + +make_output_dir() { + local prefix="$1" + local timestamp + local output_dir + timestamp="$(date +%Y%m%d_%H%M%S)" + mkdir -p "$OUTPUT_ROOT" + output_dir="$OUTPUT_ROOT/${prefix}_${timestamp}" + mkdir -p "$output_dir" + printf '%s\n' "$output_dir" +} + +build_benchmark_cmd() { + BENCHMARK_CMD=( + "$PYTHON_BIN" + "$BENCHMARK_SCRIPT" + --batch-size "$BATCH_SIZE" + --seq-len "$SEQ_LEN" + --num-steps "$NUM_STEPS" + "${EXTRA_BENCHMARK_ARGS[@]}" + ) +} + +print_workload_summary() { + echo "Workload:" + echo " script: $MODEL_SCRIPT_NAME" + echo " batch size: $BATCH_SIZE" + echo " sequence length: $SEQ_LEN" + echo " training steps: $NUM_STEPS" + echo " python: $PYTHON_BIN" + if [ "${#EXTRA_BENCHMARK_ARGS[@]}" -gt 0 ]; then + echo " extra args: ${EXTRA_BENCHMARK_ARGS[*]}" + fi +} + +print_generated_files() { 
+ local output_dir="$1" + local maxdepth="${2:-4}" + + if ! find "$output_dir" -maxdepth "$maxdepth" -type f | grep -q .; then + echo " No files found under $output_dir" + return + fi + + while IFS= read -r file; do + ls -lh "$file" + done < <(find "$output_dir" -maxdepth "$maxdepth" -type f | sort) +} + +select_largest_match() { + local search_dir="$1" + local pattern="$2" + + find "$search_dir" -type f -name "$pattern" -printf '%s\t%p\n' 2>/dev/null \ + | sort -nr \ + | head -1 \ + | cut -f2- +} diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/IMPORTTIME_PROFILING.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/IMPORTTIME_PROFILING.md index 2b5cea5b..0eb11830 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/IMPORTTIME_PROFILING.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/IMPORTTIME_PROFILING.md @@ -1,25 +1,20 @@ +# Python Import Time Profiling -## Python Import Time Profiling +IMPORTTIME_PROFILING.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline` in the Training Examples repository. -`IMPORTTIME_PROFILING.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline` in the Training Examples repository +## Overview -### Overview +The `python -X importtime` flag provides detailed timing information about module imports during Python script execution. This is useful for identifying slow imports that can impact startup time. -The `python -X importtime` flag provides detailed timing information about module imports during Python script execution. This is useful for identifying slow imports that can impact startup time and overall application performance. +## Basic Usage -### Basic Usage - -```bash +``` python -X importtime script.py ``` -This outputs a hierarchical tree showing: +This outputs a hierarchical tree showing import time for each module in microseconds. 
-- Import time for each module -- Cumulative time including sub-imports -- Self time (time spent in the module itself) - -### Output Format +## Output Format ``` import time: self [us] | cumulative | imported package @@ -34,232 +29,53 @@ import time: 1521 | 2865 | encodings - **cumulative**: Total time including all sub-imports (microseconds) - **imported package**: Module name with indentation showing import hierarchy -### Example: Profiling TinyLlama V1 - -#### Basic Import Analysis - -```bash -python -X importtime tiny_llama_v1.py 2> import_times.txt -``` - -This redirects the import timing output (stderr) to a file for analysis. +## Example: Profiling TinyLlama V1 -#### Analyzing PyTorch Import Time +Redirect import timing output to a file for analysis: -```bash -python -X importtime -c "import torch" 2>&1 | grep -E "torch|time:" -``` - -Expected output shows PyTorch's heavy import cost: -``` -import time: 1234567 | 1234567 | torch ``` - -#### Analyzing DeepSpeed Import Time - -```bash -python -X importtime -c "import deepspeed" 2>&1 | grep -E "deepspeed|time:" -``` - -### Common Import Time Bottlenecks in AI Workloads - -#### 1. PyTorch (torch) - -- Typical import time: 500ms - 2000ms -- Loads CUDA/ROCm libraries -- Initializes operator registry -- Sets up autograd engine - -#### 2. Transformers Library - -- Typical import time: 300ms - 1000ms -- Loads tokenizers -- Registers model architectures -- Initializes configuration classes - -#### 3. DeepSpeed - -- Typical import time: 200ms - 800ms -- Loads distributed training components -- Initializes optimization kernels -- Sets up communication backends - -#### 4. NumPy/SciPy - -- Typical import time: 50ms - 200ms -- Loads optimized BLAS/LAPACK libraries -- Initializes array operations - -### Best Practices - -#### 1. 
Lazy Imports -Move imports inside functions for code that's not always executed: - -```python -def run_with_profiler(): - # Only import when profiler is actually used - from torch.profiler import profile, ProfilerActivity - ... +python -X importtime tiny_llama_v1.py 2> import_times.txt ``` -#### 2. Conditional Imports -Import heavy dependencies only when needed: +Analyze PyTorch import time: -```python -if args.enable_profiler: - import deepspeed.profiling.flops_profiler as fp ``` - -#### 3. Import Grouping -Organize imports by load time to understand startup cost: - -```python -# Fast imports -import os -import sys -import argparse - -# Medium imports -import numpy as np -import pandas as pd - -# Heavy imports (consider lazy loading) -import torch -import deepspeed +python -X importtime -c "import torch" 2>&1 | grep -E "torch|time:" ``` -### Optimization Techniques - -#### 1. Module-Level Import Caching -Python caches imports in `sys.modules`, so subsequent imports are fast: +## Common Import Time Bottlenecks -```python -import torch # Slow first time -import torch # Fast - already cached -``` +| Package | Typical Import Time | Notes | +|---------|-------------------|-------| +| PyTorch (torch) | 500ms - 2000ms | Loads CUDA/ROCm libraries, operator registry | +| Transformers | 300ms - 1000ms | Loads tokenizers, model architectures | +| DeepSpeed | 200ms - 800ms | Distributed training components | +| NumPy/SciPy | 50ms - 200ms | Optimized BLAS/LAPACK libraries | -#### 2. Using `__import__()` for Dynamic Imports -For plugins or optional features: +## Generate Import Time Report -```python -def load_profiler(profiler_type): - if profiler_type == "pytorch": - torch_prof = __import__("torch.profiler", fromlist=["profile"]) - return torch_prof ``` - -#### 3. Parallel Import Loading -Not natively supported, but can structure code to minimize import depth. 
- -### Analyzing Import Time Results - -#### Generate Report -```bash python -X importtime tiny_llama_v1.py 2>&1 | \ grep "import time:" | \ sort -k3 -n -r | \ head -20 > top_imports.txt ``` -#### Parse with Script -```python -import re -import sys - -with open('import_times.txt', 'r') as f: - for line in f: - match = re.search(r'import time:\s+(\d+)\s+\|\s+(\d+)\s+\|\s+(.+)', line) - if match: - self_time = int(match.group(1)) - cumulative = int(match.group(2)) - module = match.group(3).strip() - if cumulative > 100000: # > 100ms - print(f"{module}: {cumulative/1000:.2f}ms") -``` - -### ROCm/PyTorch Specific Considerations - -#### HIP Runtime Loading -ROCm's HIP runtime can add significant import overhead: -- libamdhip64.so loading -- GPU device detection -- Architecture-specific kernel initialization - -#### Environment Variables Impact -These can affect import time: -```bash -# Reduce logging overhead during import -AMD_LOG_LEVEL=0 MIOPEN_LOG_LEVEL=0 python -X importtime script.py - -# Skip GPU initialization during import analysis -HIP_VISIBLE_DEVICES=-1 python -X importtime script.py -``` - -### Integration with Other Profiling Tools +## ROCm/PyTorch Considerations -#### Combine with cProfile -```bash -# First check import time -python -X importtime script.py 2> imports.txt +Reduce logging overhead during import analysis: -# Then profile runtime -python -m cProfile -o profile.stats script.py ``` - -#### Combine with PyTorch Profiler -```python -# Fast startup with lazy imports -def main(): - import torch - from torch.profiler import profile - - # Your training code here - ... 
- -if __name__ == "__main__": - main() +AMD_LOG_LEVEL=0 MIOPEN_LOG_LEVEL=0 python -X importtime script.py ``` -### Example Analysis for Version 1 - -#### Expected Import Hierarchy +Skip GPU initialization during import analysis: ``` -import time: self [us] | cumulative | imported package -import time: 2341 | 2341 | _frozen_importlib_external -import time: 850000 | 850000 | torch # Dominant cost -import time: 120000 | 120000 | torch.nn -import time: 45000 | 45000 | torch.optim -import time: 23000 | 23000 | apex.normalization.fused_layer_norm -import time: 18000 | 18000 | apex.transformer.functional.fused_rope -import time: 8000 | 8000 | argparse -import time: 3500 | 3500 | json +HIP_VISIBLE_DEVICES=-1 python -X importtime script.py ``` -#### Interpreting Results - -- **torch**: Largest import cost (850ms typical) -- **torch.nn**: Additional overhead for neural network modules -- **apex**: NVIDIA optimizations (ROCm compatible) -- Standard library imports (argparse, json): Negligible cost +## Additional Resources -### When to Use Import Time Profiling - -1. **Debugging slow script startup**: Identify which imports are causing delays -2. **Optimizing CLI tools**: Reduce time-to-first-output for user experience -3. **Container startup optimization**: Minimize cold-start latency -4. **CI/CD pipeline optimization**: Reduce test suite initialization time - -### Limitations - -- Does not profile runtime execution (use cProfile or PyTorch Profiler for that) -- Import time varies based on system load and cold vs. 
warm cache -- First import after system reboot will be slower due to OS page cache - -### References - -- [PEP 565 - Show DeprecationWarning in __main__](https://www.python.org/dev/peps/pep-0565/) - [Python -X Options Documentation](https://docs.python.org/3/using/cmdline.html#id5) - [PyTorch Performance Tuning Guide](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html) - - diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md index 59d84818..f2fceb4a 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md @@ -1,2367 +1,119 @@ -# Tiny LLaMA PyTorch Baseline - Profiling Workshop -## Complete Hands-On Walkthrough Guide +# TinyTransformer Baseline Workshop Guide ---- +The main reference for this directory is [`README.md`](README.md). This note keeps the same material in a shorter lab order. -## Important Note +## Preparation -**The performance numbers and metrics shown throughout this workshop are representative examples and were collected on specific hardware configurations.** Your actual results will differ based on: - -- GPU model (e.g., MI250X, MI300X, MI325X) -- ROCm version -- PyTorch version -- System configuration (CPU, memory, drivers) -- Current GPU utilization and temperature - -**Focus on the relative improvements and optimization techniques** demonstrated in each exercise rather than matching the exact numbers shown. The methodologies and analysis approaches are applicable across different hardware platforms. 
- ---- - -## Notation and Variables - -Throughout this workshop, we use the following notation for tensor dimensions and model parameters: - -**Tensor Dimensions:** -- **B** = Batch size (number of samples processed together) -- **S** = Sequence length (number of tokens in each sequence) -- **D** = Hidden dimension / Model dimension (size of hidden representations) -- **H** = Number of attention heads -- **head_dim** = Dimension per attention head (typically D / H) - -**Model Parameters:** -- **D_ff** = Feed-forward network intermediate dimension -- **V** = Vocabulary size (number of unique tokens) -- **L** = Number of transformer layers - -**Performance Metrics:** -- **FLOPS** = Floating Point Operations Per Second -- **MFU** = Model FLOPS Utilization (% of theoretical peak achieved) -- **TFLOPS** = Tera-FLOPS (10^12 floating point operations per second) -- **GFLOPS** = Giga-FLOPS (10^9 floating point operations per second) - -**Complexity Notation:** -- **O(S)** = Linear complexity with sequence length -- **O(S^2)** = Quadratic complexity with sequence length -- **O(B × S × D)** = Complexity grows with batch, sequence, and dimension - -**Example Tensor Shapes:** -``` -Input tensor: [B, S, D] e.g., [8, 128, 256] -Attention weights: [B, H, S, S] e.g., [8, 8, 128, 128] -Query/Key/Value: [B, H, S, head_dim] e.g., [8, 8, 128, 32] -FFN intermediate: [B, S, D_ff] e.g., [8, 128, 512] -``` - ---- - -## Table of Contents - -1. [Introduction & Setup](#1-introduction--setup) -2. [Understanding Tiny LLaMA Architecture](#2-understanding-tiny-llama-architecture) -3. [Understanding the Baseline Implementation](#3-understanding-the-baseline-implementation) -4. [Exercise 1: Baseline Performance Analysis](#4-exercise-1-baseline-performance-analysis) -5. [Exercise 2: Memory Analysis & Optimization](#5-exercise-2-memory-analysis--optimization) -6. [Exercise 3: Performance Study Across Problem Sizes](#6-exercise-3-performance-study-across-problem-sizes) - ---- - -## 1. 
Introduction & Setup - -### 1.1 What is LLM Training? - -**Large Language Model (LLM) Training** involves teaching neural networks to understand and generate human language through iterative optimization of model parameters. - -**Key Differences: Training vs Inference** - -| Aspect | Training | Inference | -|--------|----------|-----------| -| **Purpose** | Learn patterns from data | Make predictions | -| **Direction** | Forward + Backward pass | Forward pass only | -| **Gradients** | Required and computed | Not required | -| **Batch Size** | Typically larger (8-64) | Often smaller (1-32) | -| **Performance Goal** | Samples/sec + FLOPS efficiency | Latency + throughput | -| **Memory Usage** | Very high (activations + gradients) | Lower (no gradient storage) | -| **Optimization Focus** | Throughput, MFU, memory efficiency | Latency, batch throughput | - -**Why Profile LLM Training?** - -- Understand computational bottlenecks -- Optimize hardware utilization (Model FLOPS Utilization - MFU) -- Reduce training costs -- Identify memory inefficiencies -- Guide optimization decisions -- Establish baseline for improvements - -### 1.2 Workshop Goals - -By the end of this workshop, you will be able to: - -- Configure and run deterministic PyTorch LLM training -- Use PyTorch Profiler for detailed operator-level analysis -- Integrate DeepSpeed FLOPS profiler for computational efficiency metrics -- Interpret profiling results and identify performance bottlenecks -- Understand memory usage patterns in transformer training -- Analyze attention mechanisms and FFN performance -- Calculate Model FLOPS Utilization (MFU) -- Establish baseline performance metrics for optimization comparison - -### 1.3 Understanding Key Metrics - -Before diving into exercises, let's understand the metrics we'll be measuring: - -#### Training Speed (samples/sec) -- **What:** Number of training samples processed per second -- **Higher is better** -- **Typical range:** 50-200 samples/sec for small models 
on single GPU -- **Formula:** `(batch_size × num_steps) / total_time` - -#### FLOPS (Floating Point Operations Per Second) -- **What:** Computational throughput -- **Higher is better** -- **Units:** TFLOPS (TeraFLOPS, 10^12 operations/second) -- **Theoretical Peak:** Hardware maximum (e.g., MI250X: ~95 TFLOPS FP32, ~190 TFLOPS FP16) - -#### Model FLOPS Utilization (MFU) -- **What:** Percentage of theoretical peak FLOPS achieved -- **Formula:** `(Achieved FLOPS / Theoretical Peak FLOPS) × 100%` -- **Typical ranges:** - - 20-30%: Baseline PyTorch (memory-bound) - - 40-50%: Well-optimized (compute-bound) - - 60%+: Highly optimized (kernel fusion, Flash Attention) - -#### Memory Usage (GB) -- **What:** GPU memory consumed -- **Components:** Model weights + optimizer states + activations + gradients -- **Lower is better** (allows larger batches) - -#### GPU Utilization (%) -- **What:** Percentage of GPU compute units in use -- **Higher is better** (approaching 100%) -- **Low utilization indicates:** Memory bottlenecks, CPU bottlenecks, or small workloads - -### 1.4 Environment Verification - -Let's verify your system is ready for the workshop. 
- -#### Step 1: Check ROCm Installation - -```bash -# Check if ROCm is installed -rocminfo | grep "Name:" -``` - -**Expected Output:** -``` - Name: gfx90a - Name: AMD Instinct MI250X -``` - -**If you see an error:** -```bash -# Check if ROCm is installed -which rocminfo - -# If not found, ROCm is not installed -# Contact your system administrator -``` - -#### Step 2: Check GPU Visibility - -```bash -# Check GPU status -rocm-smi -``` - -**Expected Output:** -``` -GPU[0] : GPU ID: 0 -GPU[0] : GPU Name: AMD Instinct MI250X -GPU[0] : Temperature: 35.0°C -GPU[0] : GPU Memory Usage: 512 MB / 65536 MB -GPU[0] : GPU Utilization: 0% -``` - -**Common Issues:** - -**Error: "Unable to detect any GPUs"** -```bash -# Check permissions -sudo usermod -aG video $USER -sudo usermod -aG render $USER - -# Logout and login again -# Then retry: rocm-smi -``` - -#### Step 3: Check PyTorch + ROCm - -```bash -# Test PyTorch with ROCm -python3 -c " -import torch -print(f'PyTorch Version: {torch.__version__}') -print(f'CUDA Available: {torch.cuda.is_available()}') -if torch.cuda.is_available(): - print(f'GPU Name: {torch.cuda.get_device_name(0)}') - print(f'GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB') -else: - print('ERROR: No GPU detected!') -" -``` - -**Expected Output:** -``` -PyTorch Version: 2.7.1+rocm6.4.4 -CUDA Available: True -GPU Name: AMD Instinct MI250X -GPU Memory: 65.5 GB -``` - -**Common Issues:** - -**Error: "ModuleNotFoundError: No module named 'torch'"** -```bash -# Install PyTorch with ROCm support -pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 -``` - -**Error: "CUDA Available: False"** -```bash -# Check if ROCm-enabled PyTorch is installed -python3 -c "import torch; print(torch.__version__)" - -# Should show something like: 2.7.1+rocm6.4.4 -# If it shows 2.7.1+cpu, you have CPU-only PyTorch - -# Reinstall with ROCm support -pip uninstall torch torchvision torchaudio -pip install torch 
torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 -``` - -#### Step 4: Check DeepSpeed (Optional but Recommended) - -```bash -# Check if DeepSpeed is installed -python3 -c "import deepspeed; print(f'DeepSpeed Version: {deepspeed.__version__}')" -``` - -**Expected Output:** -``` -DeepSpeed Version: 0.12.6 -``` - -**If not installed:** -```bash -# Install DeepSpeed -pip install deepspeed -``` - -#### Step 5: Navigate to Workshop Directory - -```bash -# Navigate to version1_pytorch_baseline directory -cd ~/castille-ai-workshop-training/version1_pytorch_baseline/ - -# List files -ls -la -``` - -**Expected Output:** -``` --rw-rw-r-- tiny_llama_v1.py --rw-rw-r-- run_pytorch_profiler.py --rw-rw-r-- run_deepspeed_flops.py --rw-rw-r-- README.md --rwxrwxr-x run_baseline.sh --rwxrwxr-x run_pytorch_profiler.sh --rwxrwxr-x run_deepspeed_flops.sh -drwxrwxr-x exercises/ -``` - -#### Step 6: Quick Test Run - -Let's verify everything works with a very small test: - -```bash -# Run a tiny test (should complete in ~1-2 minutes) -python3 tiny_llama_v1.py --batch-size 4 --seq-len 64 --num-steps 5 -``` - -**Expected Output:** -``` -========================================== -Tiny LLaMA V1 - PyTorch Baseline -========================================== -Configuration: - Batch Size: 4 - Sequence Length: 64 - Number of Steps: 5 - Hidden Dim: 256 - Num Layers: 4 - Num Heads: 8 - -Initializing model... -Model parameters: 2.3M - -Starting training... 
-Step 1/5: Loss = 6.9088, Time = 0.235 seconds -Step 2/5: Loss = 6.9076, Time = 0.045 seconds -Step 3/5: Loss = 6.9065, Time = 0.044 seconds -Step 4/5: Loss = 6.9054, Time = 0.043 seconds -Step 5/5: Loss = 6.9042, Time = 0.043 seconds - -========================================== -Performance Summary: -========================================== -Average time per step: 0.044 seconds -Training speed: 90.9 samples/sec -Peak memory usage: 1234 MB -========================================== -``` - -**If you see this output, your environment is ready!** - - ---- - -## 2. Understanding Tiny LLaMA Architecture - -### 2.1 Model Overview - -Tiny LLaMA is a scaled-down version of the LLaMA architecture, designed for educational purposes and profiling workshops. It uses the standard transformer decoder architecture with modern enhancements. - -**Model Configuration (Default):** - -```python -vocab_size = 1000 # Small vocabulary for workshop -hidden_dim = 256 # Model dimension (D) -n_layers = 4 # Number of transformer layers -n_heads = 8 # Number of attention heads -n_kv_heads = 4 # Number of key-value heads (GQA) -intermediate_dim = 512 # FFN intermediate dimension -max_seq_len = 128 # Maximum sequence length -``` - -**Model Size:** -- Parameters: ~2.9 million -- Memory footprint: ~11 MB (FP32) -- Training memory (batch=8, seq=128): ~200-500 MB (includes activations, gradients, optimizer states) - -**Detailed Parameter Calculation:** - -Understanding how we arrive at ~2.9M parameters: - -1. **Token Embeddings**: - - Shape: [vocab_size, hidden_dim] = [1000, 256] - - Parameters: 1000 × 256 = 256,000 - -2. **Per Transformer Layer** (4 layers total): - - a. **RMSNorm (×2 per layer)**: - - Pre-attention norm: hidden_dim = 256 parameters - - Pre-FFN norm: hidden_dim = 256 parameters - - Total: 2 × 256 = 512 parameters per layer - - b. 
**Multi-Head Attention with GQA** (Grouped Query Attention): - - **Q projection**: [hidden_dim, hidden_dim] = [256, 256] = 65,536 parameters - - **K projection** (GQA): [hidden_dim, head_dim × n_kv_heads] = [256, 32 × 4] = [256, 128] = 32,768 parameters - - Why smaller? GQA uses fewer key/value heads (4) than query heads (8) - - head_dim = hidden_dim / n_heads = 256 / 8 = 32 - - **V projection** (GQA): [256, 128] = 32,768 parameters - - **O projection** (output): [256, 256] = 65,536 parameters - - **Total Attention**: 65,536 + 32,768 + 32,768 + 65,536 = 196,608 parameters per layer - - c. **SwiGLU Feed-Forward Network**: - - **Gate projection**: [hidden_dim, intermediate_dim] = [256, 512] = 131,072 parameters - - **Up projection**: [256, 512] = 131,072 parameters - - **Down projection**: [intermediate_dim, hidden_dim] = [512, 256] = 131,072 parameters - - **Total FFN**: 131,072 + 131,072 + 131,072 = 393,216 parameters per layer - - d. **Total per layer**: 512 + 196,608 + 393,216 = 590,336 parameters - - e. **All 4 layers**: 4 × 590,336 = 2,361,344 parameters - -3. **Final Components**: - - **Final RMSNorm**: 256 parameters - - **Output projection** (LM head): [hidden_dim, vocab_size] = [256, 1000] = 256,000 parameters - - **Total**: 256 + 256,000 = 256,256 parameters - -4. 
**Grand Total**: - - Embeddings: 256,000 - - All layers: 2,361,344 - - Final components: 256,256 - - **Total**: 256,000 + 2,361,344 + 256,256 = **2,873,600 parameters ≈ 2.9M** - -**Memory Footprint Calculation:** -- FP32: 4 bytes per parameter -- Total memory: 2,873,600 × 4 bytes = 11,494,400 bytes ≈ **11.0 MB** - -**Training Memory Breakdown** (batch_size=8, seq_len=128): - -Per-layer memory requirements: -- **Input activations**: [B, S, D] = [8, 128, 256] = 262,144 elements → 1.05 MB -- **Q, K, V tensors**: 3 × [8, 128, 256] → 3.15 MB -- **Attention scores**: [B, H, S, S] = [8, 8, 128, 128] = 1,048,576 elements → 4.19 MB -- **FFN intermediates**: 2 × [B, S, D_ff] = 2 × [8, 128, 512] → 4.19 MB -- **Per-layer subtotal**: ~15.7 MB × 4 layers = **~63 MB** - -Training overhead: -- **Gradients** (same size as activations): ~63 MB -- **Parameter gradients**: 2.9M × 4 bytes = ~11 MB -- **Optimizer states** (Adam: momentum + variance): 2.9M × 2 × 4 bytes = ~22 MB - -**Total training memory**: 63 + 63 + 11 + 22 = **~160 MB** - -Note: Actual PyTorch memory usage will be 200-500 MB due to: -- Framework overhead -- Memory fragmentation -- Temporary buffers -- CUDA kernels and workspace - -### 2.2 Transformer Layer Architecture - -Each transformer layer consists of: - -1. **RMSNorm** (Root Mean Square Normalization) -2. **Multi-Head Attention** with RoPE -3. **Residual Connection** -4. **RMSNorm** -5. **Feed-Forward Network** (SwiGLU) -6. 
**Residual Connection** - -**Visual Structure:** - -``` -Input (B, S, D) - ↓ -┌───────────────────────────────────────┐ -│ RMSNorm │ -└───────────────────────────────────────┘ - ↓ -┌───────────────────────────────────────┐ -│ Multi-Head Attention │ -│ ┌─────────────────────────────────┐ │ -│ │ Q, K, V Projections │ │ -│ │ RoPE (Rotary Position Encoding) │ │ -│ │ Attention Computation │ │ -│ │ Output Projection │ │ -│ └─────────────────────────────────┘ │ -└───────────────────────────────────────┘ - ↓ - Residual Add - ↓ -┌───────────────────────────────────────┐ -│ RMSNorm │ -└───────────────────────────────────────┘ - ↓ -┌───────────────────────────────────────┐ -│ Feed-Forward Network (SwiGLU) │ -│ ┌─────────────────────────────────┐ │ -│ │ Gate Projection │ │ -│ │ Up Projection │ │ -│ │ SiLU Activation │ │ -│ │ Element-wise Multiply │ │ -│ │ Down Projection │ │ -│ └─────────────────────────────────┘ │ -└───────────────────────────────────────┘ - ↓ - Residual Add - ↓ -Output (B, S, D) -``` - -### 2.3 Multi-Head Attention Implementation - -**Standard PyTorch Attention (Version 1 Baseline):** - -The baseline uses separate linear projections for Query, Key, and Value: - -```python -def attention_forward(self, hidden_states, attention_mask=None): - batch_size, seq_len, _ = hidden_states.size() - - # STEP 1: Separate linear projections (3 kernel launches) - query = self.q_proj(hidden_states) # [B, S, D] -> [B, S, D] - key = self.k_proj(hidden_states) # [B, S, D] -> [B, S, D] - value = self.v_proj(hidden_states) # [B, S, D] -> [B, S, D] - - # STEP 2: Reshape for multi-head attention - query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - key = key.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - value = value.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - # Result: [B, H, S, head_dim] - - # STEP 3: Apply rotary position embeddings - query, key = self.rotary_emb(query, key, seq_len) 
- - # STEP 4: Compute attention scores - # attn_weights: [B, H, S, S] - attn_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - # STEP 5: Softmax over last dimension - attn_weights = F.softmax(attn_weights, dim=-1) - - # STEP 6: Apply attention to values - attn_output = torch.matmul(attn_weights, value) - # Result: [B, H, S, head_dim] - - # STEP 7: Reshape and project output - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(batch_size, seq_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - return attn_output -``` - -**Performance Characteristics:** - -- **3 separate linear projections:** Creates kernel launch overhead -- **Attention matrix materialization:** O(S^2) memory usage per head -- **Memory-bound operations:** Multiple tensor reshapes -- **Sequential execution:** Limited parallelization - -**FLOP Count (per layer):** - -Understanding FLOP calculations for attention operations with example configuration (B=8, S=128, D=256, H=8, head_dim=32): - -**Linear Projection FLOP Formula:** -For a matrix multiplication: `output = input @ weight` -- Input shape: [B, S, D_in] -- Weight shape: [D_in, D_out] -- FLOPs = 2 × B × S × D_in × D_out - - Factor of 2: Each multiply-accumulate (MAC) operation counts as 2 FLOPs (1 multiply + 1 add) - - We perform B × S output positions, each requiring D_in × D_out operations - -**Attention FLOP Calculations:** - -1. **Q, K, V Projections** (3 separate linear layers): - - Each projection: [B, S, D] → [B, S, D] - - FLOPs per projection: 2 × B × S × D × D - - Calculation: 2 × 8 × 128 × 256 × 256 = 134,217,728 ≈ 134.2M FLOPs - - Total for Q, K, V: 3 × 134.2M = 402.6M FLOPs - -2. 
**Attention Scores** (Q @ K^T): - - After reshaping: Q and K are [B, H, S, head_dim] - - For each head: [S, head_dim] @ [head_dim, S] → [S, S] - - FLOPs: 2 × B × H × S × S × head_dim - - Calculation: 2 × 8 × 8 × 128 × 128 × 32 = 67,108,864 ≈ 67.1M FLOPs - - Why: For each of B×H attention matrices, we compute S×S scores, each requiring head_dim multiply-accumulates - -3. **Attention Application** (Softmax @ V): - - Attention weights [B, H, S, S] @ Values [B, H, S, head_dim] → [B, H, S, head_dim] - - FLOPs: 2 × B × H × S × S × head_dim - - Calculation: 2 × 8 × 8 × 128 × 128 × 32 = 67.1M FLOPs - - Same as attention scores computation - -4. **Output Projection**: - - [B, S, D] → [B, S, D] - - FLOPs: 2 × B × S × D × D - - Calculation: 2 × 8 × 128 × 256 × 256 = 134.2M FLOPs - -**Summary:** -``` -Q projection: 134.2M FLOPs -K projection: 134.2M FLOPs -V projection: 134.2M FLOPs -Attention scores: 67.1M FLOPs -Softmax: ~0.1M FLOPs (negligible, element-wise) -Attention application: 67.1M FLOPs -Output projection: 134.2M FLOPs -───────────────────────────────── -Total Attention: ~671M FLOPs per layer -``` - -**Key Insights:** -- Linear projections (Q, K, V, O) dominate: 536.8M FLOPs (80% of attention) -- Attention computation (scores + application): 134.2M FLOPs (20% of attention) -- Quadratic term (S × S) appears in attention scores but with small head_dim coefficient -- For longer sequences, the S^2 term becomes more significant - -### 2.4 SwiGLU Feed-Forward Network - -**Implementation:** - -```python -def swiglu_forward(self, hidden_states): - # STEP 1: Separate gate and up projections (2 kernel launches) - gate = self.gate_proj(hidden_states) # [B, S, D] -> [B, S, D_ff] - up = self.up_proj(hidden_states) # [B, S, D] -> [B, S, D_ff] - - # STEP 2: SiLU activation (Swish) - gate_activated = F.silu(gate) # Element-wise operation - - # STEP 3: Element-wise multiplication - intermediate = gate_activated * up # [B, S, D_ff] - - # STEP 4: Down projection - output = 
self.down_proj(intermediate) # [B, S, D_ff] -> [B, S, D] - - return output -``` - -**Why SwiGLU?** -- Better than standard ReLU activation -- Gating mechanism improves model capacity -- Used in modern LLMs (LLaMA, PaLM) - -**Performance Characteristics:** -- **Separate gate/up projections:** Can be fused into single GEMM -- **Intermediate tensor storage:** Memory overhead -- **Sequential activation:** SiLU can be fused with multiplication - -**FLOP Count (per layer):** - -Understanding FLOP calculations for feed-forward network with example configuration (B=8, S=128, D=256, D_ff=512): - -**FFN FLOP Calculations:** - -1. **Gate Projection**: - - Transform: [B, S, D] → [B, S, D_ff] - - Weight matrix: [D, D_ff] = [256, 512] - - FLOPs: 2 × B × S × D × D_ff - - Calculation: 2 × 8 × 128 × 256 × 512 = 268,435,456 ≈ 268.4M FLOPs - - Explanation: For each of B×S positions, multiply a D-dimensional vector by a [D, D_ff] matrix - -2. **Up Projection**: - - Same dimensions as gate projection: [B, S, D] → [B, S, D_ff] - - FLOPs: 2 × B × S × D × D_ff = 268.4M FLOPs - - Calculation: 2 × 8 × 128 × 256 × 512 = 268.4M FLOPs - -3. **SiLU Activation**: - - Element-wise operation: silu(x) = x × sigmoid(x) - - Applied to gate tensor: [B, S, D_ff] - - FLOPs: ~3 × B × S × D_ff (sigmoid + multiply) ≈ 0.01M FLOPs - - Negligible compared to matrix multiplications - -4. **Element-wise Multiply**: - - gate_activated × up: [B, S, D_ff] element-wise - - FLOPs: B × S × D_ff = 8 × 128 × 512 ≈ 0.5M FLOPs - - Negligible compared to linear projections - -5. 
**Down Projection**: - - Transform: [B, S, D_ff] → [B, S, D] - - Weight matrix: [D_ff, D] = [512, 256] - - FLOPs: 2 × B × S × D_ff × D - - Calculation: 2 × 8 × 128 × 512 × 256 = 268,435,456 ≈ 268.4M FLOPs - -**Summary:** -``` -Gate projection: 268.4M FLOPs -Up projection: 268.4M FLOPs -Down projection: 268.4M FLOPs -SiLU activation: ~0.01M FLOPs (negligible) -Element-wise multiply: ~0.5M FLOPs (negligible) -───────────────────────────────── -Total FFN: ~805.3M FLOPs per layer -``` - -**Key Insights:** -- Three linear projections dominate: 805.2M FLOPs (>99.9% of FFN) -- Element-wise operations (SiLU, multiply) are negligible: <1M FLOPs combined -- FFN is more compute-intensive than attention: 805M vs 671M FLOPs -- Gate and up projections can be fused to reduce memory bandwidth -- D_ff is typically 2-4× larger than D, making FFN compute-bound - -### 2.5 RMSNorm (Root Mean Square Normalization) - -**Implementation:** - -```python -def rms_norm_forward(self, hidden_states): - input_dtype = hidden_states.dtype - - # Compute RMS - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # Apply learned scale - return (self.weight * hidden_states).to(input_dtype) -``` - -**Why RMSNorm instead of LayerNorm?** -- Simpler: No mean subtraction -- Faster: Fewer operations -- Same effectiveness for LLMs -- Less memory bandwidth - -**Performance Characteristics:** -- Memory-bound operation -- Reduction over hidden dimension -- Opportunity for fusion with adjacent operations - -### 2.6 Complete Layer FLOP Breakdown - -For a single transformer layer with batch_size=8, seq_len=128: - -``` -Component | FLOPs | Percentage -------------------------|--------------|------------ -Attention QKV Proj | 402.6M | 27.3% -Attention Computation | 134.2M | 9.1% -Attention Output Proj | 134.2M | 9.1% -FFN Gate/Up Proj | 536.8M | 36.4% -FFN Down Proj | 268.4M | 18.2% -RMSNorm (x2) | ~0.5M | <0.1% 
-------------------------|--------------|------------ -Total per Layer | ~1,476M | 100% -Total Model (4 layers) | ~5.91B | - -``` - -**Corrected Calculations:** -- Attention QKV: 3 × 134.2M = 402.6M FLOPs -- Attention scores + application: 67.1M + 67.1M = 134.2M FLOPs -- Attention output: 134.2M FLOPs -- FFN gate + up: 2 × 268.4M = 536.8M FLOPs -- FFN down: 268.4M FLOPs -- Total per layer: 402.6 + 134.2 + 134.2 + 536.8 + 268.4 + 0.5 = 1,476.7M ≈ 1.48B FLOPs -- Total model (4 layers): 4 × 1.48B = 5.92B FLOPs per forward pass - -**Key Observations:** -- FFN dominates compute: ~54.6% of FLOPs (gate/up/down projections) -- Attention: ~45.5% of FLOPs -- RMSNorm negligible: <0.1% of FLOPs -- Linear projections (GEMM operations) account for >99% of all FLOPs - -### 2.7 Memory Layout and Access Patterns - -**Memory Requirements (batch_size=8, seq_len=128):** - -``` -Component | Memory (MB) | Notes ------------------------|-------------|--------------------------- -Model Parameters | 9.2 | Weights only (FP32) -Optimizer States | 36.8 | Adam: 2× params (m, v) -Input Activations | 1.0 | Per layer -Attention Activations | 4.2 | Intermediate tensors -FFN Activations | 2.1 | Intermediate tensors -Gradients | 9.2 | Same as parameters -Attention Matrix | 1.0 | [B, H, S, S] per layer ------------------------|-------------|--------------------------- -Total (approximate) | 63.5 MB | Can vary with framework -``` - -**Memory Bandwidth Patterns:** - -- **Attention:** Memory-bound (many small operations, reshapes) -- **FFN:** Compute-bound (large GEMMs with high arithmetic intensity) -- **RMSNorm:** Memory-bound (reduction operations) - ---- - -## 3. 
Understanding the Baseline Implementation - -### 3.1 Code Structure Overview - -The `tiny_llama_v1.py` file is organized into several key components: - -``` -tiny_llama_v1.py -├── Configuration Classes -│ ├── TinyLlamaConfig (model configuration) -│ └── ProfilerConfig (profiling options) -├── Model Components -│ ├── RMSNorm (normalization layer) -│ ├── RotaryEmbedding (position encoding) -│ ├── Attention (multi-head attention) -│ ├── MLP (SwiGLU feed-forward) -│ ├── TransformerBlock (complete layer) -│ └── TinyLlamaModel (full model) -├── Training Infrastructure -│ ├── Optimizer setup -│ ├── Loss computation -│ └── Training loop -└── Profiling Integration - ├── PyTorch Profiler setup - ├── DeepSpeed FLOPS profiler - └── Performance reporting -``` - -### 3.2 Command-Line Arguments - -Understanding the available options: - -**Basic Training Arguments:** - -```bash ---batch-size 8 # Number of samples per batch ---seq-len 128 # Sequence length ---num-steps 50 # Number of training steps ---learning-rate 1e-4 # Optimizer learning rate ---device cuda # Device to use (cuda/cpu) -``` - -**Model Configuration:** - -```bash ---hidden-dim 256 # Model hidden dimension ---n-layers 4 # Number of transformer layers ---n-heads 8 # Number of attention heads ---intermediate-dim 512 # FFN intermediate size -``` - -**Profiling Options:** - -```bash ---enable-pytorch-profiler # Enable PyTorch profiler ---profile-dir ./profiles # Directory for profile output ---profile-memory # Include memory profiling ---profile-operators # Detailed operator profiling ---profile-steps 5 # Number of steps to profile -``` - -**DeepSpeed FLOPS Profiling:** - -```bash ---enable-deepspeed-flops # Enable FLOPS profiler ---flops-profile-step 10 # Which step to profile -``` - -**Other Options:** - -```bash ---seed 42 # Random seed for reproducibility ---deterministic # Enable deterministic operations ---output-dir ./output # Directory for outputs ---log-interval 10 # Logging frequency -``` - -### 3.3 Profiling 
Integration Points - -The code includes several profiling integration points: - -**PyTorch Profiler Context:** - -```python -# In training loop -with torch.profiler.profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - record_shapes=True, - profile_memory=True, - with_stack=True, - with_flops=True -) as prof: - # Training step - outputs = model(inputs) - loss = criterion(outputs) - loss.backward() - optimizer.step() - -# Export results -prof.export_chrome_trace("trace.json") -``` - -**NVTX Annotations:** - -```python -# Mark important regions -with nvtx.range("attention_forward"): - attn_output = attention(hidden_states) - -with nvtx.range("ffn_forward"): - ffn_output = feed_forward(hidden_states) -``` - -**DeepSpeed FLOPS Profiler:** - -```python -from deepspeed.profiling.flops_profiler import FlopsProfiler - -profiler = FlopsProfiler(model) -profiler.start_profile() -# Forward pass -profiler.stop_profile() -profiler.print_model_profile(profile_step=1) -``` - -### 3.4 Expected Kernel Launch Pattern - -For a single training step, the baseline implementation generates: - -``` -Per Transformer Layer (~17 kernel launches): -├── RMSNorm (pre-attention) : 1 kernel -├── Q Projection : 1 kernel -├── K Projection : 1 kernel -├── V Projection : 1 kernel -├── RoPE (query) : 1 kernel -├── RoPE (key) : 1 kernel -├── Attention scores (QK^T) : 1 kernel -├── Softmax : 1 kernel -├── Attention application (softmax*V): 1 kernel -├── Output Projection : 1 kernel -├── Residual Add : 1 kernel -├── RMSNorm (pre-FFN) : 1 kernel -├── Gate Projection : 1 kernel -├── Up Projection : 1 kernel -├── SiLU Activation : 1 kernel -├── Element-wise Multiply : 1 kernel -└── Down Projection : 1 kernel - -Total per step (4 layers): ~68 kernels (forward only) -With backward pass: ~136 kernels per step -``` - -**Optimization Implications:** -- High kernel launch overhead -- Many small operations -- Opportunities for fusion - -### 3.5 Running the Baseline - -**Quick Start:** - -```bash 
-# Basic run without profiling -./run_baseline.sh - -# Or manually -python3 tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 50 -``` - -**With PyTorch Profiler:** - -```bash -# Using helper script -./run_pytorch_profiler.sh - -# Or manually -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 20 \ - --enable-pytorch-profiler \ - --profile-dir ./pytorch_profiles \ - --profile-memory -``` - -**With DeepSpeed FLOPS Profiler:** - -```bash -# Using helper script -./run_deepspeed_flops.sh - -# Or manually -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 20 \ - --enable-deepspeed-flops \ - --flops-profile-step 10 -``` - ---- - -## 4. Exercise 1: Baseline Performance Analysis - -### 4.1 Objective - -Establish baseline performance metrics for Tiny LLaMA V1 and understand the profiling methodology that will be used throughout the workshop. - -**What you'll learn:** -- How to run the baseline model -- How to enable and use PyTorch Profiler -- How to interpret basic profiling output -- What "good" performance looks like for this model -- How to identify top operations consuming time - -### 4.2 Step-by-Step Instructions - -#### Step 1: Run Baseline Training - -First, let's run the basic model without any profiling to establish a clean baseline: - -```bash -# Navigate to version1_pytorch_baseline directory -cd ~/castille-ai-workshop-training/version1_pytorch_baseline/ - -# Run basic training -python3 tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 20 -``` - -**Expected Output:** - -``` -========================================== -Tiny LLaMA V1 - PyTorch Baseline -========================================== -Configuration: - Batch Size: 8 - Sequence Length: 128 - Number of Steps: 20 - Hidden Dim: 256 - Num Layers: 4 - Num Heads: 8 - Intermediate Dim: 512 - -Model Configuration: - Total Parameters: 2,345,984 - Model Size: 9.2 MB (FP32) - -Initializing model and optimizer... 
-Using device: cuda -GPU: AMD Instinct MI250X - -Starting training... -Step 1/20: Loss = 6.9088, Time = 0.234 seconds -Step 2/20: Loss = 6.9076, Time = 0.046 seconds -Step 3/20: Loss = 6.9065, Time = 0.045 seconds -Step 4/20: Loss = 6.9054, Time = 0.044 seconds -... -Step 20/20: Loss = 6.8821, Time = 0.044 seconds - -========================================== -Performance Summary: -========================================== -Average time per step: 0.045 seconds -Training speed: 177.8 samples/sec -Peak memory usage: 2847 MB -Avg time per forward: 0.022 seconds -Avg time per backward: 0.018 seconds -Avg time per optimizer: 0.005 seconds -========================================== -``` - -**Record the following baseline metrics:** -- Training speed: _____ samples/sec -- Peak memory usage: _____ MB -- Avg time per step: _____ ms -- GPU name and memory - -**Key Observations:** - -1. **First iteration is slower:** Step 1 takes ~234ms vs ~44ms for subsequent steps - - Reason: Kernel compilation, memory allocation, cache warming - - **Always exclude first iteration from measurements** - -2. **Consistent timing:** Steps 2-20 have similar timing - - Good sign: stable performance - - Small variance indicates consistent GPU utilization - -3. 
**Memory usage:** ~2.8 GB for this configuration - - Includes: Model weights (9 MB) + optimizer states (36 MB) + activations + gradients - -#### Step 2: Enable PyTorch Profiler - -Now let's add PyTorch profiler to understand what's happening under the hood: - -```bash -# Run with PyTorch profiler enabled -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 20 \ - --enable-pytorch-profiler \ - --profile-dir ./exercise1_profiles \ - --profile-steps 5 -``` - -**What this does:** -- Profiles steps 5-10 (after warmup) -- Records CPU and GPU operations -- Tracks memory allocations -- Generates TensorBoard-compatible traces - -**Expected Output:** - -``` -========================================== -Tiny LLaMA V1 - PyTorch Baseline (Profiling Enabled) -========================================== -... (same as before) ... - -Profiling enabled: Steps 5-10 -Profile data will be saved to: ./exercise1_profiles/ - -Step 1/20: Loss = 6.9088, Time = 0.245 seconds -Step 2/20: Loss = 6.9076, Time = 0.048 seconds -Step 3/20: Loss = 6.9065, Time = 0.047 seconds -Step 4/20: Loss = 6.9054, Time = 0.046 seconds -Step 5/20: Loss = 6.9043, Time = 0.052 seconds [PROFILING] -Step 6/20: Loss = 6.9032, Time = 0.053 seconds [PROFILING] -... -Step 10/20: Loss = 6.8989, Time = 0.052 seconds [PROFILING] -Step 11/20: Loss = 6.8978, Time = 0.046 seconds -... - -Profiling complete! -Profile files generated: - - ./exercise1_profiles/trace_step_5_10.json - - ./exercise1_profiles/events.out.tfevents.* - - ./exercise1_profiles/performance_summary.json - -Average time per step: 0.048 seconds (with profiling overhead) -Training speed: 166.7 samples/sec -Peak memory usage: 3124 MB -``` - -**Answer these questions in your results file:** - -1. How much overhead did profiling add to training time? - - Without profiling: ~0.045 seconds/step - - With profiling: ~0.048-0.052 seconds/step - - Overhead: ~6-15% (acceptable for profiling) - -2. 
What files were generated in the `exercise1_profiles/` directory? - -```bash -ls -lh ./exercise1_profiles/ -``` - -3. What's the difference in memory usage with profiling enabled? - - Extra memory needed for profiler data structures - -#### Step 3: Analyze Profiling Results with TensorBoard - -Launch TensorBoard to visualize the profiling results: - -```bash -# Launch TensorBoard (run in background or separate terminal) -tensorboard --logdir ./exercise1_profiles --port 6006 - -# If TensorBoard is not available, examine JSON traces -# We'll show alternative analysis methods below -``` - -**TensorBoard Analysis:** - -1. Open your browser to `http://localhost:6006` (or your server address) -2. Navigate to the "PROFILE" tab -3. Select the most recent run - -**Explore the following views:** - -**A. Overview Page:** - -- **Performance Summary:** Shows step time breakdown -- **Run Environment:** GPU model, driver version, CUDA/ROCm version -- **Recommendation:** TensorBoard may suggest optimizations - -**B. Trace Viewer:** - -- Timeline of CPU and GPU operations -- Each row represents a thread or GPU stream -- Zoom in to see individual kernel launches -- Look for: - - GPU idle time (gaps in GPU timeline) - - CPU bottlenecks - - Memory transfer operations - -**C. 
Operator View:** - -Shows aggregated statistics for each operation type: - -``` -Top Operations by Total Time: -Operation | Calls | GPU Time | CPU Time | Total Time ------------------------------------|-------|----------|----------|------------ -aten::mm (matrix multiply) | 240 | 18.5 ms | 0.2 ms | 18.7 ms -aten::addmm (matrix multiply+add) | 480 | 15.3 ms | 0.3 ms | 15.6 ms -aten::bmm (batch matrix multiply) | 160 | 12.1 ms | 0.1 ms | 12.2 ms -aten::softmax | 80 | 8.4 ms | 0.1 ms | 8.5 ms -aten::mul (element-wise multiply) | 320 | 3.2 ms | 0.1 ms | 3.3 ms -aten::add_ (in-place add) | 160 | 2.8 ms | 0.1 ms | 2.9 ms -aten::silu (SiLU activation) | 80 | 2.1 ms | 0.1 ms | 2.2 ms -aten::rsqrt (RMSNorm) | 160 | 1.5 ms | 0.1 ms | 1.6 ms -``` - -**Document in your results file:** - -**Top 3 longest-running operations:** -1. _________________ -2. _________________ -3. _________________ - -**D. Memory Timeline:** - -- Shows memory allocation over time -- Peak memory during forward pass or backward pass? -- Memory spikes indicate large tensor allocations - -**Document:** -- Peak memory: _____ MB -- When does peak occur: Forward / Backward / Optimizer -- Are there memory spikes? Yes / No - -#### Step 4: Alternative Analysis (Without TensorBoard) - -If TensorBoard is not available, analyze the JSON trace directly: - -```bash -# View performance summary -cat ./exercise1_profiles/performance_summary.json | python3 -m json.tool -``` - -Use the Chrome trace viewer or analysis tools to identify the top operations by execution time. 
Look for patterns in: -- Matrix multiplication operations (mm, addmm, bmm) -- Attention-related kernels -- FFN operations -- Normalization operations - -#### Step 5: Identify Performance Patterns - -Based on your analysis, identify patterns in the baseline model: - -**Check these patterns in your results:** - -**Compute Patterns:** - -- [ ] Matrix multiplications (mm, addmm, bmm) dominate compute time -- [ ] Attention operations consume ~35-45% of total time -- [ ] FFN operations consume ~30-40% of total time -- [ ] Many small operations with low individual utilization -- [ ] Kernel launch overhead visible in timeline - -**Memory Patterns:** - -- [ ] Memory usage grows during forward pass -- [ ] Peak memory during attention computation -- [ ] Gradient tensors allocated during backward pass -- [ ] Frequent small allocations for intermediate tensors - -**Optimization Opportunities:** - -Based on the profiling results, rank these optimizations by potential benefit: - -- [ ] **High:** Kernel fusion (reduce number of operations) -- [ ] **High:** Fused QKV projection in attention -- [ ] **High:** Flash Attention implementation (reduce memory) -- [ ] **Medium:** Memory layout optimization -- [ ] **Medium:** Mixed precision training (FP16) -- [ ] **Low:** Batch size scaling (already reasonable) - -### 4.3 Expected Results - -After completing this exercise, you should have: - -#### Performance Baseline - -Representative ranges (actual results will vary by hardware): - -- **Training Speed:** 50-200 samples/sec -- **GPU Utilization:** 60-75% (typical for baseline PyTorch) -- **Memory Usage:** 2-4 GB (depends on batch size) -- **Kernel Count:** 60-80 different kernel launches per step -- **MFU (estimated):** 20-35% (memory-bound workload) - -#### Key Observations - -1. **Attention operations consume ~35-45% of total compute time** - - QKV projections: separate kernel launches - - Attention computation: O(S^2) memory complexity - - Softmax: memory-bound operation - -2. 
**FFN operations consume ~30-40% of total time** - - Gate/Up projections: separate operations - - SwiGLU: sequential activation and multiplication - -3. **Matrix multiplications (GEMM) are the dominant kernels** - - Linear layers in projections - - Attention score computation - - Good candidates for optimization - -4. **Multiple small operations create kernel launch overhead** - - Element-wise operations (add, multiply, activation) - - Normalization layers - - Residual connections - -5. **Memory allocation patterns show optimization opportunities** - - Intermediate tensors in attention - - Separate activations in FFN - - Gradient storage - -#### Profiling Data Generated - -``` -exercise1_profiles/ -├── trace_step_5_10.json # Chrome trace format -├── events.out.tfevents.* # TensorBoard events -├── performance_summary.json # Aggregated metrics -└── memory_timeline.json # Memory usage over time -``` - -### 4.4 Troubleshooting - -#### Common Issues - -**1. CUDA/ROCm Memory Errors** - -```bash -# Error: RuntimeError: CUDA out of memory -# Solution: Reduce batch size or sequence length -python3 tiny_llama_v1.py --batch-size 4 --seq-len 64 --num-steps 10 -``` - -**2. Profiling Files Not Generated** - -```bash -# Check permissions and disk space -ls -la ./exercise1_profiles/ -df -h . - -# Create directory manually -mkdir -p exercise1_profiles -chmod 755 exercise1_profiles -``` - -**3. TensorBoard Not Loading** - -```bash -# Try different port -tensorboard --logdir ./exercise1_profiles --port 6007 - -# Check if port is in use -netstat -tuln | grep 6006 - -# Or examine JSON files directly (see alternative analysis above) -``` - -**4. Low GPU Utilization** - -```bash -# Check if GPU is being used -rocm-smi - -# Monitor GPU during training (in separate terminal) -watch -n 1 rocm-smi - -# Check for CPU bottlenecks -htop -``` - -**5. 
Inconsistent Timing** - -```bash -# Ensure no other processes are using GPU -rocm-smi - -# Run with deterministic mode -python3 tiny_llama_v1.py --deterministic --seed 42 -``` - -### 4.5 Analysis Questions - -Answer these questions based on your results: - -**1. What is the primary bottleneck in the baseline model?** - - [ ] Memory bandwidth (many small operations) - - [ ] Compute utilization (GPU not fully utilized) - - [ ] Kernel launch overhead (too many launches) - - [ ] Data loading (CPU bottleneck) - -**Answer:** Likely a combination of memory bandwidth and kernel launch overhead. The baseline has many small operations that don't fully utilize the GPU. - -**2. Which operations would benefit most from fusion?** - - [ ] QKV projections in attention - - [ ] Gate/Up projections in SwiGLU - - [ ] Layer normalization operations - - [ ] All of the above - -**Answer:** All of the above. Version 2 will address these with kernel fusion. - -**3. What percentage of time is spent in attention vs FFN?** - -Based on profiling data: -- Attention: ~_____% -- FFN: ~_____% -- Other (norms, residuals): ~_____% - -**4. Based on memory usage patterns, what optimization would help most?** - - [ ] Gradient checkpointing (reduce activation memory) - - [ ] Flash Attention (reduce attention memory from O(S^2) to O(S)) - - [ ] Mixed precision (reduce memory footprint by 2x) - - [ ] Tensor fusion (reduce intermediate tensor allocations) - -**Answer:** Flash Attention for long sequences, tensor fusion for overall efficiency. - -### 4.6 Key Takeaways - -**What We Learned:** - -1. **Baseline performance characteristics:** - - Training speed: _____ samples/sec (record your value) - - GPU utilization: Moderate (60-75%) - - Memory usage: Reasonable for batch size - -2. **Primary bottlenecks identified:** - - Separate kernel launches for QKV, Gate/Up projections - - O(S^2) memory usage in attention - - Memory bandwidth limitations - -3. 
**Optimization targets for Version 2:** - - QKV fusion (combine 3 operations into 1) - - SwiGLU fusion (combine gate/up projections) - - Custom fused kernels for common patterns - -4. **Profiling methodology:** - - PyTorch Profiler provides detailed operator-level insights - - TensorBoard visualization helps identify patterns - - JSON traces enable programmatic analysis - -**Next Steps:** - -- Document your findings -- Compare with expected results (are your metrics in the expected ranges?) -- Identify top 3 optimization targets for Version 2 -- Save your profiling data for comparison with optimized versions - -**Exercise Complete When:** - -- [ ] Baseline training runs successfully -- [ ] Profiling data generated and analyzed -- [ ] Performance metrics documented -- [ ] Top operations identified -- [ ] Bottlenecks understood -- [ ] Ready to proceed to memory analysis - ---- - -**Next Exercise:** [Exercise 2 - Memory Analysis & Optimization](#5-exercise-2-memory-analysis--optimization) - ---- - -## 5. Exercise 2: Memory Analysis & Optimization - -### 5.1 Objective - -Understand memory usage patterns, identify memory bottlenecks, and analyze memory bandwidth utilization in the baseline Tiny LLaMA model. 
- -**What you'll learn:** -- How memory scales with batch size and sequence length -- Where peak memory is consumed (forward, backward, optimizer) -- Memory bandwidth utilization patterns -- How to identify memory-bound vs compute-bound operations -- Memory optimization opportunities - -### 5.2 Background: Why Memory Matters - -Memory optimization is crucial for transformer models because: - -**Memory Bandwidth:** -- Often the limiting factor, especially for small models -- Modern GPUs have very high compute (TFLOPS) but limited bandwidth (TB/s) -- Memory-bound operations don't fully utilize GPU compute - -**Peak Memory:** -- Determines maximum batch size and model size -- Out-of-memory (OOM) errors are common -- Larger batches → better GPU utilization - -**Memory Fragmentation:** -- Multiple small allocations reduce effective memory -- Garbage collection overhead -- Can cause OOM even with available memory - -**Attention Memory:** -- Quadratic scaling: O(S^2) with sequence length -- Major bottleneck for long sequences -- Target for Flash Attention optimization - -### 5.3 Step-by-Step Instructions - -#### Step 1: Memory-Focused Profiling - -Run profiling with enhanced memory analysis for different batch sizes: - -```bash -# Batch size 4 -python3 tiny_llama_v1.py \ - --batch-size 4 \ - --seq-len 128 \ - --num-steps 15 \ - --enable-pytorch-profiler \ - --profile-memory \ - --profile-dir ./memory_analysis_bs4 - -# Batch size 8 -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 15 \ - --enable-pytorch-profiler \ - --profile-memory \ - --profile-dir ./memory_analysis_bs8 - -# Batch size 16 -python3 tiny_llama_v1.py \ - --batch-size 16 \ - --seq-len 128 \ - --num-steps 15 \ - --enable-pytorch-profiler \ - --profile-memory \ - --profile-dir ./memory_analysis_bs16 -``` - -**Expected Output for Each Run:** - -``` -========================================== -Tiny LLaMA V1 - Memory Profiling -========================================== 
-Configuration: - Batch Size: 8 - Sequence Length: 128 - ... - -Memory Profiling Enabled - -Step 1/15: Loss = 6.9088, Time = 0.245 s, Memory = 2847 MB -... -Step 15/15: Loss = 6.8765, Time = 0.046 s, Memory = 2847 MB - -========================================== -Memory Analysis Summary: -========================================== -Peak Memory Usage: 2847 MB -Average Memory Usage: 2654 MB -Memory at Forward Pass: 2123 MB -Memory at Backward Pass: 2847 MB -Memory at Optimizer Step: 2456 MB -Number of Allocations: 1234 -Largest Tensor: 512 MB (attention_scores) -========================================== -``` - -**Record memory usage for each batch size in your results file:** - -| Batch Size | Peak Memory (MB) | Avg Memory (MB) | Training Speed (samples/sec) | -|------------|------------------|-----------------|------------------------------| -| 4 | _______ | _______ | _______ | -| 8 | _______ | _______ | _______ | -| 16 | _______ | _______ | _______ | - -**Questions to Answer:** - -1. **Memory Scaling:** Does memory double when batch size doubles? - - If yes → Linear scaling (good) - - If more than double → Superlinear scaling (fragmentation or inefficiency) - -2. **Throughput Scaling:** Does throughput double when batch size doubles? - - If yes → Perfect scaling - - If less → Diminishing returns (memory bandwidth limit) - -3. **Memory Efficiency:** What's the peak-to-average memory ratio? - - High ratio → Memory spikes, potential for optimization - - Low ratio → Consistent memory usage - -#### Step 2: Memory Timeline Analysis - -Analyze memory patterns using TensorBoard: - -```bash -# Launch TensorBoard for memory analysis -tensorboard --logdir ./memory_analysis_bs8 --port 6007 -``` - -**In TensorBoard:** - -1. Go to the **PROFILE** tab -2. Select **Memory Viewer** or **Memory Timeline** view -3. Examine the memory usage pattern over time - -**What to Look For:** - -**A. 
Memory Allocation Pattern:** - -``` -Memory (MB) - | -3000| ╱‾‾‾‾‾╲ - | / \ -2500| / \___________ - | / -2000| ╱‾‾‾‾‾‾╱ - | / -1500|______/ - | - +-----|-----|-----|-----|-----|------> Time - Fwd Attn FFN Bwd Opt Done -``` - -- **Forward pass:** Memory increases as activations are computed -- **Attention:** Often creates a spike (attention matrices) -- **FFN:** Additional activation memory -- **Backward pass:** Gradient tensors allocated -- **Optimizer:** Parameter updates - -**B. Memory Peaks:** - -Document when peak memory occurs: -- [ ] During forward pass (activations) -- [ ] During attention computation (attention matrices) -- [ ] During backward pass (gradients) -- [ ] During optimizer step (momentum buffers) - -**C. Memory Deallocation:** - -- Are there clear drops in memory usage? -- Does memory return to baseline after each step? -- Are tensors being deallocated promptly? - -**Record in your results file:** - -**Memory Pattern Analysis:** -- Peak memory occurs at: _______________________ -- Largest memory spike caused by: _______________________ -- Memory is deallocated: Promptly / Delayed / Not at all -- Memory usage pattern: Steady / Fluctuating / Spiking - -#### Step 3: Sequence Length Scaling - -Test how memory scales with sequence length: +Load the required modules: ```bash -# Sequence length 64 -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 64 \ - --num-steps 10 \ - --profile-memory \ - --profile-dir ./memory_seq64 - -# Sequence length 128 (baseline) -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 10 \ - --profile-memory \ - --profile-dir ./memory_seq128 - -# Sequence length 256 -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 256 \ - --num-steps 10 \ - --profile-memory \ - --profile-dir ./memory_seq256 - -# Sequence length 512 (might OOM - use smaller batch if needed) -python3 tiny_llama_v1.py \ - --batch-size 4 \ - --seq-len 512 \ - --num-steps 5 \ - --profile-memory \ - --profile-dir 
./memory_seq512 -``` - -**Record sequence length scaling:** - -| Seq Length | Batch Size | Peak Memory (MB) | Memory Increase | Scaling Factor | -|------------|------------|------------------|-----------------|----------------| -| 64 | 8 | _______ | baseline | 1.0x | -| 128 | 8 | _______ | _______ | _______ | -| 256 | 8 | _______ | _______ | _______ | -| 512 | 4 | _______ | _______ | _______ | - -**Memory Scaling Analysis:** - -Calculate the scaling factor: +module load pytorch rocm ``` -Scaling Factor = Memory(S) / Memory(S_baseline) - -For attention memory (theoretical): -- Linear components: O(S) → 2x when S doubles -- Attention matrix: O(S^2) → 4x when S doubles - -Expected combined: ~3x when S doubles (for attention-heavy workloads) -``` - -**Answer these questions:** -1. **What is the memory scaling pattern?** - - [ ] Linear (~2x when sequence doubles) - - [ ] Quadratic (~4x when sequence doubles) - - [ ] Between linear and quadratic (~3x) - -2. **Which component shows steepest memory scaling?** - - Run separate profiling focusing on attention vs FFN - - Check memory timeline for attention layers - -3. **At what sequence length do you hit memory limits?** - - Record the maximum sequence length before OOM - - Note the batch size at that limit - -#### Step 4: Identifying Memory Hotspots - -Use profiling to identify which operations consume most memory: +Use the default case unless there is a reason to change it: ```bash -# Run with detailed operator profiling -python3 tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 10 \ - --enable-pytorch-profiler \ - --profile-memory \ - --profile-operators \ - --profile-dir ./memory_hotspots -``` - -**Analyze the operator memory usage:** - -Review the memory profiling output and trace files to identify operators with highest memory allocation. Use the PyTorch Profiler's memory view or trace analysis to examine memory allocation patterns. - -**Record top memory-consuming operations:** - -1. 
_________________: _______ MB -2. _________________: _______ MB -3. _________________: _______ MB -4. _________________: _______ MB -5. _________________: _______ MB - -**Common Memory Hotspots:** - -- **Attention scores:** `[B, H, S, S]` matrices (quadratic in S) -- **Query/Key/Value states:** `[B, S, D]` tensors -- **FFN intermediate:** `[B, S, D_ff]` tensors -- **Gradients:** Same size as parameters + activations - -#### Step 5: Memory Bandwidth Analysis - -Analyze memory bandwidth utilization: - -**Calculate memory bandwidth manually:** - -For batch_size=8, seq_len=128, hidden_dim=256, n_layers=4: - -1. **Estimate memory traffic per step:** - - Forward pass: QKV weights + activations + FFN weights - - Backward pass: ~2× forward pass - - Total: Depends on model size and batch configuration - -2. **Calculate bandwidth utilization:** - - Memory bandwidth = Total memory traffic / Step time - - Compare with theoretical peak (e.g., MI250X: ~1.6 TB/s per GCD) - - Utilization % = (Actual bandwidth / Peak bandwidth) × 100 - -3. **Calculate arithmetic intensity:** - - Arithmetic intensity = FLOPs / Memory traffic (bytes) - - < 10 FLOPS/byte: Memory-bound - - > 100 FLOPS/byte: Compute-bound - - 10-100 FLOPS/byte: Mixed workload - -Record your observations based on the profiling data collected. 
- -**Record in your results file:** - -**Bandwidth Analysis:** -- Memory Traffic per Step: _______ GB -- Memory Bandwidth Used: _______ GB/s -- Theoretical Peak Bandwidth: _______ GB/s -- Bandwidth Utilization: _______% -- Arithmetic Intensity: _______ FLOPS/byte -- Workload Classification: _______ - -### 5.4 Analysis and Interpretation - -#### Memory Scaling Patterns - -**Batch Size Scaling:** - -Expected pattern: -- Memory ≈ Base + (Batch_size × Per_sample_memory) -- Should be approximately linear -- If superlinear → fragmentation or inefficiency - -**Sequence Length Scaling:** - -Components: -- Linear: Activations, most projections -- Quadratic: Attention matrices `[B, H, S, S]` -- Combined: Between linear and quadratic - -**Typical Results:** - -| Component | S=64 | S=128 | S=256 | Scaling | -|----------------|------|-------|-------|---------| -| Parameters | 9MB | 9MB | 9MB | O(1) | -| Activations | ~1GB | ~2GB | ~4GB | O(S) | -| Attention | ~100MB | ~400MB | ~1.6GB | O(S^2) | -| Total | ~1.1GB | ~2.4GB | ~5.6GB | Mixed | - -#### Memory Bottleneck Classification - -**Workload Type Determination:** - +python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 ``` -Arithmetic Intensity (FLOPS/byte): -- < 10: Memory-bound (bandwidth limited) -- 10-100: Mixed workload -- > 100: Compute-bound (ALU limited) -Typical Transformer Training: 20-50 FLOPS/byte (mixed, leaning memory-bound) -``` - -**Signs of Memory-Bound Workload:** -- Low GPU compute utilization (<70%) -- High memory bandwidth utilization (>60%) -- Many small operations -- Frequent memory transfers - -**Signs of Compute-Bound Workload:** -- High GPU compute utilization (>80%) -- Low memory bandwidth utilization (<50%) -- Large matrix multiplications dominate -- Good arithmetic intensity - -### 5.5 Memory Optimization Opportunities - -Based on your analysis, rank these optimizations: - -**1. 
Flash Attention** -- **Impact:** Reduces attention memory from O(S^2) to O(S) -- **Benefit:** Enables much longer sequences -- **When:** Always beneficial for S > 512 -- **Rank:** _____ (1-4) - -**2. Gradient Checkpointing** -- **Impact:** Trades compute for memory (recompute activations) -- **Benefit:** Reduces activation memory by ~2-4x -- **When:** Memory-constrained, willing to sacrifice 20-30% speed -- **Rank:** _____ (1-4) - -**3. Mixed Precision (FP16/BF16)** -- **Impact:** Reduces memory per parameter by 2x -- **Benefit:** Allows 2x larger batch or model -- **When:** Always beneficial if hardware supports it -- **Rank:** _____ (1-4) +From one validated run, the baseline reference numbers were: -**4. Kernel Fusion** -- **Impact:** Reduces intermediate tensor allocations -- **Benefit:** Lower memory footprint, less fragmentation -- **When:** Many small operations (already the case) -- **Rank:** _____ (1-4) +- `291.3 samples/sec` +- `27.5 ms` average batch time +- `434.3 MB` peak memory -### 5.6 Expected Results +## Exercise 1: Establish the baseline -After completing this exercise, you should have: +Run the model once and record: -**Memory Usage Baseline:** -- Peak memory: 2-4 GB (batch_size=8, seq_len=128) -- Memory scaling: ~Linear with batch size, ~Quadratic with sequence -- Memory hotspots: Attention matrices, FFN intermediate tensors -- Bandwidth utilization: 30-60% (memory-bound to mixed) +- average training speed +- average batch time +- peak memory usage -**Key Findings:** +Those are the reference numbers for the later TinyTransformer versions. -1. **Attention Memory Dominates for Long Sequences** - - At S=512, attention alone can consume GBs - - Quadratic scaling makes long sequences expensive - - Flash Attention is critical optimization target +## Exercise 2: Collect a quick hotspot list -2. 
**Memory Fragmentation Observable** - - Peak-to-average ratio often 1.2-1.5x - - Many small allocations create overhead - - Tensor fusion can reduce fragmentation - -3. **Bandwidth Utilization is Moderate** - - Typically 30-60% for baseline PyTorch - - Room for improvement through fusion - - Memory-bound operations limit performance - -4. **Linear Components Well-Behaved** - - FFN and most projections scale linearly - - Predictable memory requirements - - Batch size scaling is efficient - -### 5.7 Troubleshooting - -**Out of Memory Errors:** +Run: ```bash -# Error: RuntimeError: CUDA out of memory -# Solution 1: Reduce batch size -python3 tiny_llama_v1.py --batch-size 2 --seq-len 128 - -# Solution 2: Reduce sequence length -python3 tiny_llama_v1.py --batch-size 8 --seq-len 64 - -# Solution 3: Enable gradient accumulation (if implemented) -python3 tiny_llama_v1.py --batch-size 4 --gradient-accumulation-steps 2 +./get_hotspots.sh ``` -**Memory Profiling Overhead:** +Record the top three kernels by total time. In the validated run, the top entries were GEMM-heavy kernels around `30.8 ms`, `30.1 ms`, and `26.6 ms` of total GPU time. -```bash -# If profiling causes OOM, reduce profiling frequency -python3 tiny_llama_v1.py --profile-steps 2 # Profile fewer steps -``` +## Exercise 3: Collect a runtime trace -**Memory Fragmentation:** +Run: ```bash -# Set memory allocator configuration -export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512 - -# Or use expandable segments (PyTorch 2.0+) -export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +./get_trace.sh ``` -### 5.8 Analysis Questions - -Answer these questions based on your results: - -**1. What is the memory scaling behavior?** - - Batch size scaling: [ ] Linear [ ] Superlinear [ ] Sublinear - - Sequence length scaling: [ ] Linear [ ] Quadratic [ ] Cubic - -**2. 
Where is peak memory consumed?** - - [ ] Forward pass (activations) - - [ ] Backward pass (gradients) - - [ ] Optimizer step (parameter updates) - - [ ] Attention computation (attention matrices) - -**3. What is the primary memory optimization target?** - - [ ] Reduce attention memory (Flash Attention) - - [ ] Reduce activation memory (checkpointing) - - [ ] Reduce parameter memory (mixed precision) - - [ ] Reduce fragmentation (kernel fusion) - -**4. Is the workload memory-bound or compute-bound?** - - [ ] Memory-bound (low arithmetic intensity, <10 FLOPS/byte) - - [ ] Compute-bound (high arithmetic intensity, >100 FLOPS/byte) - - [ ] Mixed (moderate arithmetic intensity, 10-100 FLOPS/byte) - -**5. What memory optimization would provide the biggest benefit?** - -Rank by expected impact: -1. _______________________________________ -2. _______________________________________ -3. _______________________________________ -4. _______________________________________ - -### 5.9 Key Takeaways - -**What We Learned:** - -1. **Memory Scaling Patterns:** - - Batch size: Linear (good) - - Sequence length: Between linear and quadratic (attention dominates) - - Peak memory occurs during backward pass or attention computation - -2. **Memory Bottlenecks Identified:** - - Attention matrices: O(S^2) memory usage - - Intermediate tensors: FFN activations - - Memory fragmentation from many small allocations - -3. **Bandwidth Utilization:** - - Moderate utilization (30-60%) indicates mixed workload - - Room for optimization through kernel fusion - - Memory bandwidth limits throughput for small models - -4. 
**Optimization Priorities:** - - Flash Attention: Critical for long sequences (S > 512) - - Kernel fusion: Reduces fragmentation and bandwidth pressure - - Mixed precision: 2x memory reduction, always beneficial - -**Next Steps:** - -- Document memory analysis in results file -- Compare memory patterns across configurations -- Identify top 3 memory optimization targets -- Understand memory-compute trade-offs -- Proceed to Exercise 3 for bottleneck identification +Open the resulting `.pftrace` file in Perfetto: -**Exercise Complete When:** - -- [ ] Memory profiling completed for multiple batch sizes -- [ ] Sequence length scaling analyzed -- [ ] Memory hotspots identified -- [ ] Bandwidth utilization calculated -- [ ] Optimization priorities ranked -- [ ] Ready to proceed to bottleneck identification - ---- - -## 6. Exercise 3: Performance Study Across Problem Sizes - -### 6.1 Objective - -Learn how model performance scales with different problem sizes by using the automated performance study launcher. 
This exercise demonstrates: - -- How performance varies across tiny to very large model configurations -- Scaling characteristics of attention and FFN operations -- Memory and compute requirements for different model sizes -- How to establish performance baselines for optimization comparisons - -**Time Required:** 15-30 minutes (depending on problem sizes tested) - -### 6.2 Understanding the Performance Study Script - -The `launch_performance_study.sh` script provides pre-configured problem sizes: - -| Size | Hidden Dim | Layers | Seq Len | Batch | Params | Expected Time | -|------|-----------|--------|---------|-------|--------|---------------| -| **tiny** | 256 | 4 | 128 | 8 | ~2.9M | <5s/iter | -| **small** | 512 | 8 | 256 | 8 | ~20.9M | 10-30s/iter | -| **medium** | 1024 | 12 | 512 | 16 | ~167M | 30-60s/iter | -| **large** | 2048 | 16 | 1024 | 8 | ~1.3B | 1-3min/iter | -| **very_large** | 4096 | 24 | 2048 | 4 | ~10.7B | 5-10min/iter | - -**Script Features:** -- Automatic configuration based on problem size -- Output organization with timestamps -- Configuration metadata in JSON format -- Optional profiler integration -- Performance metrics extraction -- Next steps guidance - -### 6.3 Step-by-Step Instructions - -#### Step 1: Run Tiny Problem Size (Quick Validation) - -Start with the smallest size to verify everything works: - -```bash -cd ~/castille-ai-workshop-training/version1_pytorch_baseline/ - -# Run tiny problem size (fast validation) -./launch_performance_study.sh tiny -``` - -**Expected Output:** +```text +https://ui.perfetto.dev/ ``` -================================================================================ -CASTILLE AI WORKSHOP - VERSION 1 BASELINE PERFORMANCE STUDY -================================================================================ - -Problem Size: TINY -Configuration: - Hidden Dimension: 256 - Number of Layers: 4 - Sequence Length: 128 - Batch Size: 8 - Training Steps: 50 - Est. 
Parameters: ~2.9M - Expected Time: <5s/iter - Profilers Enabled: false -Output Directory: performance_results_tiny_20251014_123456 -================================================================================ +Identify: -Starting V1 Baseline training... -... -================================================================================ -PERFORMANCE STUDY COMPLETE -================================================================================ -Total Runtime: 42s -Throughput: 95.2 samples/sec -Peak Memory: 342 MB -``` - -**Observe:** -- Quick completion time -- Low memory usage -- Baseline throughput metrics +- host launches +- forward-pass kernels +- backward-pass kernels +- visible synchronization points -#### Step 2: Run Medium Problem Size (Workshop Standard) +## Exercise 4: Collect the full kernel trace -Test the standard workshop configuration: +Run: ```bash -# Run medium problem size with profiling enabled -./launch_performance_study.sh medium --enable-profilers +./get_counters.sh ``` -**Note:** This will take longer (5-10 minutes) due to profiling overhead. - -**Expected Characteristics:** -- Longer runtime per iteration -- Higher memory usage -- More realistic model size for workshops -- Profiling data generated for analysis - -#### Step 3: Compare Problem Sizes +Record: -Run multiple sizes to observe scaling: +- total GPU time +- dispatch count +- top kernels by time -```bash -# Run small size -./launch_performance_study.sh small - -# Run medium size (if not done in Step 2) -./launch_performance_study.sh medium - -# Optional: Run large (if you have time and memory) -# WARNING: This requires significant GPU memory (>16GB) -# ./launch_performance_study.sh large -``` - -#### Step 4: Analyze Results - -Each run creates a timestamped output directory. 
Examine the results: +If the result is a ROCm 7.x database, summarize it with: ```bash -# List all performance study results -ls -lt performance_results_*/ - -# View latest tiny run configuration -cat performance_results_tiny_*/config.json - -# View training output -cat performance_results_tiny_*/training_output.log - -# Compare throughput across sizes -echo "=== Throughput Comparison ===" -for dir in performance_results_*/; do - size=$(basename "$dir" | cut -d'_' -f3) - throughput=$(grep "Throughput:" "$dir/training_output.log" | tail -1 | awk '{print $2, $3}') - echo "$size: $throughput" -done - -# Compare memory usage -echo "" -echo "=== Memory Usage Comparison ===" -for dir in performance_results_*/; do - size=$(basename "$dir" | cut -d'_' -f3) - memory=$(grep "Peak memory usage:" "$dir/training_output.log" | tail -1 | awk '{print $4, $5}') - echo "$size: $memory" -done +rocpd2csv -i -o kernel_stats.csv +rocpd summary -i --region-categories KERNEL ``` -#### Step 5: Record Scaling Observations - -Create a comparison table from your results: - -**Performance Scaling:** - -| Problem Size | Parameters | Throughput (samples/s) | Memory (MB) | Time/Iter (s) | -|--------------|-----------|------------------------|-------------|---------------| -| tiny | ~2.9M | _________ | _________ | _________ | -| small | ~20.9M | _________ | _________ | _________ | -| medium | ~167M | _________ | _________ | _________ | - -**Scaling Analysis:** - -1. **Throughput Scaling:** - - Does throughput decrease linearly with model size? - - At what size does GPU become saturated? - - How does batch size affect throughput? - -2. **Memory Scaling:** - - Is memory scaling proportional to parameter count? - - Where does attention memory become significant? - - What's the memory overhead ratio? - -3. **Compute Characteristics:** - - Which size achieves best GPU utilization? - - How does arithmetic intensity change? - - Is the workload memory-bound or compute-bound? 
- -### 6.4 Understanding Scaling Patterns - -**Expected Scaling Behavior:** - -**1. Parameter Count Scaling:** -- Linear layers: Scale with D² (hidden dimension squared) -- Attention: Scales with D² for projections, S² for computation -- FFN: Scales with D × D_ff (typically D × 4D) - -**2. Memory Scaling:** -- Parameters: Linear with model size -- Activations: Linear with batch size, quadratic with sequence length -- Peak memory: Dominated by activations for large sequences - -**3. Compute Scaling:** -- FLOPs: Proportional to parameters × sequence length × batch size -- Time per iteration: Depends on GPU utilization -- Throughput: Inversely related to FLOPs per sample - -**4. GPU Utilization:** -- Small models: Memory-bound, low GPU utilization -- Medium models: Mixed workload, moderate utilization -- Large models: Compute-bound, high GPU utilization - -### 6.5 Expected Results +## Exercise 5: Hardware metrics -After completing this exercise, you should observe: - -**Tiny → Small Transition (2.9M → 20.9M):** -- Parameter increase: ~7x -- Memory increase: ~5-8x -- Throughput decrease: ~3-5x -- GPU utilization: Still relatively low - -**Small → Medium Transition (20.9M → 167M):** -- Parameter increase: ~8x -- Memory increase: ~6-10x (sequence length doubles!) -- Throughput decrease: ~5-10x -- GPU utilization: Significantly improved - -**Key Observations:** - -1. **Quadratic Attention Cost Visible:** - - Medium (seq_len=512) shows significant attention overhead vs small (seq_len=256) - - Memory increases faster than linear due to S² term - - This motivates Flash Attention optimization - -2. **Batch Size Impact:** - - Medium uses batch_size=16 vs 8 for small/large - - Better GPU utilization with larger batches - - Memory-throughput trade-off visible - -3. **Memory Becomes Limiting:** - - Large/very_large reduce batch size to fit in memory - - Attention matrices consume significant memory at long sequences - - Gradient checkpointing would be beneficial - -4. 
**Compute Patterns:** - - Larger models approach compute-bound regime - - Better GPU utilization percentage - - GEMM operations dominate more clearly - -### 6.6 Profiling Analysis (If Enabled) - -If you ran with `--enable-profilers`, analyze the generated profiles: +Run: ```bash -# Navigate to profiled run -cd performance_results_medium_*/ - -# View performance summary -cat performance_summary.json | python3 -m json.tool - -# Check for profiler outputs -ls -lh pytorch_profiles/ +./get_rocprof_compute.sh ``` -**Compare profiling results across sizes:** -- How does kernel distribution change? -- Which operations dominate in small vs large models? -- How does memory bandwidth utilization scale? - -### 6.7 Troubleshooting - -**Out of Memory Error:** - -```bash -# Error: RuntimeError: CUDA out of memory. Tried to allocate X.XX GiB - -# Solution 1: Try the next smaller size -./launch_performance_study.sh small # instead of medium +On supported Instinct GPUs, use the printed `rocprof-compute analyze` sequence. On unsupported GPUs, the script exits cleanly and you can continue with the trace-based exercises. 
-# Solution 2: Skip large/very_large on limited hardware -# These sizes require >16GB GPU memory -``` - -**Slow Execution:** +Questions to answer: -```bash -# If profiling is too slow, disable it -./launch_performance_study.sh medium # without --enable-profilers +- does the dominant dispatch look memory bound or compute bound +- is occupancy likely to matter +- does the report agree with the hotspot list -# Reduce number of steps for faster results (edit script or run directly) -python tiny_llama_v1.py --hidden-dim 1024 --num-layers 12 --seq-len 512 \ - --batch-size 16 --num-steps 20 # Reduced from 100 -``` +## Exercise 6: Optional system trace -**Script Permission Denied:** +Run: ```bash -# Make script executable -chmod +x launch_performance_study.sh - -# Then run -./launch_performance_study.sh tiny +./get_rocprof_sys.sh ``` -### 6.8 Analysis Questions - -Answer these based on your performance study results: - -**1. Scaling Characteristics:** - -Q: How does throughput scale with model size? -A: _________________________________________________________________ - -Q: At what model size does GPU utilization peak? -A: _________________________________________________________________ - -Q: Which component (attention vs FFN) dominates compute time? -A: _________________________________________________________________ - -**2. Memory Patterns:** - -Q: How does memory scale with sequence length? (linear, quadratic, other?) -A: _________________________________________________________________ - -Q: What is the memory overhead ratio (peak / parameters)? -A: _________________________________________________________________ - -Q: At what point does attention memory become significant? -A: _________________________________________________________________ - -**3. Performance Optimization:** - -Q: Which model size would benefit most from Flash Attention? -A: _________________________________________________________________ - -Q: Which size is most memory-bound vs compute-bound? 
-A: _________________________________________________________________ - -Q: What batch size would you recommend for medium model? -A: _________________________________________________________________ - -**4. Practical Insights:** - -Q: What's the largest model you can train on your GPU? -A: _________________________________________________________________ - -Q: How would you improve throughput for the medium model? -A: _________________________________________________________________ - -Q: What's the optimal problem size for this workshop? -A: _________________________________________________________________ - -### 6.9 Key Takeaways - -**1. Problem Size Dramatically Affects Performance:** -- Small models: Memory-bound, low GPU utilization -- Large models: Compute-bound, high GPU utilization -- Medium models: Sweet spot for learning optimizations - -**2. Attention Memory Scales Quadratically:** -- Visible impact when comparing seq_len=256 vs 512 vs 1024 -- Flash Attention is critical for long sequences -- Memory becomes limiting factor before compute - -**3. Batch Size is a Key Tuning Parameter:** -- Larger batches improve GPU utilization -- Memory constraints force smaller batches for large models -- Trade-off between throughput and memory usage - -**4. Automated Testing is Valuable:** -- Pre-configured sizes reduce manual configuration errors -- Consistent testing methodology across problem sizes -- Easy to reproduce and compare results - -**5. 
Scaling Informs Optimization Strategy:** -- Tiny models: Not worth optimizing (I/O bound) -- Small-medium: Kernel fusion, mixed precision beneficial -- Large: Flash Attention, gradient checkpointing critical - -**Next Steps:** - -- Review all performance study results -- Document scaling patterns in your notes -- Identify which optimizations would have most impact -- Use baseline results to measure optimization improvements -- Proceed to comparative analysis with optimized versions - -**Exercise Complete When:** - -- [ ] At least 2 problem sizes tested (tiny + one other) -- [ ] Scaling patterns documented -- [ ] Memory and throughput metrics recorded -- [ ] Performance characteristics understood -- [ ] Optimization priorities identified -- [ ] Ready to compare with optimized versions +This script uses a smaller default step count than the other profiling scripts. Open the generated `.proto` file in Perfetto and use it when the interaction between Python, libraries, and GPU execution matters more than kernel timing alone. ---- +## Exercise 7: Compare with the next version -**Next Exercise:** Exercise 4 - Comparative Analysis with Optimized Versions +Move to `../version2_pytorch_fused` and repeat the same sequence. The comparison is more useful than any single run in isolation. ---- +## Closing note +If only a short session is available, Exercises 1 through 4 are enough. That gives a complete path from baseline run to hotspot list to runtime trace to full kernel trace. diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md index 7c1f20d3..d9738c5d 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md @@ -1,666 +1,187 @@ +# TinyTransformer Version 1: PyTorch Baseline -# Version 1: PyTorch Baseline - Profiling Foundation +This is the reference training path for the TinyTransformer progression. 
Start here, collect the baseline measurements, and then compare every later version against this directory. -README.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline` in the Training Examples repository +## Environment -## Overview +Load the required modules: -Version 1 establishes the profiling foundation for the workshop using a standard PyTorch implementation of Tiny LLaMA. This version focuses on comprehensive performance characterization using PyTorch native profiling and DeepSpeed FLOPS profiler, providing the baseline measurements for all subsequent optimizations. - -## Learning Objectives - -After completing this version, you will be able to: - -- Configure deterministic execution for reproducible profiling -- Use PyTorch Profiler for detailed operator-level analysis -- Integrate DeepSpeed FLOPS profiler for computational efficiency metrics -- Interpret profiling results and identify performance bottlenecks -- Establish baseline performance metrics for optimization comparison - -## Architecture Overview - -This implementation uses the standard transformer architecture with: - -- **Multi-Head Attention**: Standard scaled dot-product attention -- **Feed-Forward Network**: SwiGLU activation with separate gate/up projections -- **Layer Normalization**: RMSNorm for improved training stability -- **Position Embeddings**: Rotary Position Embeddings (RoPE) - -### Model Configuration - -```python -# Default Tiny LLaMA Configuration -vocab_size = 1000 # Small vocabulary for workshop -hidden_size = 256 # Model dimension -num_layers = 4 # Transformer layers -num_attention_heads = 8 # Attention heads -intermediate_size = 512 # FFN dimension -max_sequence_length = 128 # Context window +```bash +module load pytorch rocm ``` -## Implementation Details - -### Mathematical Implementation - -This section provides detailed implementation specifics for the baseline PyTorch model. 
For complete mathematical foundations, see [TINY_LLAMA_ARCHITECTURE.md](../TINY_LLAMA_ARCHITECTURE.md). - -#### Standard PyTorch Attention Implementation - -The baseline attention mechanism follows standard PyTorch patterns: - -```python -def attention_forward(self, hidden_states, attention_mask=None): - batch_size, seq_len, _ = hidden_states.size() - - # Linear projections (separate operations - optimization target!) - query = self.q_proj(hidden_states) # [B, S, D] -> [B, S, D] - key = self.k_proj(hidden_states) # [B, S, D] -> [B, S, D] - value = self.v_proj(hidden_states) # [B, S, D] -> [B, S, D] - - # Reshape for multi-head attention - query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - key = key.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - value = value.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) +The profiling scripts use the same default workload: - # Apply rotary position embeddings - query, key = self.rotary_emb(query, key, seq_len) +- batch size `8` +- sequence length `128` +- training steps `10` - # Scaled dot-product attention - O(S^2) memory complexity - attn_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim) +`get_rocprof_sys.sh` uses a smaller default step count so the system trace stays manageable. All scripts accept overrides through `TINYTRANSFORMER_BATCH_SIZE`, `TINYTRANSFORMER_SEQ_LEN`, `TINYTRANSFORMER_NUM_STEPS`, and `TINYTRANSFORMER_EXTRA_ARGS`. 
- if attention_mask is not None: - attn_weights = attn_weights + attention_mask +## Baseline run - # Softmax over last dimension - attn_weights = F.softmax(attn_weights, dim=-1) +Run the model once before profiling: - # Apply attention to values - attn_output = torch.matmul(attn_weights, value) - - # Reshape and project output - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(batch_size, seq_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - return attn_output +```bash +python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 ``` -**Performance Characteristics:** -- **3 separate linear projections**: Creates kernel launch overhead -- **Attention matrix materialization**: $S \times S \times H$ memory usage -- **Multiple tensor reshapes**: Memory layout inefficiencies -- **Sequential operations**: Limited parallelization opportunities - -#### SwiGLU Feed-Forward Implementation - -```python -def swiglu_forward(self, hidden_states): - # Separate gate and up projections (optimization target!) 
- gate = self.gate_proj(hidden_states) # [B, S, D] -> [B, S, D_ff] - up = self.up_proj(hidden_states) # [B, S, D] -> [B, S, D_ff] - - # SiLU activation (Swish) - gate_activated = F.silu(gate) # Element-wise operation - - # Element-wise multiplication - intermediate = gate_activated * up # [B, S, D_ff] +Example output from one validated run: - # Down projection - output = self.down_proj(intermediate) # [B, S, D_ff] -> [B, S, D] - - return output +```text +Performance Summary: + Average training speed: 291.3 samples/sec + Throughput: 37282 tokens/sec + Average batch time: 27.5 ms + Average forward time: 7.7 ms + Average backward time: 14.8 ms + Average optimizer time: 5.0 ms + Peak memory usage: 434.3 MB ``` -**Optimization Opportunities:** -- **Separate gate/up projections**: Can be fused into single GEMM -- **Intermediate tensor storage**: Memory overhead for gate_activated and up -- **Sequential activation**: SiLU can be fused with multiplication - -#### RMSNorm Implementation +These are the reference numbers to compare with versions 2 through 4. 
-```python -def rms_norm_forward(self, hidden_states): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return (self.weight * hidden_states).to(input_dtype) -``` +## Quick hotspot summary -**Implementation Details:** - -- **Variance computation**: Single reduction operation -- **Epsilon for numerical stability**: Prevents division by zero -- **Mixed precision handling**: Maintains numerical precision - -### Operator-Level Performance Analysis - -#### FLOP Breakdown by Operation Type - -```python -# Per transformer layer FLOP count (batch_size=1, seq_len=128) -FLOPS_BREAKDOWN = { - 'q_proj': seq_len * hidden_dim * hidden_dim, # 128 * 256 * 256 = 8.4M - 'k_proj': seq_len * hidden_dim * hidden_dim, # 128 * 256 * 256 = 8.4M - 'v_proj': seq_len * hidden_dim * hidden_dim, # 128 * 256 * 256 = 8.4M - 'attn_scores': seq_len * seq_len * hidden_dim, # 128 * 128 * 256 = 4.2M - 'attn_output': seq_len * seq_len * hidden_dim, # 128 * 128 * 256 = 4.2M - 'o_proj': seq_len * hidden_dim * hidden_dim, # 128 * 256 * 256 = 8.4M - 'gate_proj': seq_len * hidden_dim * intermediate_dim, # 128 * 256 * 512 = 16.8M - 'up_proj': seq_len * hidden_dim * intermediate_dim, # 128 * 256 * 512 = 16.8M - 'down_proj': seq_len * intermediate_dim * hidden_dim, # 128 * 512 * 256 = 16.8M - 'rms_norm': 2 * seq_len * hidden_dim, # 2 * 128 * 256 = 65K -} - -# Total per layer: ~92.1M FLOPs -# Total model (4 layers): ~368M FLOPs per forward pass -``` +Run: -#### Memory Access Patterns - -```python -# Memory bandwidth requirements per operation -MEMORY_BREAKDOWN = { - 'attention_qkv': { - 'parameters': 3 * hidden_dim * hidden_dim * 4, # 3 * 256^2 * 4B = 786KB - 'activations': seq_len * hidden_dim * 4, # 128 * 256 * 4B = 131KB - 'attention_matrix': seq_len * seq_len * num_heads * 4, # 128^2 * 8 * 4B = 524KB - 'bandwidth_requirement': 'memory-bound' # Limited by memory 
access - }, - 'feed_forward': { - 'parameters': 3 * hidden_dim * intermediate_dim * 4, # 3 * 256 * 512 * 4B = 1.57MB - 'activations': seq_len * intermediate_dim * 4, # 128 * 512 * 4B = 262KB - 'bandwidth_requirement': 'compute-bound' # Good arithmetic intensity - } -} -``` - -#### Kernel Launch Analysis - -The baseline implementation generates numerous kernel launches per forward pass: - -```python -# Typical kernel count per transformer layer -KERNEL_LAUNCHES = { - 'attention_block': { - 'q_projection': 1, # Linear layer - 'k_projection': 1, # Linear layer - 'v_projection': 1, # Linear layer - 'rope_application': 2, # For query and key - 'attention_computation': 3, # QK^T, softmax, attention*V - 'output_projection': 1, # Linear layer - 'residual_add': 1, # Element-wise addition - 'subtotal': 10 - }, - 'ffn_block': { - 'rms_norm': 1, # Normalization - 'gate_projection': 1, # Linear layer - 'up_projection': 1, # Linear layer - 'silu_activation': 1, # Element-wise SiLU - 'element_multiply': 1, # gate * up - 'down_projection': 1, # Linear layer - 'residual_add': 1, # Element-wise addition - 'subtotal': 7 - }, - 'layer_total': 17, # Per transformer layer - 'model_total': 68 # 4 layers * 17 kernels/layer -} -``` - -**Optimization Implications:** - -- **High kernel launch overhead**: 68+ kernels create GPU scheduling overhead -- **Memory bandwidth underutilization**: Many small operations -- **Fusion opportunities**: Adjacent operations can be combined - -### Profiling Data Interpretation - -#### PyTorch Profiler Output Analysis - -When analyzing PyTorch profiler results, focus on these key metrics: - -```python -# Key profiler metrics to examine -PROFILER_METRICS = { - 'operator_timing': { - 'aten::linear': 'Matrix multiplication operations', - 'aten::softmax': 'Attention softmax computation', - 'aten::add_': 'Residual connections', - 'aten::mul': 'Element-wise operations', - 'aten::rsqrt': 'RMSNorm operations' - }, - 'memory_analysis': { - 'peak_memory': 'Maximum GPU 
memory allocation', - 'memory_timeline': 'Memory usage over time', - 'fragmentation': 'Memory layout efficiency' - }, - 'gpu_utilization': { - 'kernel_efficiency': 'Individual kernel performance', - 'sm_efficiency': 'Streaming multiprocessor usage', - 'memory_bandwidth': 'Memory subsystem utilization' - } -} -``` - -#### Expected Bottleneck Patterns - -Based on the implementation analysis, expect these bottlenecks: - -```python -EXPECTED_BOTTLENECKS = { - 'attention_computation': { - 'percentage_of_time': '35-45%', - 'primary_issue': 'O(S^{2}) memory complexity', - 'kernel_count': '10 per layer', - 'optimization_target': 'Flash Attention + QKV fusion' - }, - 'feed_forward_network': { - 'percentage_of_time': '30-40%', - 'primary_issue': 'Separate gate/up projections', - 'kernel_count': '7 per layer', - 'optimization_target': 'SwiGLU fusion' - }, - 'layer_normalization': { - 'percentage_of_time': '8-12%', - 'primary_issue': 'Memory-bound operation', - 'kernel_count': '2 per layer', - 'optimization_target': 'Kernel fusion with adjacent ops' - }, - 'residual_connections': { - 'percentage_of_time': '5-8%', - 'primary_issue': 'Memory bandwidth limitation', - 'kernel_count': '2 per layer', - 'optimization_target': 'Fusion with preceding operations' - } -} +```bash +./get_hotspots.sh ``` -### Code Walkthrough: Critical Performance Paths - -#### Attention Hot Path Analysis - -```python -# Performance-critical code path in attention forward pass -@profile_function("attention_forward") # PyTorch profiler annotation -def forward(self, hidden_states, attention_mask=None, position_ids=None): - bsz, q_len, _ = hidden_states.size() - - # BOTTLENECK 1: Separate linear projections (3 kernel launches) - with nvtx.range("qkv_projections"): - query_states = self.q_proj(hidden_states) # Kernel launch 1 - key_states = self.k_proj(hidden_states) # Kernel launch 2 - value_states = self.v_proj(hidden_states) # Kernel launch 3 - - # Reshape for attention heads - query_states = 
query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) +The script collects `rocprofv3 --kernel-trace --stats` and prints the top rows from the generated `*_kernel_stats.csv`. Example excerpt: - # BOTTLENECK 2: Attention computation (O(S^2) memory) - with nvtx.range("attention_computation"): - # Attention scores: [bsz, num_heads, q_len, kv_seq_len] - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - # BOTTLENECK 3: Softmax (memory-bound) - attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - - # BOTTLENECK 4: Attention application - attn_output = torch.matmul(attn_weights, value_states) - - # Reshape and output projection - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) # Kernel launch 4 - - return attn_output, attn_weights +```text +"Name","Calls","TotalDurationNs","AverageNs","Percentage" +"Cijk_Alik_Bljk_SB_MT128x256x16_...",240,30763234,128180,8.79 +"Cijk_Ailk_Bljk_SB_MT128x64x8_...",240,30083168,125347,8.59 +"Cijk_Alik_Bljk_SB_MT128x128x16_...",360,26609605,73916,7.60 ``` -**Profiling Annotations:** - -- `@profile_function`: Enables detailed timing analysis -- `nvtx.range()`: Creates named regions in profiler traces -- Performance counters will show exact kernel timing - -## Workshop Exercises +For this baseline, the first pass is simple: identify the dominant GEMM and elementwise kernels, then compare that list with later versions. 
-### Exercise 1: Baseline Performance Analysis +The figure below comes from the validated container run used for this tutorial: -**Objective**: Establish baseline performance metrics and identify computational bottlenecks. +![TinyTransformer V1 hotspot summary from validated container run](../images/tinytransformer_version1_hotspots.png) -#### Step 1: Run Basic Training -```bash -# Basic training without profiling -python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 +## Runtime trace -# Expected output: Training loss progression and timing info -``` +Run: -#### Step 2: Enable PyTorch Profiler ```bash -# Make directory for the profiles -mkdir pytorch_profiles -# Run with PyTorch profiler enabled -python tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 10 \ - --enable-pytorch-profiler \ - --profile-dir ./pytorch_profiles - -# This generates detailed profiling traces in pytorch_profiles/ +./get_trace.sh ``` -#### Step 3: Analyze Profiling Results -```bash -# Launch TensorBoard to visualize profiles -tensorboard --logdir pytorch_profiles --port 6006 +Example success output: -# Or generate text report -python run_pytorch_profiler.py --analyze-existing pytorch_profiles/profile_*.json +```text +Profiling complete! Results saved to: profiling_results/trace_ +Perfetto trace file: .../28830_results.pftrace +Open it in Perfetto UI: https://ui.perfetto.dev/ ``` -**Expected Analysis Results:** +The script now reports the largest generated Perfetto trace in the output tree, which avoids the small side traces that can also appear in the same run directory. 
-- Attention operations consuming ~40% of compute time
-- Matrix multiplications (GEMM) as primary compute kernels
-- Memory transfer overhead between operations
-- GPU utilization patterns
+If your ROCm stack produces a database instead of a `.pftrace`, convert it with:
 
-#### Step 4: DeepSpeed FLOPS Analysis
 ```bash
-# Run with DeepSpeed FLOPS profiler
-python run_deepspeed_flops.py \
-    --batch-size 8 \
-    --seq-len 128 \
-    --num-steps 10
-
-# Analyze computational intensity
-python run_deepspeed_flops.py --analyze-results flops_profile.json
+rocpd2pftrace -i <results_db> -o trace.pftrace
 ```
-
-**Expected FLOPS Analysis:**
-
-- Total FLOPS per forward/backward pass
-- FLOPS breakdown by operation type
-- Model FLOPS Utilization (MFU) calculation
-- Memory bandwidth requirements
-
-### Exercise 2: Memory Analysis and Optimization
-
-**Objective**: Understand memory usage patterns and bandwidth requirements.
+## Full kernel trace
 
-#### Step 1: Memory Profiling
-```bash
-# Run with memory profiling enabled
-python tiny_llama_v1.py \
-    --batch-size 8 \
-    --seq-len 128 \
-    --enable-pytorch-profiler \
-    --profile-memory \
-    --profile-dir ./memory_analysis
-
-# Generate memory timeline visualization
-python -c "
-import torch
-from torch.profiler import profile, record_function, ProfilerActivity
-# Memory analysis code will be embedded in tiny_llama_v1.py
-"
-```
+Run:
 
-#### Step 2: Batch Size Scaling
 ```bash
-# Test different batch sizes
-for bs in 4 8 16 32; do
-  echo \"Testing batch size: \$bs\"
-  python tiny_llama_v1.py \
-    --batch-size \$bs \
-    --seq-len 128 \
-    --num-steps 5 \
-    --enable-pytorch-profiler \
-    --profile-dir ./scaling_bs\$bs
-done
-
-# Analyze scaling behavior
-python analyze_batch_scaling.py --profile-dirs scaling_bs*
+./get_counters.sh
 ```
 
-**Expected Memory Analysis:**
-
-- Memory usage scaling with batch size
-- Peak memory allocation points
-- Memory fragmentation patterns
-- Opportunities for memory optimization
-
-### Exercise 3: Bottleneck Identification
+Example success output:
 
-**Objective**: Identify computational and memory bottlenecks for optimization targets.
-
-#### Step 1: Operator-Level Analysis
-```bash
-# Detailed operator timing
-python tiny_llama_v1.py \
-    --batch-size 8 \
-    --seq-len 128 \
-    --enable-pytorch-profiler \
-    --profile-operators \
-    --sort-by cuda_time_total
-
-# Generate bottleneck report
-python analyze_bottlenecks.py \
-    --profile-data pytorch_profiles/ \
-    --output-report bottlenecks_v1.md
+```text
+Kernel trace CSV: .../29490_kernel_trace.csv
+Agent info CSV: .../29490_agent_info.csv
 ```
 
-#### Step 2: Attention Pattern Analysis
-```bash
-# Focus on attention computation
-python tiny_llama_v1.py \
-    --batch-size 8 \
-    --seq-len 128 \
-    --profile-attention-only \
-    --enable-pytorch-profiler
-```
+On ROCm 7.x, the main output may be a database. Useful follow-up commands are:
 
-#### Step 3: Matrix Multiplication Analysis
 ```bash
-# GEMM operation profiling
-python analyze_gemm_operations.py \
-    --model-config tiny_llama_v1_config.yaml \
-    --batch-sizes \"4,8,16,32\" \
-    --sequence-lengths \"64,128,256\"
+rocpd2csv -i <results_db> -o kernel_stats.csv
+rocpd summary -i <results_db> --region-categories KERNEL
 ```
 
-**Expected Bottleneck Analysis:**
+The first quantities to record are total GPU time, dispatch count, unique kernel count, and the top kernels by total duration.
-- Attention QKV projection overhead -- Softmax computation inefficiency -- Multiple small GEMM operations -- Memory-bound operations identification +## Hardware metrics -## Profiling Tools Integration +Run: -### PyTorch Profiler Configuration - -The implementation includes comprehensive PyTorch profiler integration: - -```python -# In tiny_llama_v1.py -from torch.profiler import profile, record_function, ProfilerActivity - -# Profiler configuration -profiler_config = { - 'activities': [ProfilerActivity.CPU, ProfilerActivity.CUDA], - 'record_shapes': True, - 'profile_memory': True, - 'with_stack': True, - 'with_flops': True, - 'experimental_config': torch._C._profiler._ExperimentalConfig(verbose=True) -} +```bash +./get_rocprof_compute.sh ``` -### DeepSpeed FLOPS Profiler Integration +On supported Instinct GPUs, the script collects `rocprof-compute` data and prints the follow-up analysis flow: -```python -# FLOPS profiler setup -from deepspeed.profiling.flops_profiler import FlopsProfiler - -profiler = FlopsProfiler(model) -profiler.start_profile() -# Training step -profiler.stop_profile() -profiler.print_model_profile(profile_step=1) +```bash +rocprof-compute analyze -p --list-stats +rocprof-compute analyze -p --dispatch +rocprof-compute analyze -p --dispatch --block 2.1.15 6.2.7 +rocprof-compute analyze -p --dispatch --block 16.1 17.1 ``` -## Key Performance Metrics - -### Baseline Performance Expectations - -On a typical AMD MI200 series GPU: - -| Metric | Expected Range | Notes | -|--------|----------------|-------| -| **Training Speed** | 50-100 samples/sec | Batch size dependent | -| **GPU Utilization** | 60-75% | Standard PyTorch efficiency | -| **Memory Usage** | 2-4 GB | Model + batch data | -| **FLOPS Utilization** | 30-45% | Baseline MFU | -| **Memory Bandwidth** | 40-60% | Memory-bound operations | - -### Profiling Output Files - -After running exercises, expect these output files: +On unsupported GPUs, the script exits cleanly. 
Example output: +```text +Skipping rocprof-compute profiling for TinyTransformer V1... +Detected GPU architecture: gfx1100 +rocprof-compute hardware-counter collection currently requires a supported Instinct GPU +Use get_trace.sh, get_hotspots.sh, or get_counters.sh on this system instead. ``` -version1_pytorch_baseline/ -├── pytorch_profiles/ -│ ├── profile_*.json # PyTorch profiler traces -│ ├── trace_*.json # Chrome trace format -│ └── memory_timeline.html # Memory usage visualization -├── flops_analysis/ -│ ├── flops_profile.json # FLOPS breakdown -│ ├── model_profile.txt # Detailed model analysis -│ └── mfu_analysis.csv # Model FLOPS Utilization -└── bottleneck_analysis/ - ├── bottlenecks_v1.md # Comprehensive bottleneck report - ├── operator_timing.csv # Per-operator performance - └── optimization_targets.json # Prioritized optimization opportunities -``` - -## Expected Analysis Results - -### Performance Characteristics - -1. **Compute Distribution**: - - Attention operations: ~40% of total time - - Feed-forward network: ~35% of total time - - Layer normalization: ~10% of total time - - Other operations: ~15% of total time - -2. **Memory Patterns**: - - Peak memory usage during attention computation - - Multiple intermediate tensor allocations - - Memory fragmentation from varying tensor sizes - -3. **Optimization Opportunities**: - - Kernel fusion potential in attention - - Memory layout optimization - - Reduced intermediate tensor creation - -### Bottleneck Identification - -Primary bottlenecks to address in subsequent versions: -1. **Separate QKV projections** → Fusion opportunity -2. **Standard attention computation** → Flash Attention -3. **Individual FFN gates** → SwiGLU fusion -4. 
**Multiple kernel launches** → Custom kernels +## System trace -## Troubleshooting +Run: -### Common Issues - -#### CUDA/ROCm Memory Errors ```bash -# Reduce batch size if memory errors occur -python tiny_llama_v1.py --batch-size 4 --seq-len 64 +./get_rocprof_sys.sh ``` -#### Profiler Permission Issues -```bash -# Ensure proper permissions for profiling -export ROCPROF_COMPUTE_DISABLE_AQL_DEBUG=1 -``` +This script defaults to `2` training steps so the trace remains practical. Example success output: -#### Missing Profiling Output -```bash -# Check profiling directory permissions -mkdir -p pytorch_profiles -chmod 755 pytorch_profiles +```text +Profiling complete! Results saved to: profiling_results/rocprof_sys_ +Perfetto trace file: .../perfetto-trace-31804.proto +Open it in Perfetto UI: https://ui.perfetto.dev/ ``` -### Performance Validation +On the validated ROCm 6.4 container, `rocprof-sys` also emitted `perf_event_paranoid` warnings and an `RSMI_STATUS_UNEXPECTED_DATA` backtrace before completing. Those messages were noisy, but the script still produced a usable Perfetto trace. -To validate your setup is working correctly: +## Optional framework-level profiling + +The Python driver also exposes framework-level instrumentation: ```bash -# Quick validation run python tiny_llama_v1.py \ - --batch-size 4 \ - --seq-len 64 \ - --num-steps 3 \ + --batch-size 8 \ + --seq-len 128 \ + --num-steps 20 \ --enable-pytorch-profiler \ - --validate-setup - -# Expected: Successful completion with profiling files generated + --profile-dir ./pytorch_profiles \ + --profile-steps 5 ``` -## Next Steps - -After completing all exercises in Version 1: - -1. **Review baseline metrics** - Understand current performance characteristics -2. **Identify optimization targets** - Use bottleneck analysis to prioritize improvements -3. **Prepare for Version 2** - Kernel fusion will address primary bottlenecks -4. 
**Document findings** - Record baseline measurements for comparison - -**Ready for optimization? Proceed to [Version 2: PyTorch Fused](../version2_pytorch_fused/README.md)** - ---- - -## Performance Summary Template - -Use this template to document your Version 1 results: +Open the result with TensorBoard: +```bash +tensorboard --logdir ./pytorch_profiles --port 6006 ``` -# Version 1 Baseline Results -## Configuration +## Workshop sequence -- Batch Size: ___ -- Sequence Length: ___ -- GPU: ___ -- ROCm Version: ___ - -## Performance Metrics - -- Training Speed: ___ samples/sec -- GPU Utilization: ___% -- Memory Usage: ___ GB -- FLOPS Utilization: ___% - -## Top Bottlenecks - -1. _________________ (__% of time) -2. _________________ (__% of time) -3. _________________ (__% of time) - -## Optimization Targets for Version 2 - -- [ ] QKV fusion -- [ ] Flash Attention -- [ ] SwiGLU fusion -- [ ] Other: ___________ -``` +Use [`PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md`](PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md) for a shorter lab sequence built on the same commands. +## References +- rocprofv3: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/develop/how-to/using-rocprofv3.html +- rocpd tools: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/develop/how-to/using-rocpd-output-format.html +- Perfetto UI: https://ui.perfetto.dev/ diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/ROCPROFV3_VERSION1_RESULTS.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/ROCPROFV3_VERSION1_RESULTS.md new file mode 100644 index 00000000..9736c662 --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/ROCPROFV3_VERSION1_RESULTS.md @@ -0,0 +1,66 @@ +# rocprofv3 Test Results - Version 1 Baseline + +ROCPROFV3_VERSION1_RESULTS.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline` in the Training Examples repository. + +## Summary + +rocprofv3 successfully captures profiling data from version1 baseline. 
This document shows example results from runtime trace collection. + +## Test Configuration + +**Command:** + +``` +rocprofv3 --runtime-trace --output-format pftrace -- python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 +``` + +**Environment:** +- ROCm Version: 6.4.x +- PyTorch: ROCm-enabled build +- GPU: AMD Instinct or Radeon with gfx support + +## Example Output + +``` +Training completed: 10 steps, batch size 8 +Performance: 262.3 samples/sec, 33,571 tokens/sec +Memory usage: 434.3 MB peak +rocprofv3 exit code: 0 (success) +``` + +## Generated Files + +Output directory contains Perfetto trace files: + +| File | Size | Description | +|------|------|-------------| +| `_results.pftrace` | ~40-50 MB | Main trace with full profiling data | +| Additional `.pftrace` files | ~600 bytes | Minimal traces from subprocesses | + +The main trace file (largest) contains the full profiling data for timeline analysis. + +## Viewing the Trace + +1. Visit https://ui.perfetto.dev/ +2. Click "Open trace file" +3. Select the main `.pftrace` file +4. Examine: + - GPU kernel timeline + - Memory transfer operations + - HIP API calls + - Kernel duration and overlap + +## Warnings + +The following warning may appear and can be ignored: + +``` +rocprofiler_iterate_agent_supported_counters returned ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED for agent X (gfxXXXX) +``` + +This typically relates to integrated GPUs or unsupported architectures and does not affect profiling of the target GPU. 
+ +## Additional Resources + +- rocprofv3 documentation: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/develop/how-to/using-rocprofv3.html +- Perfetto UI: https://ui.perfetto.dev/ diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md index 1cb9b199..b30e4884 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md @@ -1,53 +1,26 @@ +# Exercise 1: Baseline Performance Analysis -## Exercise 1: Baseline Performance Analysis +exercise_1_baseline_analysis.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises` in the Training Examples repository. -`exercise1_baseline_analysis.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline` in the Training Examples repository +## Objective -### Objective -Establish baseline performance metrics for Tiny LLaMA V1 and understand the profiling methodology that will be used throughout the workshop. +Establish baseline performance metrics for Tiny LLaMA V1 and understand profiling methodology. 
-### Prerequisites +## Step 1: Run Baseline Training -- Completed environment setup from `../setup/` -- Verified environment with validation scripts - -### Duration -**Estimated Time:** 20-30 minutes - -### Instructions - -#### Step 1: Run Baseline Training (5 minutes) - -First, let's run the basic model without any profiling to establish a clean baseline: - -```bash -## Navigate to version1_pytorch_baseline directory +``` cd version1_pytorch_baseline - -## Run basic training python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 20 ``` -**Expected Output:** - +Expected output: - Model configuration summary - Training progress with loss values - Performance metrics (samples/sec, memory usage) -- Final performance summary -**📝 Record the following baseline metrics:** +## Step 2: Enable PyTorch Profiler -- Training speed: _____ samples/sec -- Peak memory usage: _____ MB -- Final loss: _____ -- Average batch time: _____ ms - -#### Step 2: Enable Basic Profiling (10 minutes) - -Now let's add PyTorch profiler to understand what's happening under the hood: - -```bash -# Run with PyTorch profiler enabled +``` mkdir exercise1_profiles python tiny_llama_v1.py \ --batch-size 8 \ @@ -57,199 +30,48 @@ python tiny_llama_v1.py \ --profile-dir ./exercise1_profiles ``` -**Expected Output:** +Profile files will be generated in `./exercise1_profiles/`. -- Same training output as before -- Additional profiling information -- Profile files generated in `./exercise1_profiles/` +## Step 3: Analyze Results -**📝 Answer these questions:** +Launch TensorBoard to visualize profiling results: -1. How much overhead did profiling add to training time? -2. What files were generated in the `exercise1_profiles/` directory? -3. What's the difference in memory usage with profiling enabled? 
- -#### Step 3: Analyze Profiling Results (10 minutes) - -Launch TensorBoard to visualize the profiling results: +``` +tensorboard --logdir ./exercise1_profiles --port 6006 +``` -```bash -## Launch TensorBoard (run in background) -tensorboard --logdir ./exercise1_profiles --port 6006 & +Alternatively, examine JSON traces directly: -## If TensorBoard is not available, examine the JSON traces +``` ls -la ./exercise1_profiles/ ``` -**TensorBoard Analysis:** - -1. Open your browser to `http://localhost:6006` -2. Navigate to the "PROFILE" tab -3. Select the most recent run - -**📝 Explore and document:** - -**Trace Timeline:** - -- What are the top 3 longest-running operations? - 1. _________________ - 2. _________________ - 3. _________________ - -**Operator View:** - -- Which operation consumes the most GPU time? -- What percentage of time is spent in attention operations? -- How many different kernel types are launched? - -**Memory Timeline:** - -- What is the peak memory usage? -- When does peak memory occur (forward/backward pass)? -- Are there any memory spikes or unusual patterns? 
- -#### Step 4: Identify Performance Patterns (5 minutes) - -Based on your analysis, identify patterns in the baseline model: - -**📝 Pattern Analysis:** - -**Compute Patterns:** - -- [ ] Attention operations dominate compute time -- [ ] Matrix multiplications are the primary kernels -- [ ] Many small operations with low utilization -- [ ] Memory transfers visible between operations - -**Memory Patterns:** +## Key Observations -- [ ] Memory usage grows during forward pass -- [ ] Peak memory during attention computation -- [ ] Frequent small allocations -- [ ] Memory fragmentation visible +Typical baseline performance characteristics: +- Training speed: 50-100 samples/sec (varies by hardware) +- GPU utilization: 60-75% +- Memory usage: 2-4 GB depending on batch size +- Kernel count: 40-50 different kernel launches per step -**Optimization Opportunities:** - -Based on the profiling results, which of these optimizations would likely provide the biggest benefit: - -- [ ] Kernel fusion (reduce number of operations) -- [ ] Memory layout optimization -- [ ] Flash Attention implementation -- [ ] Mixed precision training -- [ ] Batch size scaling - -### Expected Results - -After completing this exercise, you should have: - -#### Performance Baseline - -- **Training Speed**: 50-100 samples/sec (varies by hardware) -- **GPU Utilization**: 60-75% (typical for baseline PyTorch) -- **Memory Usage**: 2-4 GB depending on batch size -- **Kernel Count**: 40-50 different kernel launches per step - -#### Key Observations +## Optimization Opportunities +Based on profiling analysis: - Attention operations consume ~40% of total compute time - Matrix multiplications (GEMM) are the dominant kernels - Multiple small operations create kernel launch overhead - Memory allocation patterns show optimization opportunities -#### Profiling Data Generated -``` -exercise1_profiles/ -├── events.out.tfevents.* # TensorBoard events -├── trace_step_*.json # Chrome trace files -├── performance_summary.json 
# Performance metrics -└── [additional profile files] -``` - -### Troubleshooting +## Troubleshooting -#### Common Issues +CUDA/ROCm memory errors: -**1. CUDA/ROCm Memory Errors** -```bash -## Reduce batch size if you get OOM errors +``` python tiny_llama_v1.py --batch-size 4 --seq-len 64 --num-steps 10 ``` -**2. Profiling Files Not Generated** -```bash -## Check permissions and disk space -ls -la ./exercise1_profiles/ -df -h . -``` +Check GPU utilization: -**3. TensorBoard Not Loading** -```bash -## Try different port or check firewall -tensorboard --logdir ./exercise1_profiles --port 6007 -## Or examine JSON files directly -python -c "import json; print(json.load(open('./exercise1_profiles/performance_summary.json')))" ``` - -**4. Low GPU Utilization** -```bash -## Check if GPU is being used -nvidia-smi # for NVIDIA -## or -rocm-smi # for AMD +rocm-smi ``` - -### Analysis Questions - -**📝 Answer these questions based on your results:** - -1. **What is the primary bottleneck in the baseline model?** - - [ ] Memory bandwidth - - [ ] Compute utilization - - [ ] Kernel launch overhead - - [ ] Data loading - -2. **Which operations would benefit most from fusion?** - - [ ] QKV projections in attention - - [ ] Gate/Up projections in SwiGLU - - [ ] Layer normalization operations - - [ ] All of the above - -3. **What is the Model FLOPS Utilization (rough estimate)?** - - [ ] < 20% (memory bound) - - [ ] 20-40% (mixed workload) - - [ ] 40-60% (compute bound) - - [ ] > 60% (highly optimized) - -4. **Based on memory usage patterns, what optimization would help most?** - - [ ] Gradient checkpointing - - [ ] Flash Attention - - [ ] Mixed precision - - [ ] Tensor fusion - -### Next Steps - -After completing this exercise: - -1. **Document your findings** using the performance template in the main README -2. **Compare with expected results** - are your metrics in the expected ranges? -3. **Identify top 3 optimization targets** for Version 2 -4. 
**Proceed to Exercise 2** for memory analysis -5. **Save your profiling data** - you'll compare against Version 2 later - -### Success Criteria - -**Exercise Complete When:** - -- [ ] Baseline training runs successfully -- [ ] Profiling data generated and analyzed -- [ ] Performance metrics documented -- [ ] Bottlenecks identified -- [ ] Ready to proceed to memory analysis - ---- - -**Key Takeaway**: The baseline model provides a solid foundation for optimization. The profiling data clearly shows opportunities for kernel fusion, memory optimization, and attention improvements that will be addressed in subsequent versions. - -**Next Exercise**: [Exercise 2 - Memory Analysis](exercise_2_memory_analysis.md) - - diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_2_memory_analysis.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_2_memory_analysis.md index e35626b7..89a2bc9d 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_2_memory_analysis.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_2_memory_analysis.md @@ -1,42 +1,19 @@ +# Exercise 2: Memory Analysis and Optimization -## Exercise 2: Memory Analysis and Optimization +exercise_2_memory_analysis.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises` in the Training Examples repository. -`exercise2_memory_analysis.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline` in the Training Examples repository +## Objective -### Objective -Understand memory usage patterns, identify memory bottlenecks, and analyze memory bandwidth utilization in the baseline Tiny LLaMA model. +Understand memory usage patterns, identify memory bottlenecks, and analyze memory bandwidth utilization. 
-### Prerequisites +## Step 1: Memory Profiling with Different Batch Sizes -- Completed Exercise 1 -- Basic understanding of GPU memory hierarchy - -### Duration -**Estimated Time:** 25-30 minutes - -### Background - -Memory optimization is crucial for transformer models because: - -- **Memory Bandwidth**: Often the limiting factor for inference -- **Peak Memory**: Determines maximum batch size and model size -- **Memory Fragmentation**: Can reduce effective memory utilization -- **Attention Memory**: Quadratic scaling with sequence length - -### Instructions - -#### Step 1: Memory-Focused Profiling (10 minutes) - -Run profiling with enhanced memory analysis: - -```bash -# Memory profiling with different batch sizes +``` python tiny_llama_v1.py \ --batch-size 4 \ --seq-len 128 \ --num-steps 15 \ --enable-pytorch-profiler \ - --enable-memory-profiling \ --profile-dir ./memory_analysis_bs4 python tiny_llama_v1.py \ @@ -44,7 +21,6 @@ python tiny_llama_v1.py \ --seq-len 128 \ --num-steps 15 \ --enable-pytorch-profiler \ - --enable-memory-profiling \ --profile-dir ./memory_analysis_bs8 python tiny_llama_v1.py \ @@ -52,279 +28,63 @@ python tiny_llama_v1.py \ --seq-len 128 \ --num-steps 15 \ --enable-pytorch-profiler \ - --enable-memory-profiling \ --profile-dir ./memory_analysis_bs16 ``` -**📝 Record memory usage for each batch size:** - -| Batch Size | Peak Memory (MB) | Avg Memory (MB) | Training Speed (samples/sec) | -|------------|------------------|-----------------|------------------------------| -| 4 | | | | -| 8 | | | | -| 16 | | | | - -#### Step 2: Memory Timeline Analysis (10 minutes) +## Step 2: Memory Timeline Analysis -Analyze memory patterns using TensorBoard: +Launch TensorBoard for memory analysis: -```bash -# Launch TensorBoard for memory analysis -tensorboard --logdir ./memory_analysis_bs8 --port 6007 ``` - -In TensorBoard: - -1. Go to the **PROFILE** tab -2. Select **Memory Timeline** view -3. 
Examine the memory usage pattern - -**📝 Memory Pattern Analysis:** - -**Memory Allocation Timeline:** - -- At what point does memory usage peak? ________________ -- What operations cause the largest memory spikes? ________________ -- Are there memory deallocations visible? ________________ - -**Memory Efficiency:** - -- Is memory usage steady or fluctuating? ________________ -- Are there unnecessary memory allocations? ________________ -- What's the memory utilization pattern during attention? ________________ - -#### Step 3: Sequence Length Scaling (8 minutes) - -Test how memory scales with sequence length: - -```bash -# Test different sequence lengths -python tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 64 \ - --num-steps 10 \ - --enable-memory-profiling \ - --profile-dir ./memory_seq64 - -python tiny_llama_v1.py \ - --batch-size 8 \ - --seq-len 256 \ - --num-steps 10 \ - --enable-memory-profiling \ - --profile-dir ./memory_seq256 - -# Note: seq-len 512 might OOM - try with smaller batch size if needed -python tiny_llama_v1.py \ - --batch-size 4 \ - --seq-len 512 \ - --num-steps 5 \ - --enable-memory-profiling \ - --profile-dir ./memory_seq512 +tensorboard --logdir ./memory_analysis_bs8 --port 6007 ``` -**📝 Sequence Length Scaling Analysis:** - -| Seq Length | Batch Size | Peak Memory (MB) | Memory per Token | Scaling Pattern | -|------------|------------|------------------|------------------|-----------------| -| 64 | 8 | | | | -| 128 | 8 | | | | -| 256 | 8 | | | | -| 512 | 4 | | | | +In TensorBoard, navigate to the PROFILE tab and select Memory Timeline view. -**Memory Scaling Questions:** +## Step 3: Sequence Length Scaling -1. Is memory scaling linear, quadratic, or something else with sequence length? -2. Which component shows the steepest memory scaling? -3. At what sequence length do you hit memory limits? 
+Test memory scaling with sequence length: -#### Step 4: Memory Bandwidth Analysis (7 minutes) - -Use the memory profiling results to analyze bandwidth utilization: - -```bash -# Run bandwidth-focused analysis -python run_deepspeed_flops.py \ - --batch-size 8 \ - --seq-len 128 \ - --num-steps 15 \ - --computational-intensity \ - --output-dir ./bandwidth_analysis ``` - -**📝 Bandwidth Analysis Results:** - -Check the `bandwidth_analysis/computational_intensity.json` file: - -```bash -# View bandwidth metrics -python -c " -import json -data = json.load(open('./bandwidth_analysis/computational_intensity.json')) -print('Arithmetic Intensity:', data['arithmetic_intensity_flops_per_byte']) -print('Memory Bandwidth Used:', data['memory_bandwidth_used_gb_per_sec'], 'GB/s') -print('Bandwidth Utilization:', data['memory_bandwidth_utilization_percent'], '%') -print('Workload Type:', data['memory_bound_vs_compute_bound']) -" +python tiny_llama_v1.py --batch-size 8 --seq-len 64 --num-steps 10 +python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 +python tiny_llama_v1.py --batch-size 8 --seq-len 256 --num-steps 10 +python tiny_llama_v1.py --batch-size 4 --seq-len 512 --num-steps 5 ``` -**Key Metrics:** - -- Arithmetic Intensity: _______ FLOPS/byte -- Memory Bandwidth Used: _______ GB/s -- Bandwidth Utilization: _______ % -- Workload Classification: _______ - -### Analysis and Interpretation +## Expected Observations -#### Step 5: Memory Optimization Opportunities (10 minutes) +**Memory Scaling:** +- Memory scales approximately linearly with batch size +- Memory scales quadratically with sequence length (due to attention matrices) -Based on your analysis, identify optimization opportunities: +**Memory Hotspots:** +- Attention QKV matrices +- Attention score computation `[B, H, S, S]` +- FFN intermediate tensors -**📝 Memory Optimization Assessment:** +**Bandwidth Classification:** +- Arithmetic Intensity < 10 FLOPS/byte: Memory-bound +- Arithmetic Intensity 10-100 
FLOPS/byte: Mixed workload +- Arithmetic Intensity > 100 FLOPS/byte: Compute-bound -**1. Memory Scaling Efficiency** +## Optimization Targets -- [ ] Linear scaling with batch size (good) -- [ ] Quadratic scaling with sequence length (attention bottleneck) -- [ ] Peak memory much higher than average (fragmentation) -- [ ] Memory plateaus (good memory reuse) +1. **Flash Attention**: Reduce attention memory from O(S^2) to O(S) +2. **Gradient Checkpointing**: Trade compute for memory +3. **Mixed Precision (FP16/BF16)**: 2x memory reduction +4. **Kernel Fusion**: Reduce intermediate tensor allocations -**2. Bandwidth Utilization** +## Troubleshooting -- [ ] High bandwidth utilization (>70%) - compute bound -- [ ] Medium bandwidth utilization (30-70%) - mixed workload -- [ ] Low bandwidth utilization (<30%) - memory bound +Out of memory errors: -**3. Memory Hotspots** (check profiling results) - -- [ ] Attention QKV matrices -- [ ] Attention score computation -- [ ] Feed-forward intermediate tensors -- [ ] Gradient accumulation - -**4. 
Optimization Targets** - -Rank these optimizations by memory impact (1=highest, 4=lowest): -- [ ] Flash Attention (reduce attention memory) - Rank: ___ -- [ ] Gradient checkpointing (trade compute for memory) - Rank: ___ -- [ ] Mixed precision (reduce memory per parameter) - Rank: ___ -- [ ] Tensor fusion (reduce intermediate allocations) - Rank: ___ - -#### Step 6: Memory Bottleneck Identification (5 minutes) - -Determine if your workload is memory-bound or compute-bound: - -**📝 Bottleneck Classification:** - -Based on your bandwidth analysis: - -- **Arithmetic Intensity < 10 FLOPS/byte** → Memory-bound workload -- **Arithmetic Intensity 10-100 FLOPS/byte** → Mixed workload -- **Arithmetic Intensity > 100 FLOPS/byte** → Compute-bound workload - -**Your Classification:** _______________________ - -**Evidence:** - -- Arithmetic intensity: _______ FLOPS/byte -- Memory bandwidth utilization: _______ % -- GPU compute utilization: _______ % (from Exercise 1) - -**Primary Bottleneck:** - -- [ ] Memory bandwidth (low compute util, high memory util) -- [ ] Compute throughput (high compute util, low memory util) -- [ ] Mixed (balanced utilization) -- [ ] Kernel overhead (low both) - -### Expected Results - -#### Memory Usage Patterns - -- **Peak Memory Growth**: Approximately linear with batch size -- **Sequence Scaling**: Quadratic scaling due to attention matrices -- **Memory Hotspots**: Attention computation and intermediate tensors -- **Bandwidth Utilization**: 30-60% on most modern GPUs - -#### Key Findings - -1. **Attention Memory**: Consumes significant memory, scales quadratically -2. **Memory Fragmentation**: Multiple small allocations create overhead -3. **Peak vs Average**: Large difference indicates optimization opportunity -4. 
**Bandwidth Bound**: Likely memory-bound for typical configurations - -### Troubleshooting - -**Out of Memory Errors:** -```bash -# Reduce batch size and/or sequence length +``` python tiny_llama_v1.py --batch-size 2 --seq-len 64 ``` -**Memory Profiling Failed:** -```bash -# Check CUDA memory debugging -export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512 -``` +Memory fragmentation: -**Bandwidth Analysis Error:** -```bash -# Check DeepSpeed installation -pip install deepspeed ``` - -### Analysis Questions - -**📝 Critical Analysis Questions:** - -1. **What is the memory scaling behavior?** - - Batch size scaling: [ ] Linear [ ] Quadratic [ ] Exponential - - Sequence length scaling: [ ] Linear [ ] Quadratic [ ] Exponential - -2. **Where is peak memory consumed?** - - [ ] During forward pass (activations) - - [ ] During backward pass (gradients) - - [ ] During optimizer step (parameters) - -3. **What is the primary memory optimization target?** - - [ ] Reduce attention memory (Flash Attention) - - [ ] Reduce activation memory (checkpointing) - - [ ] Reduce parameter memory (mixed precision) - - [ ] Reduce fragmentation (tensor fusion) - -4. **Is the workload memory-bound or compute-bound?** - - [ ] Memory-bound (low arithmetic intensity) - - [ ] Compute-bound (high arithmetic intensity) - - [ ] Mixed workload (balanced) - -5. **What memory optimization would provide the biggest benefit?** - - [ ] Flash Attention (quadratic → linear attention memory) - - [ ] Gradient checkpointing (trade compute for memory) - - [ ] Mixed precision FP16/BF16 (2x memory reduction) - - [ ] Tensor fusion (reduce intermediate allocations) - -### Next Steps - -1. **Document your memory analysis** results -2. **Compare memory patterns** across different configurations -3. **Identify top memory optimization targets** for Version 2 -4. **Understand the memory vs compute trade-offs** -5. 
**Proceed to Exercise 3** for bottleneck identification - -### Success Criteria - -**Exercise Complete When:** - -- [ ] Memory profiling completed for multiple configurations -- [ ] Memory scaling patterns understood -- [ ] Bandwidth utilization analyzed -- [ ] Memory bottlenecks identified -- [ ] Optimization priorities ranked - ---- - -**Key Takeaway**: Memory analysis reveals that the baseline model has significant memory optimization opportunities, particularly in attention computation which scales quadratically with sequence length. Flash Attention and kernel fusion will be primary targets for Version 2. - -**Next Exercise**: [Exercise 3 - Bottleneck Identification](exercise_3_bottleneck_identification.md) - - +export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512 +``` diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_3_bottleneck_identification.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_3_bottleneck_identification.md index 16ee8fe9..8af87e44 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_3_bottleneck_identification.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_3_bottleneck_identification.md @@ -1,254 +1,48 @@ +# Exercise 3: Bottleneck Identification and Optimization Planning -## Exercise 3: Bottleneck Identification and Optimization Planning +exercise_3_bottleneck_identification.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises` in the Training Examples repository. -`exercise3_bottleneck_identification.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version1_pytorch_baseline` in the Training Examples repository +## Objective -### Objective -Systematically identify performance bottlenecks in the baseline model and create an optimization roadmap for Version 2 and beyond. 
+Systematically identify performance bottlenecks in the baseline model and create an optimization roadmap. -### Prerequisites -- Completed Exercises 1 and 2 -- Understanding of profiling results analysis +## Step 1: Comprehensive Profiling -### Duration -**Estimated Time:** 30-35 minutes +Run the complete profiling suite: -### Background - -Bottleneck identification is critical for effective optimization: -- **Amdahl's Law**: Overall speedup is limited by the slowest component -- **Optimization ROI**: Focus effort where it provides maximum benefit -- **Systematic Approach**: Use data-driven decisions rather than intuition -- **Baseline Establishment**: Create benchmarks for measuring improvement - -### Instructions - -#### Step 1: Comprehensive Profiling Run (10 minutes) - -Run the complete profiling suite to gather all necessary data: - -```bash -## Run comprehensive profiling analysis -bash run_all_profilers.sh \ +``` +python tiny_llama_v1.py \ --batch-size 8 \ --seq-len 128 \ --num-steps 30 \ + --enable-pytorch-profiler \ --profile-dir ./bottleneck_analysis ``` -This will generate: -- Baseline training metrics -- PyTorch profiler results -- FLOPS analysis data -- Memory usage patterns -- Comprehensive reports - -**📝 Wait for completion and record:** -- Overall runtime: _______ seconds -- Profile data location: _______ -- Any errors or warnings: _______ - -#### Step 2: Operator-Level Bottleneck Analysis (10 minutes) - -Analyze the detailed profiling results to identify computational bottlenecks: - -```bash -## View the comprehensive profiling report -cat ./bottleneck_analysis/performance_summary_report.md - -## Examine PyTorch profiler operator breakdown -python run_pytorch_profiler.py \ - --analyze-existing ./bottleneck_analysis/pytorch_profiling \ - --generate-report \ - --output-dir ./detailed_analysis -``` - -**📝 Top Time-Consuming Operations:** - -From the PyTorch profiler results, identify the top 10 operations by GPU time: - -| Rank | Operation Name | GPU 
Time (%) | CPU Time (%) | Count | Optimization Target | -|------|----------------|-------------|-------------|-------|-------------------| -| 1 | | | | | | -| 2 | | | | | | -| 3 | | | | | | -| 4 | | | | | | -| 5 | | | | | | -| 6 | | | | | | -| 7 | | | | | | -| 8 | | | | | | -| 9 | | | | | | -| 10 | | | | | | - -**Pattern Analysis:** -- What percentage of time is spent in matrix multiplications? _______% -- How many separate linear projection operations are there? _______ -- What's the overhead from kernel launches vs. computation? _______% - -#### Step 3: FLOPS Efficiency Analysis (8 minutes) - -Examine computational efficiency using the FLOPS analysis: - -```bash -## View FLOPS analysis results -python -c " -import json -with open('./bottleneck_analysis/flops_analysis/flops_profile.json', 'r') as f: - data = json.load(f) - -print('=== FLOPS EFFICIENCY ANALYSIS ===') -print(f'Model FLOPS Utilization: {data[\"efficiency_metrics\"][\"mfu_percent\"]:.1f}%') -print(f'Achieved FLOPS/sec: {data[\"performance_metrics\"][\"flops_per_sec\"]:.2e}') -print(f'Peak Device FLOPS: {data[\"efficiency_metrics\"][\"device_peak_flops\"]:.2e}') -print(f'FLOPS per Parameter: {data[\"flops_analysis\"][\"flops_per_parameter\"]:.2f}') -print(f'Throughput: {data[\"performance_metrics\"][\"throughput_samples_per_sec\"]:.1f} samples/sec') -" -``` - -**📝 Efficiency Metrics:** -- Model FLOPS Utilization (MFU): _______% -- Achieved FLOPS per second: _______ -- FLOPS per parameter: _______ -- Overall throughput: _______ samples/sec - -**Efficiency Classification:** -- [ ] < 20% MFU: Severely underutilized (kernel overhead dominant) -- [ ] 20-40% MFU: Memory-bound workload -- [ ] 40-60% MFU: Mixed workload with optimization opportunities -- [ ] > 60% MFU: Well-optimized compute-bound workload - -#### Step 4: Memory Bottleneck Assessment (7 minutes) - -Analyze memory-related bottlenecks: - -```bash -## Check computational intensity analysis -python -c " -import json -import os - -intensity_file = 
'./bottleneck_analysis/flops_analysis/computational_intensity.json' -if os.path.exists(intensity_file): - with open(intensity_file, 'r') as f: - data = json.load(f) - - print('=== MEMORY BOTTLENECK ANALYSIS ===') - print(f'Arithmetic Intensity: {data[\"arithmetic_intensity_flops_per_byte\"]:.2f} FLOPS/byte') - print(f'Memory Bandwidth Used: {data[\"memory_bandwidth_used_gb_per_sec\"]:.1f} GB/s') - print(f'Bandwidth Utilization: {data[\"memory_bandwidth_utilization_percent\"]:.1f}%') - print(f'Workload Type: {data[\"memory_bound_vs_compute_bound\"]}') -else: - print('Computational intensity analysis not available') -" -``` - -**📝 Memory Analysis:** -- Arithmetic Intensity: _______ FLOPS/byte -- Memory Bandwidth Utilization: _______% -- Primary Bottleneck: [ ] Memory-bound [ ] Compute-bound [ ] Mixed -- Peak Memory Usage: _______ MB - -**Roofline Model Position:** -- [ ] Below roofline - memory bound (optimize data movement) -- [ ] On roofline - balanced (optimize both) -- [ ] Below compute ceiling - compute bound (optimize kernels) - -#### Step 5: Systematic Bottleneck Ranking (10 minutes) - -Create a systematic ranking of bottlenecks based on impact and effort: - -**📝 Bottleneck Impact Assessment:** - -For each major bottleneck, assess: - -| Bottleneck Category | % of Total Time | Optimization Difficulty | Expected Speedup | Priority Rank | -|--------------------|-----------------|------------------------|------------------|---------------| -| QKV Projections | | Low-Medium | 1.2-1.5x | | -| Attention Computation | | Medium | 1.3-2.0x | | -| SwiGLU Gate/Up | | Low | 1.1-1.3x | | -| Kernel Launch Overhead | | Medium-High | 1.5-3.0x | | -| Memory Fragmentation | | Medium | 1.1-1.4x | | -| Softmax Operations | | Medium-High | 1.2-1.8x | | +## Step 2: Operator-Level Analysis -**Impact vs Effort Matrix:** +Examine the profiling results to identify computational bottlenecks. Look for the top time-consuming operations in the profiler output. 
-High Impact, Low Effort (Priority 1): -- _______________________________ -- _______________________________ +Expected top operations by GPU time: +- Matrix multiplications (aten::mm, aten::addmm, aten::bmm) +- Softmax operations +- Element-wise operations -High Impact, High Effort (Priority 2): -- _______________________________ -- _______________________________ +## Step 3: Efficiency Analysis -Low Impact, Low Effort (Priority 3): -- _______________________________ -- _______________________________ +Key efficiency metrics to examine: +- Model FLOPS Utilization (MFU) +- Memory bandwidth utilization +- Kernel launch overhead -Low Impact, High Effort (Priority 4 - Skip): -- _______________________________ -- _______________________________ +Typical baseline efficiency: +- MFU: 20-35% (memory-bound workload) +- Bandwidth utilization: 30-60% -### Analysis and Optimization Roadmap +## Typical Bottleneck Hierarchy -#### Step 6: Create Version 2 Optimization Plan (10 minutes) - -Based on your analysis, create a detailed optimization plan for Version 2: - -**📝 Version 2 Optimization Roadmap:** - -**Phase 1: Kernel Fusion (Expected: 1.4-1.8x speedup)** -- [ ] **QKV Fusion**: Combine Q, K, V linear projections - - Impact: Reduce 3 kernel launches to 1 - - Memory: Reduce intermediate tensor allocations - - Implementation: Fused linear layer - -- [ ] **SwiGLU Fusion**: Combine gate and up projections - - Impact: Reduce 2 kernel launches to 1 - - Memory: Eliminate intermediate activations - - Implementation: Custom fused activation - -**Phase 2: Attention Optimization (Expected: 1.3-2.0x speedup)** -- [ ] **Flash Attention**: Memory-efficient attention computation - - Impact: Reduce attention memory from O(n^2) to O(n) - - Memory: Enable longer sequences and larger batches - - Implementation: torch.nn.functional.scaled_dot_product_attention - -**Phase 3: Additional Optimizations (Expected: 1.1-1.3x speedup)** -- [ ] **Torch Compile**: Automatic kernel fusion -- [ ] **Memory 
Layout**: Optimize tensor layouts -- [ ] **Mixed Precision**: FP16/BF16 where appropriate - -**Expected Overall Speedup for Version 2:** _______x - -#### Step 7: Validation Metrics Definition (5 minutes) - -Define metrics to validate Version 2 improvements: - -**📝 Success Metrics for Version 2:** - -**Performance Targets:** -- Training throughput: _______ samples/sec → _______ samples/sec -- Model FLOPS Utilization: _______ % → _______ % -- Peak memory usage: _______ MB → _______ MB -- Kernel count per step: _______ → _______ - -**Validation Tests:** -- [ ] Batch size 8, sequence length 128 (baseline comparison) -- [ ] Batch size 16, sequence length 256 (scaling test) -- [ ] Memory scaling with sequence length -- [ ] Numerical accuracy validation (loss convergence) - -**Quality Gates:** -- [ ] No degradation in model accuracy -- [ ] Deterministic execution maintained -- [ ] Memory usage reduced or stable -- [ ] Throughput improved by >30% - -### Expected Results - -#### Typical Bottleneck Hierarchy 1. **Attention Operations (35-45% of time)** - - Multiple QKV projections + - QKV projections (3 separate kernel launches) - Attention score computation - Softmax operations @@ -261,98 +55,30 @@ Define metrics to validate Version 2 improvements: - Multiple small operations - Memory transfers between kernels -4. **Memory Operations (5-15% of time)** - - Tensor allocations/deallocations - - Memory fragmentation - -#### Optimization Priority Order -1. **QKV Fusion** (Low effort, medium impact) -2. **Flash Attention** (Medium effort, high impact) -3. **SwiGLU Fusion** (Low effort, low-medium impact) -4. 
**Torch Compile** (Very low effort, variable impact) - -### Troubleshooting - -**Missing Analysis Files:** -```bash -## Re-run comprehensive profiling if files are missing -bash run_all_profilers.sh --batch-size 8 --profile-dir ./bottleneck_retry -``` - -**Profiling Data Errors:** -```bash -## Check for GPU memory issues -nvidia-smi # or rocm-smi -## Reduce batch size if necessary -``` - -### Analysis Questions - -**📝 Critical Analysis Questions:** - -1. **What is the single largest performance bottleneck?** - - [ ] QKV projection operations - - [ ] Attention score computation - - [ ] Feed-forward network - - [ ] Kernel launch overhead - - [ ] Memory bandwidth - -2. **What type of optimization would provide the biggest benefit?** - - [ ] Kernel fusion (reduce launches) - - [ ] Memory optimization (bandwidth) - - [ ] Algorithmic optimization (attention) - - [ ] Precision optimization (mixed precision) - -3. **Is the workload primarily:** - - [ ] Memory-bound (optimize data movement) - - [ ] Compute-bound (optimize kernels) - - [ ] Overhead-bound (optimize launches) - - [ ] Mixed workload (balanced optimization) - -4. **What should be the first optimization implemented?** - - [ ] QKV fusion (immediate benefit) - - [ ] Flash Attention (biggest impact) - - [ ] SwiGLU fusion (easy implementation) - - [ ] Torch compile (automatic optimization) +## Optimization Roadmap -5. 
**What is the realistic speedup target for Version 2?** - - [ ] 1.2-1.4x (conservative) - - [ ] 1.5-2.0x (achievable) - - [ ] 2.0-3.0x (optimistic) - - [ ] >3.0x (unlikely without major changes) +**Priority 1: Kernel Fusion (Expected 1.4-1.8x speedup)** +- QKV Fusion: Combine Q, K, V projections into single GEMM +- SwiGLU Fusion: Combine gate and up projections -### Deliverables +**Priority 2: Attention Optimization (Expected 1.3-2.0x speedup)** +- Flash Attention: Memory-efficient attention computation +- Reduces memory from O(S^2) to O(S) -At the end of this exercise, you should have: +**Priority 3: Additional Optimizations (Expected 1.1-1.3x speedup)** +- torch.compile for automatic kernel fusion +- Mixed precision (FP16/BF16) -1. **Bottleneck Analysis Report** with quantified performance issues -2. **Optimization Roadmap** with prioritized improvements -3. **Version 2 Implementation Plan** with expected benefits -4. **Success Metrics** for validating improvements -5. **Baseline Measurements** for comparison +## Troubleshooting -### Next Steps +Missing analysis files: -1. **Document all findings** in the performance summary template -2. **Review optimization priorities** with team/instructor -3. **Validate technical feasibility** of planned optimizations -4. **Proceed to Version 2** implementation with clear targets -5. **Set up regression testing** framework for validation - -### Success Criteria - -**Exercise Complete When:** -- [ ] Comprehensive bottleneck analysis completed -- [ ] Performance bottlenecks quantified and ranked -- [ ] Optimization roadmap created with priorities -- [ ] Success metrics defined for Version 2 -- [ ] Implementation plan validated -- [ ] Ready to begin Version 2 optimizations - ---- - -**Key Takeaway**: Systematic bottleneck identification reveals that the baseline model has clear optimization opportunities in kernel fusion, attention computation, and memory usage. 
The data-driven approach provides a roadmap for achieving 1.5-2.0x speedup in Version 2. - -**Next Phase**: [Version 2 - PyTorch Fused](../version2_pytorch_fused/README.md) +``` +python tiny_llama_v1.py --batch-size 8 --profile-dir ./bottleneck_retry +``` +Check GPU status: +``` +rocm-smi +``` diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/get_counters.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_counters.sh new file mode 100755 index 00000000..2d7bf5b4 --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_counters.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Collect kernel trace data for TinyTransformer V1 with rocprofv3. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v1.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v1" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +OUTPUT_DIR="$(make_output_dir counters)" + +echo "Starting rocprofv3 kernel trace for TinyTransformer V1..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! 
Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" +echo "To analyze results:" + +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_trace.csv")" +AGENT_INFO_FILE="" + +if [ -n "$CSV_FILE" ]; then + CSV_PREFIX="${CSV_FILE%_kernel_trace.csv}" + MATCHING_AGENT_INFO="${CSV_PREFIX}_agent_info.csv" + if [ -f "$MATCHING_AGENT_INFO" ]; then + AGENT_INFO_FILE="$MATCHING_AGENT_INFO" + fi +fi + +if [ -z "$AGENT_INFO_FILE" ]; then + AGENT_INFO_FILE="$(select_largest_match "$OUTPUT_DIR" "*_agent_info.csv")" +fi + +if [ -n "$CSV_FILE" ]; then + echo " Kernel trace CSV: $CSV_FILE" +fi +if [ -n "$AGENT_INFO_FILE" ]; then + echo " Agent info CSV: $AGENT_INFO_FILE" +fi +if [ -n "$DB_FILE" ]; then + echo " SQLite database: $DB_FILE" + echo "" + echo " Export to CSV:" + echo " rocpd2csv -i \"$DB_FILE\" -o kernel_stats.csv" + echo "" + echo " Get kernel summary:" + echo " rocpd summary -i \"$DB_FILE\" --region-categories KERNEL" +fi +if [ -z "$CSV_FILE" ] && [ -z "$DB_FILE" ]; then + echo " WARNING: No ROCm profiler output file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/get_hotspots.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_hotspots.sh new file mode 100755 index 00000000..9529a70b --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_hotspots.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a quick hotspot summary for TinyTransformer V1 with rocprofv3 --stats. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v1.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v1" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir hotspots)" + +echo "Starting rocprofv3 hotspot summary for TinyTransformer V1..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --stats \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_stats.csv")" +if [ -z "$CSV_FILE" ]; then + CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_domain_stats.csv")" +fi +if [ -n "$CSV_FILE" ]; then + echo "Top rows from $CSV_FILE:" + head -11 "$CSV_FILE" +else + echo "WARNING: No hotspot CSV file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/get_rocprof_compute.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_rocprof_compute.sh new file mode 100755 index 00000000..7128e2fb --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_rocprof_compute.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Collect hardware metrics for TinyTransformer V1 with rocprof-compute. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v1.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v1" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-compute +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +MODE="${1:-no-roof}" +GPU_ARCH="$(detect_gpu_arch)" +SUPPORTED_ARCH_REGEX='^(gfx908|gfx90a|gfx940|gfx941|gfx942)$' + +if [ -n "$GPU_ARCH" ] && ! echo "$GPU_ARCH" | grep -Eq "$SUPPORTED_ARCH_REGEX"; then + echo "Skipping rocprof-compute profiling for TinyTransformer V1..." + echo "Detected GPU architecture: $GPU_ARCH" + echo "rocprof-compute hardware-counter collection currently requires a supported Instinct GPU" + echo "(for example gfx908, gfx90a, gfx940, gfx941, or gfx942)." + echo "Use get_trace.sh, get_hotspots.sh, or get_counters.sh on this system instead." + exit 0 +fi + +OUTPUT_DIR="$(make_output_dir rocprof_compute)" +PROFILE_ROOT="$OUTPUT_DIR/$WORKLOAD_NAME" + +case "$MODE" in + full) + PROFILE_ARGS=(--kernel-names) + MODE_DESCRIPTION="full profile (counters plus roofline stage)" + ;; + roof-only) + PROFILE_ARGS=(--roof-only --kernel-names) + MODE_DESCRIPTION="roofline-only profile" + ;; + no-roof) + PROFILE_ARGS=(--no-roof --kernel-names) + MODE_DESCRIPTION="counter-only profile without roofline collection" + ;; + *) + echo "Usage: $0 [no-roof|full|roof-only]" >&2 + echo " no-roof collect counters only and skip the roofline stage" >&2 + echo " full collect the default counter set and roofline data" >&2 + echo " roof-only collect roofline data only and label roofline kernels" >&2 + exit 1 + ;; +esac + +echo "Starting rocprof-compute hardware metrics for TinyTransformer V1..." 
+if [ -n "$GPU_ARCH" ]; then + echo "Detected GPU architecture: $GPU_ARCH" +fi +echo "Mode: $MODE_DESCRIPTION" +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprof-compute profile \ + --name "$WORKLOAD_NAME" \ + --path "$PROFILE_ROOT" \ + "${PROFILE_ARGS[@]}" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "To analyze results:" + +ANALYZE_PATH="" +for marker in pmc_perf.csv roofline.csv sysinfo.csv; do + MARKER_FILE="$(find "$PROFILE_ROOT" -name "$marker" 2>/dev/null | head -1)" + if [ -n "$MARKER_FILE" ]; then + ANALYZE_PATH="$(dirname "$MARKER_FILE")" + break + fi +done + +if [ -n "$ANALYZE_PATH" ]; then + echo " Raw data directory: $ANALYZE_PATH" + echo "" + echo " 1. List detected kernels and dispatches:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --list-stats" + if [ "$MODE" != "roof-only" ]; then + echo "" + echo " 2. Inspect one dispatch in the default report:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch " + echo "" + echo " 3. Check occupancy and LDS-related limits:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 2.1.15 6.2.7" + echo "" + echo " 4. Check L1/L2 memory speed-of-light metrics:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 16.1 17.1" + else + echo "" + echo " Roofline-only mode does not collect the full counter set." + echo " Re-run with '$0 full' or '$0 no-roof' for detailed block analysis." + fi +else + echo " WARNING: Could not detect the rocprof-compute raw data directory under $PROFILE_ROOT" + echo " Inspect the generated workload tree and use that path with 'rocprof-compute analyze -p'." 
+fi diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/get_rocprof_sys.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_rocprof_sys.sh new file mode 100755 index 00000000..00cc6d65 --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_rocprof_sys.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a system trace for TinyTransformer V1 with rocprof-sys. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v1.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v1" +TINYTRANSFORMER_DEFAULT_NUM_STEPS=2 +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-sys-run +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir rocprof_sys)" + +echo "Starting rocprof-sys trace for TinyTransformer V1..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +pushd "$OUTPUT_DIR" >/dev/null +rocprof-sys-run \ + --profile \ + --trace \ + -- "${BENCHMARK_CMD[@]}" +popd >/dev/null + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "Open the trace in Perfetto:" +PROTO_FILE="$(select_largest_match "$OUTPUT_DIR" "*.proto")" +if [ -n "$PROTO_FILE" ]; then + echo " Perfetto trace file: $PROTO_FILE" + echo " Open it in Perfetto UI: https://ui.perfetto.dev/" +else + echo " WARNING: No .proto file was found under $OUTPUT_DIR" + echo " Inspect the output tree and open the generated trace in Perfetto UI if present." 
+fi diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/get_trace.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_trace.sh new file mode 100755 index 00000000..83eb8521 --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/get_trace.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Collect a runtime trace for TinyTransformer V1 with rocprofv3. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v1.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v1" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +ROCM_MAJOR="$(rocm_major_from_version "$ROCM_VERSION")" +OUTPUT_DIR="$(make_output_dir trace)" + +echo "Starting rocprofv3 runtime trace for TinyTransformer V1..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary + +TRACE_CMD=(rocprofv3 --runtime-trace --output-directory "$OUTPUT_DIR") +if [ "$ROCM_MAJOR" = "6" ] || [ "$ROCM_MAJOR" = "7" ]; then + TRACE_CMD+=(--output-format pftrace) +fi + +echo "" +"${TRACE_CMD[@]}" -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! 
Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +PFTRACE_FILE="$(select_largest_match "$OUTPUT_DIR" "*.pftrace")" +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" + +if [ -n "$PFTRACE_FILE" ]; then + echo "Perfetto trace file: $PFTRACE_FILE" + echo "Open it in Perfetto UI: https://ui.perfetto.dev/" +elif [ -n "$DB_FILE" ]; then + echo "SQLite database found: $DB_FILE" + echo "Convert it to Perfetto format with:" + echo " rocpd2pftrace -i \"$DB_FILE\" -o trace.pftrace" +else + echo "WARNING: No .pftrace or .db file was found under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/test_rocpd.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/test_rocpd.sh new file mode 100755 index 00000000..128f3a53 --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/test_rocpd.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Test rocpd (ROCm Profiling Daemon) for continuous profiling +# + +set -e + +echo "==========================================" +echo "rocpd Test - Version 1" +echo "==========================================" +echo "" + +# Check if rocpd is available +if ! command -v rocpd &> /dev/null; then + echo "[ERROR] rocpd not found in PATH" + echo "rocpd may not be installed or available in this ROCm version" + exit 1 +fi + +echo "rocpd location: $(which rocpd)" +echo "" + +OUTPUT_DIR="./rocpd/rocpd_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$OUTPUT_DIR" + +echo "Output directory: $OUTPUT_DIR" +echo "" + +# Start rocpd in background +echo "Starting rocpd daemon..." +rocpd --output-dir "$OUTPUT_DIR" & +ROCPD_PID=$! +echo "rocpd running with PID: $ROCPD_PID" +echo "" + +# Give rocpd time to initialize +sleep 2 + +# Run workload +echo "Running workload: python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10" +python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 && WORKLOAD_EXIT=0 || WORKLOAD_EXIT=$? +# Inline status capture: under 'set -e' a plain failing command would abort the script before a separate WORKLOAD_EXIT=$? line ran, so the variable could only ever read 0 and the backgrounded rocpd daemon was never stopped.
+echo "" + +# Stop rocpd +echo "Stopping rocpd daemon..." +kill $ROCPD_PID 2>/dev/null || true +wait $ROCPD_PID 2>/dev/null || true +echo "" + +if [ $WORKLOAD_EXIT -eq 0 ]; then + echo "[SUCCESS] Workload completed" +else + echo "[FAILED] Workload failed with exit code $WORKLOAD_EXIT" +fi +echo "" + +echo "Generated files in $OUTPUT_DIR:" +ls -lh "$OUTPUT_DIR" +echo "" + +echo "rocpd output is a SQLite3 database file" +echo "" +echo "To view the database:" +echo " - Use VS Code SQLite Viewer extension" +echo " - rocprof-compute and rocprof-systems can consume it directly" +echo " - No official CLI tool is provided for viewing" +echo "" +echo "rocpd provides continuous profiling with minimal overhead" +echo "" diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/test_rocprofv3_version1.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/test_rocprofv3_version1.sh new file mode 100755 index 00000000..a108fc73 --- /dev/null +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/test_rocprofv3_version1.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# +# rocprofv3 validation test for tiny_llama_v1.py +# Tests profiler capture on baseline PyTorch implementation +# + +set -e + +echo "==========================================" +echo "rocprofv3 Test Suite - Version 1 Baseline" +echo "==========================================" +echo "" + +# Step 1: Environment Validation +echo "[STEP 1] Environment Validation" +echo "----------------------------------------" + +echo "ROCm Version:" +rocm-smi --showproductname || echo "rocm-smi failed" +echo "" + +echo "GPU Visibility:" +echo " HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES" +echo " ROCR_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES" +echo " HSA_ENABLE_PROFILING=$HSA_ENABLE_PROFILING" +echo "" + +echo "rocprofv3 location:" +which rocprofv3 +echo "" + +echo "PyTorch + ROCm Check:" +python3 -c " +import torch +print(f'PyTorch version: {torch.__version__}') +print(f'CUDA available: {torch.cuda.is_available()}') +if 
torch.cuda.is_available(): + print(f'Device count: {torch.cuda.device_count()}') + print(f'Device name: {torch.cuda.get_device_name(0)}') + print(f'Device capability: {torch.cuda.get_device_capability(0)}') +else: + print('WARNING: CUDA/ROCm not available!') +" +echo "" + +# Step 2: Baseline Test (No Profiler) +echo "[STEP 2] Baseline Test - No Profiler" +echo "----------------------------------------" +echo "Running: python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 5 --validate-setup" +echo "" + +python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 5 --validate-setup && BASELINE_EXIT=0 || BASELINE_EXIT=$? +# Inline status capture: with 'set -e' a bare failing command would exit the script before a separate BASELINE_EXIT=$? line ran, making the [FAILED] branch below unreachable. + +if [ $BASELINE_EXIT -eq 0 ]; then + echo "[SUCCESS] Baseline test passed" +else + echo "[FAILED] Baseline test failed with exit code $BASELINE_EXIT" + exit 1 +fi +echo "" + +# Step 3: rocprofv3 with runtime-trace (GitHub issue command pattern) +echo "[STEP 3] rocprofv3 Test - Runtime Trace + Perfetto" +echo "----------------------------------------" +echo "Running: rocprofv3 --runtime-trace --output-format pftrace -- python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 --enable-pytorch-profiler --profile-memory" +echo "" + +OUTPUT_DIR="./rocprof_v1_test_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$OUTPUT_DIR" +cd "$OUTPUT_DIR" + +rocprofv3 --runtime-trace --output-format pftrace -- python ../tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 --enable-pytorch-profiler --profile-memory && ROCPROF_EXIT=0 || ROCPROF_EXIT=$? +# Inline status capture keeps 'set -e' from aborting before the result summary below is printed.
+ +echo "" +if [ $ROCPROF_EXIT -eq 0 ]; then + echo "[SUCCESS] rocprofv3 completed" +else + echo "[FAILED] rocprofv3 failed with exit code $ROCPROF_EXIT" +fi +echo "" + +# Check generated files +echo "Generated files:" +ls -lh +echo "" + +# Check for profiling data +if ls *.pftrace 1> /dev/null 2>&1; then + echo "Found perfetto trace files:" + ls -lh *.pftrace + + echo "" + echo "Checking trace file size:" + for f in *.pftrace; do + size=$(stat -f%z "$f" 2>/dev/null || stat -c%s "$f") + if [ $size -gt 1000 ]; then + echo " $f: $size bytes (likely has data)" + else + echo " $f: $size bytes (suspiciously small)" + fi + done +else + echo "No .pftrace files found in current directory" + echo "Checking subdirectories..." + find . -name "*.pftrace" -ls +fi +echo "" + +# Check for PyTorch profiler output +if [ -d "pytorch_profiles" ]; then + echo "" + echo "PyTorch Profiler output:" + ls -lh pytorch_profiles/ + echo "" + echo "TensorBoard traces available:" + echo " Launch: tensorboard --logdir pytorch_profiles" +else + echo "" + echo "Note: pytorch_profiles directory not found (script may need directory creation fix)" +fi + +# Summary +echo "" +echo "==========================================" +echo "Test Summary" +echo "==========================================" +echo "Results directory: $OUTPUT_DIR" +echo "" +echo "Generated profiling data:" +echo " 1. rocprofv3 perfetto traces (.pftrace files)" +echo " 2. PyTorch profiler traces (pytorch_profiles/ if present)" +echo "" +echo "Next steps:" +echo " 1. Inspect generated files in $OUTPUT_DIR" +echo " 2. Open .pftrace in perfetto.dev or chrome://tracing" +echo " 3. View PyTorch traces with tensorboard --logdir pytorch_profiles" +echo " 4. Check for GPU kernel activity in both profilers" +echo " 5. 
Compare to GitHub issue #1386 output" +echo "" +echo "To view perfetto trace:" +echo " Visit: https://ui.perfetto.dev/" +echo " Click 'Open trace file' and select the .pftrace file" +echo "" diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py b/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py index defb8dca..6590f8b7 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py @@ -771,6 +771,7 @@ def train_tiny_llama( } profile_path = Path(profiler_config.profile_dir) / "performance_summary.json" + profile_path.parent.mkdir(parents=True, exist_ok=True) with open(profile_path, 'w') as f: json.dump(profile_data, f, indent=2) diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/README.md b/MLExamples/TinyTransformer/version2_pytorch_fused/README.md index 60e73ffe..f995b509 100644 --- a/MLExamples/TinyTransformer/version2_pytorch_fused/README.md +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/README.md @@ -1,812 +1,68 @@ +# TinyTransformer Version 2: Framework-Level Fusion -# Version 2: PyTorch Fused - Kernel Fusion and ROCm Tools Integration +This version keeps the same workload as version 1 and asks a narrower question: what changes when the model is routed through framework-level fusion paths? -README.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version2_pytorch_fused` in the Training Examples repository +## What changed -## Overview +The intended differences relative to version 1 are: -Version 2 demonstrates the power of kernel fusion and introduces comprehensive ROCm profiling tools. Building on the baseline analysis from Version 1, this version implements targeted optimizations to achieve significant performance improvements through strategic kernel fusion, Flash Attention, and advanced ROCm profiling integration. 
+- fused QKV projection path +- fused or memory-efficient attention path +- fused SwiGLU path +- `torch.compile`-driven graph and kernel fusion when available -## Learning Objectives +Whether those paths are actually active depends on the software stack. That is part of the lesson for this version. -After completing this version, you will be able to: +## Baseline run -- Implement QKV fusion to reduce kernel launch overhead -- Integrate Flash Attention for memory-efficient attention computation -- Apply SwiGLU fusion in feed-forward networks -- Use ROCm profiling tools (rocprofv3, rocprof-sys, rocprof-compute) for hardware-level analysis -- Analyze kernel fusion impact on performance and memory usage -- Interpret ROCm profiling data for optimization insights +Load the same environment as version 1: -## Key Optimizations Implemented - -### 1. QKV Fusion - -- **Problem**: Separate Q, K, V linear projections create 3 kernel launches -- **Solution**: Fused QKV projection with single kernel launch -- **Expected Benefit**: 20-30% reduction in attention overhead - -### 2. Flash Attention Integration - -- **Problem**: Standard attention has O(n^2) memory complexity -- **Solution**: PyTorch's scaled_dot_product_attention with Flash Attention -- **Expected Benefit**: Significant memory reduction, enables larger sequences - -### 3. SwiGLU Fusion - -- **Problem**: Separate gate and up projections in feed-forward network -- **Solution**: Combined gate/up computation with element-wise operations -- **Expected Benefit**: 15-25% feed-forward network speedup - -### 4. 
Torch Compile Integration - -- **Problem**: Remaining kernel launch overhead -- **Solution**: Automatic fusion through torch.compile() -- **Expected Benefit**: Additional 10-20% speedup through automatic optimizations - -## Architecture Enhancements and Fusion Techniques - -### Mathematical Foundation of Kernel Fusion - -Kernel fusion combines multiple operations into a single GPU kernel to reduce memory bandwidth requirements and kernel launch overhead. For complete mathematical foundations, see [TINY_LLAMA_ARCHITECTURE.md](../TINY_LLAMA_ARCHITECTURE.md). - -#### Fusion Efficiency Analysis - -**Memory Bandwidth Reduction:** - -$$ -\text{Bandwidth Reduction} = 1 - \frac{\text{Fused Operations Memory}}{\text{Separate Operations Memory}} -$$ - -**For QKV Fusion:** - -$$ -\begin{aligned} -\text{Separate}: & \quad 3 \times (\text{Input Read} + \text{Weight Read} + \text{Output Write}) \\ -& = 3 \times (B \times S \times D + D^2 + B \times S \times D) \\ -\text{Fused}: & \quad \text{Input Read} + 3 \times \text{Weight Read} + \text{Output Write} \\ -& = B \times S \times D + 3 \times D^2 + B \times S \times 3D \\ -\text{Reduction}: & \quad \frac{2 \times B \times S \times D}{\text{Total Separate Memory}} \approx 40\% \text{ for typical batch sizes} -\end{aligned} -$$ - -### 1. 
QKV Fusion Implementation - -#### Detailed QKV Fusion Analysis - -**Before Fusion (Baseline):** -```python -# Three separate linear projections - 3 kernel launches -q = self.q_proj(hidden_states) # Kernel 1: GEMM [B,S,D] × [D,D] = [B,S,D] -k = self.k_proj(hidden_states) # Kernel 2: GEMM [B,S,D] × [D,D] = [B,S,D] -v = self.v_proj(hidden_states) # Kernel 3: GEMM [B,S,D] × [D,D] = [B,S,D] - -# Memory reads: 3x input tensor + 3x weight matrices -# Memory writes: 3x output tensors -# Total FLOPS: 3 × (2 × B × S × D^2) -``` - -**After Fusion (Optimized):** -```python -# Single fused projection - 1 kernel launch -qkv = self.qkv_proj(hidden_states) # Kernel 1: GEMM [B,S,D] × [D,3D] = [B,S,3D] -q, k, v = qkv.chunk(3, dim=-1) # Tensor view operation (no memory copy) - -# Memory reads: 1x input tensor + 1x weight matrix (3x size) -# Memory writes: 1x output tensor (3x size) -# Total FLOPS: 2 × B × S × D × 3D = 6 × B × S × D^2 (same compute) -``` - -**Performance Analysis:** -```python -# Kernel launch overhead reduction -KERNEL_LAUNCH_OVERHEAD = { - 'baseline_launches': 3, - 'fused_launches': 1, - 'reduction': '67% fewer kernel launches', - 'overhead_per_launch': '5-50 μs depending on operation size', - 'total_overhead_saved': '10-100 μs per attention layer' -} - -# Memory bandwidth optimization -MEMORY_BANDWIDTH = { - 'baseline_reads': 'B×S×D (input) × 3 + D^2 × 3 (weights)', - 'fused_reads': 'B×S×D (input) × 1 + D^2 × 3 (weights)', - 'bandwidth_reduction': '~40% for typical batch sizes', - 'cache_efficiency': 'Improved due to temporal locality' -} -``` - -#### Fused QKV Implementation - -```python -class FusedQKVAttention(nn.Module): - """QKV-fused attention with detailed performance optimizations.""" - - def __init__(self, config): - super().__init__() - self.hidden_dim = config.hidden_dim - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_dim // self.num_heads - - # Single fused QKV projection - critical optimization! 
- self.qkv_proj = nn.Linear( - config.hidden_dim, - 3 * config.hidden_dim, - bias=False - ) - self.o_proj = nn.Linear(config.hidden_dim, config.hidden_dim, bias=False) - - # RoPE for position embeddings - self.rotary_emb = RotaryEmbedding(self.head_dim) - - def forward(self, hidden_states, attention_mask=None): - batch_size, seq_len, _ = hidden_states.size() - - # OPTIMIZATION 1: Fused QKV projection (3 ops → 1 op) - with nvtx.range("fused_qkv_projection"): - qkv = self.qkv_proj(hidden_states) # [B, S, 3*D] - - # OPTIMIZATION 2: Efficient tensor chunking (no memory copy) - q, k, v = qkv.chunk(3, dim=-1) # Each: [B, S, D] - - # Reshape for multi-head attention - q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) - - # Apply RoPE (rotary position embeddings) - q, k = self.rotary_emb(q, k, seq_len) - - # OPTIMIZATION 3: Flash Attention (covered in next section) - with nvtx.range("flash_attention"): - attn_output = F.scaled_dot_product_attention( - q, k, v, - attn_mask=attention_mask, - is_causal=True # Enables causal masking optimization - ) - - # Reshape and project output - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(batch_size, seq_len, self.hidden_dim) - - return self.o_proj(attn_output) -``` - -### 2. 
Flash Attention Deep Dive - -#### Memory Complexity Analysis - -**Standard Attention Memory:** - -$$ -\begin{aligned} -\text{Attention Matrix} &: \mathcal{O}(B \times H \times S^{2}) \\ -\text{For } S=1024: &\quad 1024^2 = 1M \text{ elements per head} \\ -\text{Total Memory} &: B \times H \times S^{2} \times 4 \text{ bytes} \\ -\text{Example}: &\quad 8 \times 8 \times 1024^2 \times 4 = 268\text{MB} -\end{aligned} -$$ - -**Flash Attention Memory:** - -$$ -\begin{aligned} -\text{Block Size} &: B_r \times B_c \quad (\text{typically } 64 \times 64) \\ -\text{Memory Usage} &: \mathcal{O}(B \times H \times (B_r + B_c) \times \frac{S^{2}}{B_r \times B_c}) \\ -&= \mathcal{O}(B \times H \times S) \text{ (linear in sequence length!)} \\ -\text{Reduction} &: \frac{S^{2}}{S} = S \text{-fold memory reduction} -\end{aligned} -$$ - -#### Flash Attention Implementation Details - -```python -# Flash Attention Algorithm (PyTorch implementation) -def flash_attention_forward(q, k, v, mask=None): - """Memory-efficient attention with O(N) memory complexity.""" - - # Use PyTorch's optimized implementation - return F.scaled_dot_product_attention( - q, k, v, - attn_mask=mask, - dropout_p=0.0, - is_causal=True, # Enables causal mask optimization - scale=None # Uses 1/sqrt(head_dim) automatically - ) - -# The above function automatically: -# 1. Tiles the computation into blocks -# 2. Computes attention scores incrementally -# 3. Maintains numerical stability with online softmax -# 4. 
Minimizes memory transfers between HBM and SRAM -``` - -**Flash Attention Performance Characteristics:** -```python -FLASH_ATTENTION_BENEFITS = { - 'memory_complexity': { - 'standard': 'O(B × H × S^2)', - 'flash': 'O(B × H × S)', - 'reduction_factor': 'S (sequence length)' - }, - 'computation': { - 'flops': 'Same as standard attention', - 'io_complexity': 'O(S^2 / √M) vs O(S^2) where M is SRAM size', - 'wall_clock': '2-4x faster for sequences > 512' - }, - 'numerical_stability': { - 'method': 'Online softmax with running max', - 'precision': 'Better numerical stability than standard attention', - 'overflow_protection': 'Built-in overflow/underflow handling' - } -} -``` - -### 3. SwiGLU Fusion Implementation - -#### SwiGLU Mathematical Analysis - -**Baseline SwiGLU (Separate Operations):** - -$$ -\begin{aligned} -\text{gate} &= xW_{\text{gate}} + b_{\text{gate}} \quad \text{(Linear projection 1)} \\ -\text{up} &= xW_{\text{up}} + b_{\text{up}} \quad \text{(Linear projection 2)} \\ -\text{activated} &= \text{SiLU}(\text{gate}) \quad \text{(Activation function)} \\ -\text{intermediate} &= \text{activated} \odot \text{up} \quad \text{(Element-wise multiply)} \\ -\text{output} &= \text{intermediate} W_{\text{down}} + b_{\text{down}} \quad \text{(Linear projection 3)} -\end{aligned} -$$ - -**Fused SwiGLU (Optimized):** - -$$ -\begin{aligned} -\text{gate\_up} &= x[W_{\text{gate}} \parallel W_{\text{up}}] \quad \text{(Single GEMM)} \\ -\text{gate, up} &= \text{split}(\text{gate\_up}, \text{dim}=-1) \quad \text{(Tensor view)} \\ -\text{output} &= (\text{SiLU}(\text{gate}) \odot \text{up})W_{\text{down}} \quad \text{(Fused activation + projection)} -\end{aligned} -$$ - -#### Performance Impact Analysis - -```python -# FLOP count comparison -SWIGLU_FLOPS = { - 'gate_projection': 2 * batch_size * seq_len * hidden_dim * intermediate_dim, - 'up_projection': 2 * batch_size * seq_len * hidden_dim * intermediate_dim, - 'down_projection': 2 * batch_size * seq_len * intermediate_dim 
* hidden_dim, - 'silu_activation': batch_size * seq_len * intermediate_dim, # Element-wise - 'elementwise_multiply': batch_size * seq_len * intermediate_dim, # Element-wise -} - -# Memory access pattern optimization -MEMORY_ACCESS_OPTIMIZATION = { - 'baseline_memory_ops': { - 'gate_proj': 'Input read + Weight read + Output write', - 'up_proj': 'Input read + Weight read + Output write', - 'down_proj': 'Input read + Weight read + Output write', - 'total_input_reads': 3, # Major inefficiency! - }, - 'fused_memory_ops': { - 'gate_up_proj': 'Input read + Weight read + Output write', - 'down_proj': 'Input read + Weight read + Output write', - 'total_input_reads': 2, # 33% reduction in memory bandwidth - } -} -``` - -#### Detailed SwiGLU Fusion Implementation - -```python -class FusedSwiGLU(nn.Module): - """SwiGLU with gate/up projection fusion for optimal performance.""" - - def __init__(self, config): - super().__init__() - self.hidden_dim = config.hidden_dim - self.intermediate_dim = config.intermediate_dim - - # OPTIMIZATION: Fused gate and up projections - self.gate_up_proj = nn.Linear( - self.hidden_dim, - 2 * self.intermediate_dim, # Combined weight matrix - bias=False - ) - - self.down_proj = nn.Linear( - self.intermediate_dim, - self.hidden_dim, - bias=False - ) - - def forward(self, hidden_states): - batch_size, seq_len, hidden_dim = hidden_states.shape - - # OPTIMIZATION 1: Single GEMM for gate and up projections - with nvtx.range("fused_gate_up_projection"): - gate_up = self.gate_up_proj(hidden_states) # [B, S, 2*I] - - # OPTIMIZATION 2: Efficient tensor splitting (no memory copy) - gate, up = gate_up.chunk(2, dim=-1) # Each: [B, S, I] - - # OPTIMIZATION 3: Fused SiLU activation with element-wise multiply - with nvtx.range("silu_and_multiply"): - # SiLU: x * sigmoid(x) = x / (1 + exp(-x)) - intermediate = F.silu(gate) * up - - # Final down projection - with nvtx.range("down_projection"): - output = self.down_proj(intermediate) - - return output -``` - 
-**Advanced SwiGLU Optimizations:** -```python -# Custom SiLU implementation for maximum efficiency -def fused_silu_multiply(gate, up): - """Fused SiLU activation with element-wise multiplication.""" - # Can be further optimized with custom kernels in Version 3 - return F.silu(gate) * up - -# Memory layout optimization -def optimized_weight_layout(gate_weight, up_weight): - """Optimize weight matrix layout for fused GEMM.""" - # Concatenate weights for optimal memory access - return torch.cat([gate_weight, up_weight], dim=0) -``` - -### 4. Torch Compile Integration - -#### Graph-Level Optimization - -```python -# Automatic fusion through torch.compile -@torch.compile(mode='max-autotune') -class CompiledTinyLlama(nn.Module): - """Automatically optimized model with torch.compile.""" - - def __init__(self, config): - super().__init__() - self.layers = nn.ModuleList([ - FusedTransformerBlock(config) for _ in range(config.num_layers) - ]) - - def forward(self, input_ids, attention_mask=None): - # torch.compile will automatically: - # 1. Fuse adjacent operations - # 2. Optimize memory layouts - # 3. Generate specialized kernels - # 4. 
Eliminate redundant operations - - hidden_states = self.embed_tokens(input_ids) - - for layer in self.layers: - hidden_states = layer(hidden_states, attention_mask) - - return self.norm(hidden_states) -``` - -**Torch Compile Optimization Benefits:** -```python -TORCH_COMPILE_OPTIMIZATIONS = { - 'automatic_fusion': { - 'elementwise_ops': 'Fuses adjacent elementwise operations', - 'reduction_ops': 'Combines reductions where possible', - 'memory_planning': 'Optimizes tensor allocation and deallocation' - }, - 'kernel_specialization': { - 'shape_specialization': 'Generates optimized kernels for specific shapes', - 'dtype_optimization': 'Optimizes for specific data types', - 'device_targeting': 'AMD GPU-specific optimizations' - }, - 'graph_optimization': { - 'dead_code_elimination': 'Removes unused operations', - 'constant_folding': 'Precomputes constant expressions', - 'common_subexpression': 'Eliminates redundant computations' - } -} -``` - -### Fusion Performance Analysis Framework - -#### Kernel Launch Reduction Analysis - -```python -# Theoretical kernel count analysis -KERNEL_COUNT_ANALYSIS = { - 'baseline_attention': { - 'q_projection': 1, - 'k_projection': 1, - 'v_projection': 1, - 'attention_computation': 3, # QK^T, softmax, attention*V - 'output_projection': 1, - 'total': 7 - }, - 'fused_attention': { - 'qkv_projection': 1, # Fused Q,K,V - 'flash_attention': 1, # Optimized attention - 'output_projection': 1, - 'total': 3 - }, - 'reduction': '57% fewer kernels per attention layer' -} - -# Memory bandwidth utilization -MEMORY_BANDWIDTH_ANALYSIS = { - 'baseline_efficiency': { - 'multiple_small_ops': 'Poor memory bandwidth utilization', - 'cache_misses': 'Frequent cache evictions between operations', - 'bandwidth_usage': '40-60% of peak bandwidth' - }, - 'fused_efficiency': { - 'larger_operations': 'Better memory bandwidth utilization', - 'temporal_locality': 'Improved cache reuse', - 'bandwidth_usage': '70-85% of peak bandwidth' - } -} -``` - -#### Arithmetic 
Intensity Optimization - -```python -# Roofline model analysis for fusion optimizations -def calculate_arithmetic_intensity(operation_type, batch_size, seq_len, hidden_dim): - """Calculate arithmetic intensity for roofline analysis.""" - - intensity_metrics = { - 'baseline_attention': { - 'flops': 4 * batch_size * seq_len * hidden_dim ** 2, - 'memory_bytes': 3 * (batch_size * seq_len * hidden_dim * 4), # 3 separate reads - 'arithmetic_intensity': 'flops / memory_bytes' - }, - 'fused_qkv_attention': { - 'flops': 4 * batch_size * seq_len * hidden_dim ** 2, # Same compute - 'memory_bytes': 1 * (batch_size * seq_len * hidden_dim * 4), # Single read - 'arithmetic_intensity': '3x higher than baseline' - } - } - - return intensity_metrics -``` - -## Workshop Exercises - -### Exercise 1: Kernel Fusion Analysis - -**Objective**: Compare baseline vs. fused implementations to quantify fusion benefits. - -#### Step 1: Baseline Comparison -```bash -# Run Version 1 baseline for comparison -cd ../version1_pytorch_baseline -python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 30 > ../version2_baseline_comparison.log - -# Run Version 2 fused implementation -cd ../version2_pytorch_fused -python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 > fused_performance.log -``` - -#### Step 2: Kernel Count Analysis ```bash -# PyTorch profiler comparison -python run_pytorch_profiler.py --batch-size 8 --profile-dir ./fusion_analysis --generate-report - -# Compare kernel counts between versions -python analyze_kernel_reduction.py --baseline ../version1_pytorch_baseline/pytorch_profiles --fused ./fusion_analysis +module load pytorch rocm ``` -**Expected Results:** -- 40-60% reduction in kernel launch count -- 1.4-1.8x speedup in overall training -- Improved GPU utilization metrics - -### Exercise 2: Flash Attention Memory Analysis - -**Objective**: Analyze memory efficiency improvements from Flash Attention. 
- -#### Step 1: Memory Scaling Test -```bash -# Test memory scaling with sequence length -for seq_len in 128 256 512 1024; do - python tiny_llama_v2.py \ - --seq-len $seq_len \ - --batch-size 4 \ - --enable-memory-profiling \ - --profile-dir ./flash_attention_seq${seq_len} -done -``` +Run: -#### Step 2: Memory Bandwidth Analysis ```bash -# Analyze memory bandwidth utilization -python run_deepspeed_flops.py \ - --batch-size 8 \ - --seq-len 256 \ - --computational-intensity \ - --generate-roofline +python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 10 ``` -**Expected Results:** - -- Linear memory scaling vs. quadratic for baseline -- 2-4x memory reduction for longer sequences -- Improved arithmetic intensity metrics - -### Exercise 3: ROCm Tools Deep Dive - -**Objective**: Master ROCm profiling tools for hardware-level optimization. - -AMD offers three performance profiling tools for ROCm based applications: -`rocprofv3`, `rocprof-sys`, and `rocprof-compute`. For more details about these tools, see -[Appendix C of the TECHNICAL_APPENDICES.md](https://github.com/amd/HPCTrainingExamples/blob/main/MLExamples/TinyTransformer/TECHNICAL_APPENDICES.md#appendix-c-rocm-profiling-tools-reference). -about each tool. - -#### Step 1: rocprofv3 Basic Profiling +Example output from one validated run: -Running rocprofv3 to collect GPU hotspots on this example would look like this: +```text +Performance Summary V2: + Average training speed: 259.0 samples/sec + Throughput: 33152 tokens/sec + Average batch time: 30.9 ms + Peak memory usage: 434.3 MB -```bash -rocprofv3 --kernel-trace --stats --truncate-kernels -- python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 +Fusion Efficiency: + QKV Fusion Active: False + Flash Attention Active: False + SwiGLU Fusion Active: False + Kernel Reduction: 0.0% ``` -View the `_kernel_stats.csv` file to see the GPU kernel hotspots. - -#### Step 2: rocprof-sys System Analysis +On this stack, the fused paths were not active. 
That is still useful training material because it shows that version 2 should be treated as a check, not as a guaranteed speedup. -To collect a comprehensive timeline trace with host and device activity, run rocprof-sys as shown below: +## Profiling workflow -```bash -rocprof-sys-run --profile --trace -- python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 -``` +Use the same scripts as version 1: -Copy the `.proto` file to your laptop to visualize with the Perfetto browser based tool at [https://ui.perfetto.dev](https://ui.perfetto.dev). +- `./get_hotspots.sh` +- `./get_trace.sh` +- `./get_counters.sh` +- `./get_rocprof_compute.sh` +- `./get_rocprof_sys.sh` -#### Step 3: rocprof-compute Advanced Analysis - -To collect roofline plots, run the following command: - -```bash -rocprof-compute profile -n roof --kernel-names --roof-only --device 0 -- python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 -``` - -This generates three PDF files: two roofline plots and a legend. - -To collect a profile, then analyze a particular dispatch, run the following commands: - -```bash -rocprof-compute profile -n ver2 --no-roof -- python3 tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 -rocprof-compute analyze -p workloads/ver2/MI300A_A1 --list-stats >& stats.txt -rocprof-compute analyze -p workloads/ver2/MI300A_A1 --dispatch 1538 >& dispatch_1538.txt -``` +The first question is whether the hotspot list and trace structure actually differ from version 1. If the fused paths are active, you should expect fewer short-lived kernels and a more concentrated dominant kernel set. If they are inactive, version 2 becomes a useful negative control. -The `--list-stats` option provides a hotspot list of GPU kernels and a list of dispatches. Pick a dispatch of the -kernel that you want to analyze further and use that in the subsequent analyze command. For example, we are -analyzing dispatch 1538 here. 
- - - -## Key Performance Improvements - -### Expected Performance Gains - -| Optimization | Impact | Memory Reduction | Kernel Reduction | Implementation Effort | -|-------------|--------|------------------|------------------|---------------------| -| **QKV Fusion** | 1.2-1.4x | 15-25% | 33% (3→1 kernels) | Low | -| **Flash Attention** | 1.3-2.0x | 50-80% | 20% fewer kernels | Medium | -| **SwiGLU Fusion** | 1.1-1.3x | 10-20% | 50% (2→1 kernels) | Low | -| **Torch Compile** | 1.1-1.2x | 5-10% | 10-30% | Very Low | -| **Combined Effect** | **1.6-2.5x** | **60-90%** | **40-60%** | - | - -### Scaling Characteristics - -- **Batch Size Scaling**: Improved efficiency at larger batch sizes -- **Sequence Length Scaling**: Near-linear memory scaling (vs. quadratic) -- **Model Size Scaling**: Better utilization for larger hidden dimensions -- **Multi-GPU Scaling**: Reduced communication overhead - - - -## Advanced Features - -### Configurable Fusion Levels - -```bash -# Selective fusion testing -python tiny_llama_v2.py \ - --enable-qkv-fusion \ - --enable-flash-attention \ - --disable-swiglu-fusion \ - --enable-torch-compile - -# A/B testing different fusion combinations -python fusion_ablation_study.py --all-combinations -``` - -### Dynamic Batch Size Optimization - -```bash -# Find optimal batch size for current hardware -python optimize_batch_size.py \ - --target-memory-usage 0.8 \ - --seq-len 128 \ - --optimization-target throughput -``` - -### Mixed Precision Integration - -```bash -# Test mixed precision with fusion -python tiny_llama_v2.py \ - --use-amp \ - --amp-dtype bfloat16 \ - --enable-all-fusion -``` - -## Performance Validation - -### Regression Testing - -```bash -# Numerical accuracy validation -python validate_numerical_accuracy.py \ - --baseline ../version1_pytorch_baseline/tiny_llama_v1.py \ - --optimized ./tiny_llama_v2.py \ - --tolerance 1e-4 - -# Performance regression testing -python performance_regression_test.py \ - --baseline-results 
../version1_baseline_metrics.json \ - --current-results ./version2_metrics.json \ - --min-speedup 1.3 -``` - -### Benchmark Suite - -```bash -# Comprehensive benchmarking -python benchmark_suite.py \ - --models v1,v2 \ - --batch-sizes 4,8,16,32 \ - --seq-lengths 128,256,512 \ - --metrics throughput,memory,accuracy -``` - -## Troubleshooting - -### Common Issues - -#### Flash Attention Compatibility -```bash -# Check PyTorch version compatibility -python -c "import torch; print(torch.__version__); print(hasattr(torch.nn.functional, 'scaled_dot_product_attention'))" - -# Fallback for older PyTorch versions -export PYTORCH_FALLBACK_ATTENTION=1 -``` - -#### ROCm Tools Permission Issues -```bash -# Ensure proper permissions for ROCm profiling -sudo usermod -a -G render $USER -export ROCPROF_COMPUTE_DISABLE_AQL_DEBUG=1 -``` - -#### Memory Issues with Larger Sequences -```bash -# Enable memory optimization flags -export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256 -export HIP_LAUNCH_BLOCKING=1 # For debugging -``` - - - -## Expected Learning Outcomes - -### Technical Skills Developed - -- **Kernel Fusion Techniques**: Practical implementation of operation fusion -- **Memory Optimization**: Understanding memory-efficient algorithm design -- **ROCm Profiling Mastery**: Comprehensive hardware profiling skills -- **Performance Analysis**: Data-driven optimization decision making - -### Performance Engineering Insights - -- **Amdahl's Law in Practice**: Understanding optimization impact distribution -- **Memory vs. Compute Trade-offs**: Balancing different optimization strategies -- **Hardware Utilization**: Maximizing GPU resource utilization -- **Scaling Characteristics**: How optimizations affect different workload sizes - -## Next Steps - -After mastering Version 2: - -1. **Analyze fusion impact** across different model and batch configurations -2. **Identify remaining bottlenecks** using ROCm profiling data -3. 
**Prepare optimization targets** for Version 3 (Triton kernels) -4. **Document lessons learned** for production deployment -5. **Establish performance baselines** for advanced optimizations - -**Ready for Custom Kernels? Proceed to [Version 3: Triton Integration](../version3_triton/README.md)** - - +## Comparison target -**Expected Results**: 1.6-2.5x speedup, 60-90% memory reduction, comprehensive ROCm profiling mastery. +Compare this version directly against [`../version1_pytorch_baseline`](../version1_pytorch_baseline). The comparison is more important than the absolute number from any single run. +## References +- comparison across versions: [`../VERSION_COMPARISON.md`](../VERSION_COMPARISON.md) +- rocprofv3: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/develop/how-to/using-rocprofv3.html +- Perfetto UI: https://ui.perfetto.dev/ diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/get_counters.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/get_counters.sh new file mode 100755 index 00000000..76c10f0a --- /dev/null +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/get_counters.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Collect kernel trace data for TinyTransformer V2 with rocprofv3. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v2.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v2" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +OUTPUT_DIR="$(make_output_dir counters)" + +echo "Starting rocprofv3 kernel trace for TinyTransformer V2..." 
+if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" +echo "To analyze results:" + +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_trace.csv")" +AGENT_INFO_FILE="" + +if [ -n "$CSV_FILE" ]; then + CSV_PREFIX="${CSV_FILE%_kernel_trace.csv}" + MATCHING_AGENT_INFO="${CSV_PREFIX}_agent_info.csv" + if [ -f "$MATCHING_AGENT_INFO" ]; then + AGENT_INFO_FILE="$MATCHING_AGENT_INFO" + fi +fi + +if [ -z "$AGENT_INFO_FILE" ]; then + AGENT_INFO_FILE="$(select_largest_match "$OUTPUT_DIR" "*_agent_info.csv")" +fi + +if [ -n "$CSV_FILE" ]; then + echo " Kernel trace CSV: $CSV_FILE" +fi +if [ -n "$AGENT_INFO_FILE" ]; then + echo " Agent info CSV: $AGENT_INFO_FILE" +fi +if [ -n "$DB_FILE" ]; then + echo " SQLite database: $DB_FILE" + echo "" + echo " Export to CSV:" + echo " rocpd2csv -i \"$DB_FILE\" -o kernel_stats.csv" + echo "" + echo " Get kernel summary:" + echo " rocpd summary -i \"$DB_FILE\" --region-categories KERNEL" +fi +if [ -z "$CSV_FILE" ] && [ -z "$DB_FILE" ]; then + echo " WARNING: No ROCm profiler output file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/get_hotspots.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/get_hotspots.sh new file mode 100755 index 00000000..da7d6c0a --- /dev/null +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/get_hotspots.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a quick hotspot summary for TinyTransformer V2 with rocprofv3 --stats. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v2.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v2" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir hotspots)" + +echo "Starting rocprofv3 hotspot summary for TinyTransformer V2..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --stats \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_stats.csv")" +if [ -z "$CSV_FILE" ]; then + CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_domain_stats.csv")" +fi +if [ -n "$CSV_FILE" ]; then + echo "Top rows from $CSV_FILE:" + head -11 "$CSV_FILE" +else + echo "WARNING: No hotspot CSV file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/get_rocprof_compute.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/get_rocprof_compute.sh new file mode 100755 index 00000000..11e319fd --- /dev/null +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/get_rocprof_compute.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Collect hardware metrics for TinyTransformer V2 with rocprof-compute. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v2.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v2" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-compute +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +MODE="${1:-no-roof}" +GPU_ARCH="$(detect_gpu_arch)" +SUPPORTED_ARCH_REGEX='^(gfx908|gfx90a|gfx940|gfx941|gfx942)$' + +if [ -n "$GPU_ARCH" ] && ! echo "$GPU_ARCH" | grep -Eq "$SUPPORTED_ARCH_REGEX"; then + echo "Skipping rocprof-compute profiling for TinyTransformer V2..." + echo "Detected GPU architecture: $GPU_ARCH" + echo "rocprof-compute hardware-counter collection currently requires a supported Instinct GPU" + echo "(for example gfx908, gfx90a, gfx940, gfx941, or gfx942)." + echo "Use get_trace.sh, get_hotspots.sh, or get_counters.sh on this system instead." + exit 0 +fi + +OUTPUT_DIR="$(make_output_dir rocprof_compute)" +PROFILE_ROOT="$OUTPUT_DIR/$WORKLOAD_NAME" + +case "$MODE" in + full) + PROFILE_ARGS=(--kernel-names) + MODE_DESCRIPTION="full profile (counters plus roofline stage)" + ;; + roof-only) + PROFILE_ARGS=(--roof-only --kernel-names) + MODE_DESCRIPTION="roofline-only profile" + ;; + no-roof) + PROFILE_ARGS=(--no-roof --kernel-names) + MODE_DESCRIPTION="counter-only profile without roofline collection" + ;; + *) + echo "Usage: $0 [no-roof|full|roof-only]" >&2 + echo " no-roof collect counters only and skip the roofline stage" >&2 + echo " full collect the default counter set and roofline data" >&2 + echo " roof-only collect roofline data only and label roofline kernels" >&2 + exit 1 + ;; +esac + +echo "Starting rocprof-compute hardware metrics for TinyTransformer V2..." 
+if [ -n "$GPU_ARCH" ]; then + echo "Detected GPU architecture: $GPU_ARCH" +fi +echo "Mode: $MODE_DESCRIPTION" +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprof-compute profile \ + --name "$WORKLOAD_NAME" \ + --path "$PROFILE_ROOT" \ + "${PROFILE_ARGS[@]}" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "To analyze results:" + +ANALYZE_PATH="" +for marker in pmc_perf.csv roofline.csv sysinfo.csv; do + MARKER_FILE="$(find "$PROFILE_ROOT" -name "$marker" 2>/dev/null | head -1)" + if [ -n "$MARKER_FILE" ]; then + ANALYZE_PATH="$(dirname "$MARKER_FILE")" + break + fi +done + +if [ -n "$ANALYZE_PATH" ]; then + echo " Raw data directory: $ANALYZE_PATH" + echo "" + echo " 1. List detected kernels and dispatches:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --list-stats" + if [ "$MODE" != "roof-only" ]; then + echo "" + echo " 2. Inspect one dispatch in the default report:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch " + echo "" + echo " 3. Check occupancy and LDS-related limits:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 2.1.15 6.2.7" + echo "" + echo " 4. Check L1/L2 memory speed-of-light metrics:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 16.1 17.1" + else + echo "" + echo " Roofline-only mode does not collect the full counter set." + echo " Re-run with '$0 full' or '$0 no-roof' for detailed block analysis." + fi +else + echo " WARNING: Could not detect the rocprof-compute raw data directory under $PROFILE_ROOT" + echo " Inspect the generated workload tree and use that path with 'rocprof-compute analyze -p'." 
+fi diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/get_rocprof_sys.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/get_rocprof_sys.sh new file mode 100755 index 00000000..756b4b99 --- /dev/null +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/get_rocprof_sys.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a system trace for TinyTransformer V2 with rocprof-sys. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v2.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v2" +TINYTRANSFORMER_DEFAULT_NUM_STEPS=2 +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-sys-run +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir rocprof_sys)" + +echo "Starting rocprof-sys trace for TinyTransformer V2..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +pushd "$OUTPUT_DIR" >/dev/null +rocprof-sys-run \ + --profile \ + --trace \ + -- "${BENCHMARK_CMD[@]}" +popd >/dev/null + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "Open the trace in Perfetto:" +PROTO_FILE="$(select_largest_match "$OUTPUT_DIR" "*.proto")" +if [ -n "$PROTO_FILE" ]; then + echo " Perfetto trace file: $PROTO_FILE" + echo " Open it in Perfetto UI: https://ui.perfetto.dev/" +else + echo " WARNING: No .proto file was found under $OUTPUT_DIR" + echo " Inspect the output tree and open the generated trace in Perfetto UI if present." 
+fi diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/get_trace.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/get_trace.sh new file mode 100755 index 00000000..c065397f --- /dev/null +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/get_trace.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Collect a runtime trace for TinyTransformer V2 with rocprofv3. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v2.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v2" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +ROCM_MAJOR="$(rocm_major_from_version "$ROCM_VERSION")" +OUTPUT_DIR="$(make_output_dir trace)" + +echo "Starting rocprofv3 runtime trace for TinyTransformer V2..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary + +TRACE_CMD=(rocprofv3 --runtime-trace --output-directory "$OUTPUT_DIR") +if [ "$ROCM_MAJOR" = "6" ] || [ "$ROCM_MAJOR" = "7" ]; then + TRACE_CMD+=(--output-format pftrace) +fi + +echo "" +"${TRACE_CMD[@]}" -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! 
Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +PFTRACE_FILE="$(select_largest_match "$OUTPUT_DIR" "*.pftrace")" +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" + +if [ -n "$PFTRACE_FILE" ]; then + echo "Perfetto trace file: $PFTRACE_FILE" + echo "Open it in Perfetto UI: https://ui.perfetto.dev/" +elif [ -n "$DB_FILE" ]; then + echo "SQLite database found: $DB_FILE" + echo "Convert it to Perfetto format with:" + echo " rocpd2pftrace -i \"$DB_FILE\" -o trace.pftrace" +else + echo "WARNING: No .pftrace or .db file was found under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/test_github_issue.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/test_github_issue.sh new file mode 100755 index 00000000..439cfa3f --- /dev/null +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/test_github_issue.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# +# Test exact command from GitHub issue #1386 +# Issue: "No device activity" with rocprofv3 on version2 +# + +set -e + +echo "==========================================" +echo "GitHub Issue #1386 Reproduction Test" +echo "==========================================" +echo "" + +OUTPUT_DIR="./github_issue_test/test_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$OUTPUT_DIR" + +echo "Output directory: $OUTPUT_DIR" +echo "" +echo "Reproducing exact command from GitHub issue #1386:" +echo "rocprofv3 --runtime-trace --output-format pftrace -- python tiny_llama_v2.py --batch-size 8 --seq-len 128" +echo "" +echo "Note: GitHub issue did NOT specify --num-steps, so default value will be used" +echo "" + +cd "$OUTPUT_DIR" +rocprofv3 --runtime-trace --output-format pftrace -- python ../../tiny_llama_v2.py --batch-size 8 --seq-len 128 +ROCPROF_EXIT=$? 
+
+echo ""
+if [ $ROCPROF_EXIT -eq 0 ]; then
+    echo "[SUCCESS] rocprofv3 profiling completed"
+else
+    echo "[FAILED] rocprofv3 profiling failed with exit code $ROCPROF_EXIT"
+    exit 1
+fi
+echo ""
+
+echo "Generated files:"
+find . -type f -ls
+echo ""
+
+echo "Checking trace file sizes:"
+if compgen -G "*/*.pftrace" > /dev/null; then
+    for f in */*.pftrace; do
+        SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f" 2>/dev/null || echo "unknown")
+        SIZE_MB=$(echo "scale=2; $SIZE / 1048576" | bc)
+        echo "  $f - ${SIZE_MB} MB"
+    done
+    echo ""
+    LARGEST=$(find . -name "*.pftrace" -exec ls -l {} \; | sort -k5 -n -r | head -1 | awk '{print $9, $5}')
+    LARGEST_FILE=$(echo $LARGEST | awk '{print $1}')
+    LARGEST_SIZE=$(echo $LARGEST | awk '{print $2}')
+    LARGEST_MB=$(echo "scale=2; $LARGEST_SIZE / 1048576" | bc)
+
+    echo "Largest trace: $LARGEST_FILE (${LARGEST_MB} MB)"
+    echo ""
+
+    if (( $(echo "$LARGEST_MB < 1" | bc -l) )); then
+        echo "[WARNING] Trace file is very small (< 1 MB)"
+        echo "This may indicate 'no device activity' issue from GitHub #1386"
+    else
+        echo "[OK] Trace file size looks normal"
+        echo "Version2 profiling appears to be working correctly"
+    fi
+else
+    echo "[ERROR] No .pftrace files found"
+fi
+echo ""
+
+echo "Comparison with version1 baseline:"
+echo "  Version1 trace size: ~44 MB"
+echo "  Version2 trace size: ${LARGEST_MB:-unknown} MB"
+echo ""
diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/test_rocpd.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/test_rocpd.sh
new file mode 100755
index 00000000..a40d273b
--- /dev/null
+++ b/MLExamples/TinyTransformer/version2_pytorch_fused/test_rocpd.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Test rocpd (ROCm Profiling Daemon) for continuous profiling
+#
+
+set -e
+
+echo "=========================================="
+echo "rocpd Test - Version 2"
+echo "=========================================="
+echo ""
+
+# Check if rocpd is available
+if ! 
command -v rocpd &> /dev/null; then + echo "[ERROR] rocpd not found in PATH" + echo "rocpd may not be installed or available in this ROCm version" + exit 1 +fi + +echo "rocpd location: $(which rocpd)" +echo "" + +OUTPUT_DIR="./rocpd/rocpd_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$OUTPUT_DIR" + +echo "Output directory: $OUTPUT_DIR" +echo "" + +# Start rocpd in background +echo "Starting rocpd daemon..." +rocpd --output-dir "$OUTPUT_DIR" & +ROCPD_PID=$! +echo "rocpd running with PID: $ROCPD_PID" +echo "" + +# Give rocpd time to initialize +sleep 2 + +# Run workload +echo "Running workload: python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 10" +python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 10 +WORKLOAD_EXIT=$? +echo "" + +# Stop rocpd +echo "Stopping rocpd daemon..." +kill $ROCPD_PID 2>/dev/null || true +wait $ROCPD_PID 2>/dev/null || true +echo "" + +if [ $WORKLOAD_EXIT -eq 0 ]; then + echo "[SUCCESS] Workload completed" +else + echo "[FAILED] Workload failed with exit code $WORKLOAD_EXIT" +fi +echo "" + +echo "Generated files in $OUTPUT_DIR:" +ls -lh "$OUTPUT_DIR" +echo "" + +echo "rocpd output is a SQLite3 database file" +echo "" +echo "To view the database:" +echo " - Use VS Code SQLite Viewer extension" +echo " - rocprof-compute and rocprof-systems can consume it directly" +echo " - No official CLI tool is provided for viewing" +echo "" +echo "rocpd provides continuous profiling with minimal overhead" +echo "" diff --git a/MLExamples/TinyTransformer/version3_triton/README.md b/MLExamples/TinyTransformer/version3_triton/README.md index 24d5e8b2..73432da2 100644 --- a/MLExamples/TinyTransformer/version3_triton/README.md +++ b/MLExamples/TinyTransformer/version3_triton/README.md @@ -1,785 +1,66 @@ +# TinyTransformer Version 3: Triton Kernels -# Version 3: Triton Kernel Integration +Version 3 is where the progression changes materially. 
The custom Triton kernels reduce memory use, change the dominant kernel set, and move the training loop into a different performance regime. -README.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version3_triton` in the Training Examples repository +## What changed -**Objective**: Implement custom GPU kernels using Triton for maximum performance optimization +Relative to version 2, this version introduces: -**Expected Performance**: 2.0-3.5x speedup over baseline, 70-95% memory reduction +- Triton RMSNorm kernels +- Triton attention kernels +- a Triton-backed SwiGLU path +- a smaller, more concentrated kernel mix -**Learning Focus**: GPU kernel programming, memory access optimization, custom operator development +## Baseline run -## Overview - -Version 3 introduces custom Triton GPU kernels for the most performance-critical operations in the Tiny LLaMA model. Triton provides a Python-like syntax for writing GPU kernels while automatically handling low-level optimizations like memory coalescing and register allocation. - -### Key Optimizations - -1. **Custom RMSNorm Kernel**: Fused variance computation and normalization -2. **SwiGLU Kernel**: Combined gate/up projections with SiLU activation -3. **Flash Attention Kernel**: Memory-efficient attention with O(N) complexity -4. 
**Automatic Optimization**: Triton compiler optimizations for target hardware - -### Architecture Changes - -``` -Previous: PyTorch Operations → Multiple Kernel Launches → Memory Transfers -Current: Custom Triton Kernels → Single Optimized Launch → Minimal Memory Traffic -``` - -## Files and Structure - -``` -version3_triton/ -├── README.md # This file -├── tiny_llama_v3.py # Main model with Triton kernels -├── run_triton_profiling.py # Triton-specific profiling -├── run_rocprof_triton.sh # ROCProfiler for Triton kernels -├── exercises/ -│ ├── exercise1_triton_basics.md # Triton fundamentals -│ ├── exercise2_swiglu_optimization.md # SwiGLU kernel deep dive -│ └── exercise3_flash_attention.md # Flash Attention implementation -└── results/ # Generated profiling results -``` - -## Key Components and Triton Kernel Implementation - -### Mathematical Foundation of Triton Kernels - -Triton kernels optimize GPU computation by exploiting the memory hierarchy and parallelism patterns. For complete mathematical foundations, see [TINY_LLAMA_ARCHITECTURE.md](../TINY_LLAMA_ARCHITECTURE.md). - -#### Memory Hierarchy Optimization - -**GPU Memory Hierarchy:** -``` -Registers (fastest, ~40KB per SM) → Data reuse within thread -Shared Memory (~164KB per SM) → Data sharing within thread block -L1 Cache (~128KB per SM) → Automatic caching -L2 Cache (~8MB global) → Cross-SM data sharing -HBM (slowest, ~64GB) → Main memory -``` - -**Triton Optimization Strategy:** - -$$\text{Arithmetic Intensity} = \frac{\text{FLOPS}}{\text{Memory Bytes Accessed}}$$ - -Triton maximizes this ratio by: - -1. **Tiling**: Processing data in blocks that fit in fast memory -2. **Fusion**: Combining multiple operations to reuse data -3. **Vectorization**: Using SIMD instructions efficiently - -### 1. 
Triton RMSNorm Implementation - -#### RMSNorm Mathematical Analysis - -**Standard Implementation (PyTorch):** -```python -# Multiple kernel launches and memory accesses -variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True) # Kernel 1: Power + Reduction -rstd = torch.rsqrt(variance + eps) # Kernel 2: Reciprocal sqrt -output = (x * rstd).to(input_dtype) * weight # Kernel 3: Multiply + Scale - -# Total: 3 kernel launches, 3x memory bandwidth usage -``` - -**Triton Fused Implementation:** -```python -@triton.jit -def rmsnorm_kernel( - x_ptr, weight_ptr, output_ptr, - n_rows, n_cols, eps, - BLOCK_SIZE: tl.constexpr -): - """ - Fused RMSNorm kernel with optimal memory access patterns. - - Mathematical Operation: - output = (x / sqrt(mean(x^2) + eps)) * weight - - Memory Optimization: - - Single pass through input data - - Variance computation in registers - - Immediate normalization and scaling - """ - # Program ID determines which row this thread block processes - row_idx = tl.program_id(0) - - # Bounds checking - if row_idx >= n_rows: - return - - # Compute memory offsets for this row - x_row_ptr = x_ptr + row_idx * n_cols - output_row_ptr = output_ptr + row_idx * n_cols - - # Load weight vector (broadcast across all rows) - col_offsets = tl.arange(0, BLOCK_SIZE) - mask = col_offsets < n_cols - weight = tl.load(weight_ptr + col_offsets, mask=mask, other=0.0) - - # OPTIMIZATION 1: Streaming variance computation - variance = 0.0 - for block_start in range(0, n_cols, BLOCK_SIZE): - col_offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = col_offsets < n_cols - - # Load input block - x_block = tl.load(x_row_ptr + col_offsets, mask=mask, other=0.0) - - # Accumulate variance in registers (no memory writes!) 
- variance += tl.sum(x_block * x_block) - - # Compute RMS normalization factor - variance = variance / n_cols - rstd = 1.0 / tl.sqrt(variance + eps) - - # OPTIMIZATION 2: Fused normalization and scaling - for block_start in range(0, n_cols, BLOCK_SIZE): - col_offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = col_offsets < n_cols - - # Load input block again (cached in L1/L2) - x_block = tl.load(x_row_ptr + col_offsets, mask=mask, other=0.0) - weight_block = tl.load(weight_ptr + col_offsets, mask=mask, other=0.0) - - # Fused normalize + scale in single operation - output_block = x_block * rstd * weight_block - - # Store result - tl.store(output_row_ptr + col_offsets, output_block, mask=mask) -``` - -**Performance Analysis:** -```python -RMSNORM_PERFORMANCE = { - 'memory_access_pattern': { - 'pytorch': 'Multiple passes through data', - 'triton': 'Two passes (variance + normalize)', - 'bandwidth_reduction': '~50% fewer memory accesses' - }, - 'kernel_launches': { - 'pytorch': 3, # pow, mean, multiply - 'triton': 1, # fused operation - 'overhead_reduction': '67% fewer kernel launches' - }, - 'numerical_precision': { - 'pytorch': 'Multiple intermediate tensors', - 'triton': 'High-precision accumulation in registers', - 'stability': 'Better numerical stability' - } -} -``` - -### 2. 
Triton SwiGLU Implementation - -#### SwiGLU Fusion Analysis - -**Memory Access Pattern Optimization:** - -$$\begin{aligned} -\text{Standard SwiGLU}: & \quad \text{4 separate operations} \\ -\text{gate} &= xW_{\text{gate}} \quad \text{(GEMM 1)} \\ -\text{up} &= xW_{\text{up}} \quad \text{(GEMM 2)} \\ -\text{activated} &= \text{SiLU}(\text{gate}) \quad \text{(Elementwise 1)} \\ -\text{output} &= \text{activated} \odot \text{up} \quad \text{(Elementwise 2)} \\ -\text{Memory Reads}: & \quad 4 \times \text{input tensor} + 2 \times \text{weight matrices} -\end{aligned}$$ - -**Triton Fused SwiGLU:** - -$$\begin{aligned} -\text{Triton SwiGLU}: & \quad \text{Single fused operation} \\ -\text{output} &= \text{SiLU}(xW_{\text{gate}}) \odot (xW_{\text{up}}) \\ -\text{Memory Reads}: & \quad 1 \times \text{input tensor} + 2 \times \text{weight matrices} -\end{aligned}$$ - -#### Detailed Triton SwiGLU Kernel - -```python -@triton.jit -def swiglu_kernel( - x_ptr, gate_weight_ptr, up_weight_ptr, output_ptr, - batch_size, seq_len, hidden_dim, intermediate_dim, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr -): - """ - Fused SwiGLU kernel with optimal memory tiling. 
- - Computes: output = SiLU(x @ gate_weight) * (x @ up_weight) - - Tiling Strategy: - - M dimension: batch_size * seq_len - - K dimension: hidden_dim - - N dimension: intermediate_dim - """ - # Thread block coordinates - pid_m = tl.program_id(0) - pid_n = tl.program_id(1) - - # Compute tile offsets - m_offset = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - n_offset = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - - # Initialize accumulators for both gate and up projections - gate_acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - up_acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # OPTIMIZATION 1: Fused GEMM computation - for k in range(0, hidden_dim, BLOCK_SIZE_K): - k_offset = k + tl.arange(0, BLOCK_SIZE_K) - - # Load input tile (shared between gate and up computations) - x_tile = tl.load( - x_ptr + m_offset[:, None] * hidden_dim + k_offset[None, :], - mask=(m_offset[:, None] < batch_size * seq_len) & (k_offset[None, :] < hidden_dim) - ) - - # Load weight tiles - gate_weight_tile = tl.load( - gate_weight_ptr + k_offset[:, None] * intermediate_dim + n_offset[None, :], - mask=(k_offset[:, None] < hidden_dim) & (n_offset[None, :] < intermediate_dim) - ) - up_weight_tile = tl.load( - up_weight_ptr + k_offset[:, None] * intermediate_dim + n_offset[None, :], - mask=(k_offset[:, None] < hidden_dim) & (n_offset[None, :] < intermediate_dim) - ) - - # Fused matrix multiplication (data reuse in registers) - gate_acc += tl.dot(x_tile, gate_weight_tile) - up_acc += tl.dot(x_tile, up_weight_tile) - - # OPTIMIZATION 2: Fused SiLU activation and element-wise multiply - # SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x)) - gate_activated = gate_acc / (1.0 + tl.exp(-gate_acc)) - swiglu_output = gate_activated * up_acc - - # Store final result - output_mask = (m_offset[:, None] < batch_size * seq_len) & (n_offset[None, :] < intermediate_dim) - tl.store( - output_ptr + m_offset[:, None] * intermediate_dim + n_offset[None, :], - swiglu_output, - 
mask=output_mask - ) -``` - -**Triton SwiGLU Performance Characteristics:** -```python -SWIGLU_TRITON_BENEFITS = { - 'memory_efficiency': { - 'data_reuse': 'Input tensor loaded once, used for both gate and up', - 'register_usage': 'Intermediate results kept in registers', - 'bandwidth_reduction': '60-75% reduction in memory traffic' - }, - 'computational_efficiency': { - 'operation_fusion': 'GEMM + SiLU + elementwise in single kernel', - 'vectorization': 'Automatic SIMD instruction generation', - 'occupancy': 'Optimized thread block configuration' - }, - 'numerical_stability': { - 'precision': 'FP32 accumulation with FP16 storage', - 'activation_stability': 'Numerically stable SiLU implementation', - 'overflow_protection': 'Built-in overflow handling' - } -} -``` - -### 3. Triton Flash Attention Implementation - -#### Flash Attention Tiling Strategy - -**Memory Complexity Analysis:** - -$$\begin{aligned} -\text{Standard Attention Memory} &: O(B \times H \times S^{2}) \\ -\text{Flash Attention Memory} &: O(B \times H \times S) \\ -\text{SRAM Usage} &: O(B_r + B_c) \text{ where } B_r, B_c \text{ are tile sizes} \\ -\text{IO Complexity} &: O\left(\frac{S^{2}}{\sqrt{M}}\right) \text{ where } M \text{ is SRAM size} -\end{aligned}$$ - -#### Triton Flash Attention Kernel - -```python -@triton.jit -def flash_attention_kernel( - q_ptr, k_ptr, v_ptr, output_ptr, - batch_size, num_heads, seq_len, head_dim, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr -): - """ - Memory-efficient Flash Attention with tiled computation. - - Algorithm: - 1. Tile Q, K, V into blocks that fit in SRAM - 2. Compute attention scores incrementally - 3. Use online softmax for numerical stability - 4. 
Accumulate attention output progressively - """ - # Thread block IDs - batch_idx = tl.program_id(0) - head_idx = tl.program_id(1) - q_tile_idx = tl.program_id(2) - - # Compute base pointers for this batch and head - q_base = q_ptr + batch_idx * num_heads * seq_len * head_dim + head_idx * seq_len * head_dim - k_base = k_ptr + batch_idx * num_heads * seq_len * head_dim + head_idx * seq_len * head_dim - v_base = v_ptr + batch_idx * num_heads * seq_len * head_dim + head_idx * seq_len * head_dim - output_base = output_ptr + batch_idx * num_heads * seq_len * head_dim + head_idx * seq_len * head_dim - - # Load Q tile (stays in SRAM for entire computation) - q_offset_m = q_tile_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - q_mask_m = q_offset_m < seq_len - - q_tile = tl.load( - q_base + q_offset_m[:, None] * head_dim + tl.arange(0, head_dim)[None, :], - mask=q_mask_m[:, None] - ) - - # Initialize output accumulator and normalization factors - output_acc = tl.zeros((BLOCK_SIZE_M, head_dim), dtype=tl.float32) - row_max = tl.full((BLOCK_SIZE_M,), float('-inf'), dtype=tl.float32) - row_sum = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) - - # OPTIMIZATION 1: Tiled computation over K, V - for k_tile_idx in range(0, tl.cdiv(seq_len, BLOCK_SIZE_N)): - k_offset_n = k_tile_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - k_mask_n = k_offset_n < seq_len - - # Load K and V tiles - k_tile = tl.load( - k_base + k_offset_n[:, None] * head_dim + tl.arange(0, head_dim)[None, :], - mask=k_mask_n[:, None] - ) - v_tile = tl.load( - v_base + k_offset_n[:, None] * head_dim + tl.arange(0, head_dim)[None, :], - mask=k_mask_n[:, None] - ) - - # OPTIMIZATION 2: Compute attention scores in tiles - scores = tl.dot(q_tile, k_tile.T) * (1.0 / tl.sqrt(head_dim.to(tl.float32))) - - # Apply causal mask - causal_mask = q_offset_m[:, None] >= k_offset_n[None, :] - scores = tl.where(causal_mask, scores, float('-inf')) - - # OPTIMIZATION 3: Online softmax (numerically stable) - tile_max = tl.max(scores, 
axis=1) - new_row_max = tl.maximum(row_max, tile_max) - - # Rescale previous accumulated values - old_scale = tl.exp(row_max - new_row_max) - tile_scale = tl.exp(tile_max - new_row_max) - - # Update output accumulator - output_acc = output_acc * old_scale[:, None] - scores_softmax = tl.exp(scores - new_row_max[:, None]) * tile_scale[:, None] - output_acc += tl.dot(scores_softmax, v_tile) - - # Update normalization factors - row_sum = row_sum * old_scale + tl.sum(scores_softmax, axis=1) - row_max = new_row_max - - # Final normalization - output_final = output_acc / row_sum[:, None] - - # Store result - tl.store( - output_base + q_offset_m[:, None] * head_dim + tl.arange(0, head_dim)[None, :], - output_final, - mask=q_mask_m[:, None] - ) -``` - -**Flash Attention Performance Benefits:** -```python -FLASH_ATTENTION_TRITON = { - 'memory_efficiency': { - 'complexity': 'O(N) vs O(N^2) for standard attention', - 'sram_usage': 'Optimal SRAM utilization with tiling', - 'hbm_access': 'Minimized high-bandwidth memory access' - }, - 'computational_efficiency': { - 'online_softmax': 'Numerically stable incremental computation', - 'tiled_gemm': 'Optimal matrix multiplication blocking', - 'kernel_fusion': 'Single kernel for entire attention computation' - }, - 'scalability': { - 'sequence_length': 'Linear scaling with sequence length', - 'batch_processing': 'Efficient batched computation', - 'multi_head': 'Parallelized across attention heads' - } -} -``` - -### Advanced Triton Optimization Techniques - -#### Block Size Tuning - -```python -def auto_tune_block_sizes(operation_type, input_shape, device_properties): - """ - Automatically tune block sizes for optimal performance. 
- """ - tuning_space = { - 'rmsnorm': { - 'block_sizes': [64, 128, 256, 512, 1024], - 'criteria': 'Memory bandwidth utilization', - 'constraints': 'Register usage < 64KB' - }, - 'swiglu': { - 'block_sizes': [(32, 64, 32), (64, 64, 64), (128, 32, 64)], - 'criteria': 'Arithmetic intensity maximization', - 'constraints': 'Shared memory < 164KB' - }, - 'flash_attention': { - 'block_sizes': [(64, 64), (128, 64), (64, 128)], - 'criteria': 'SRAM utilization efficiency', - 'constraints': 'Memory coalescing requirements' - } - } - - return optimize_for_hardware(tuning_space[operation_type], device_properties) -``` - -#### Memory Coalescing Optimization - -```python -# Optimal memory access patterns for AMD GPUs -MEMORY_ACCESS_PATTERNS = { - 'coalesced_access': { - 'pattern': 'Consecutive threads access consecutive memory addresses', - 'bandwidth': '100% of peak memory bandwidth', - 'implementation': 'Proper stride patterns in Triton kernels' - }, - 'strided_access': { - 'pattern': 'Regular stride pattern across memory', - 'bandwidth': '50-80% of peak memory bandwidth', - 'optimization': 'Adjust block sizes to match stride' - }, - 'random_access': { - 'pattern': 'Irregular memory access pattern', - 'bandwidth': '10-30% of peak memory bandwidth', - 'mitigation': 'Data reordering and blocking strategies' - } -} -``` - -## Quick Start - -### 1. Environment Setup - -Ensure Triton is installed in your environment: +Load the required modules: ```bash -# Should already be installed from setup/ -pip install triton +module load pytorch rocm triton ``` -Verify Triton installation: - -```python -import triton -print(f"Triton version: {triton.__version__}") -``` - -### 2. 
Run the Model - -Execute the optimized model: +Run: ```bash -cd version3_triton/ -python3 tiny_llama_v3.py -``` - -**Expected Output:** -``` -=== Triton Kernel Model Benchmark === -Model size: XXX.X M parameters -Input shape: torch.Size([4, 512]) -Average forward pass time: XX.XX ms -Throughput: XXXX tokens/second -Memory allocated: X.XX GB -Estimated FLOPS/second: XX.XX TFLOPS +python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 10 ``` -### 3. Profile Performance +Example output from one validated run: -Run comprehensive profiling: - -```bash -# Triton-specific profiling -python3 run_triton_profiling.py -``` - - - -### 4. Analyze Results - -Check generated results: - -```bash -ls profiling_results/ -cat profiling_results/triton_summary_report.md -``` - - - -### Key Metrics to Monitor - -1. **Kernel Performance** - - Execution time per kernel - - Launch overhead - - Occupancy rates - -2. **Memory Utilization** - - Bandwidth efficiency - - Cache hit rates - - Memory access patterns - -3. **Compute Efficiency** - - VALU utilization - - Arithmetic intensity - - Roofline performance - -## Troubleshooting - -### Common Issues - -1. **Triton Not Found** - ```bash - pip install triton - # Or check environment setup - ``` - -2. **Kernel Compilation Errors** - - Verify GPU compatibility - - Check CUDA/ROCm installation - - Review tensor dimensions - -3. **Performance Regression** - - Ensure proper warmup - - Check block size settings - - Verify input data layout - -4. **Memory Errors** - - Reduce batch size or sequence length - - Check for memory leaks - - Monitor peak memory usage - -### Performance Debugging -1. **Profile Each Kernel Individually** - ```python - # Isolate kernel performance - triton_rmsnorm = TritonRMSNorm(dim) - # Benchmark just this component - ``` +That is the first large jump in the progression. The step time falls sharply and the memory footprint drops by more than half relative to the baseline. -2. 
**Compare Block Sizes** - ```python - # Test different configurations - for block_size in [64, 128, 256, 512]: - # Measure performance - ``` +## Profiling workflow -3. **Memory Pattern Analysis** - ```python - # Check memory access efficiency - torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) - ``` +Use the same scripts as the earlier versions: -## Next Steps +- `./get_hotspots.sh` +- `./get_trace.sh` +- `./get_counters.sh` +- `./get_rocprof_compute.sh` +- `./get_rocprof_sys.sh` -After completing Version 3: +Start with `./get_hotspots.sh`. The first thing to check is whether the dominant kernel set is now smaller and heavier than in version 1. Then use `./get_trace.sh` and `./get_counters.sh` to confirm that the trace is less fragmented and the dispatch count is lower. -1. **Review Performance Gains**: Compare with previous versions -2. **Understand Optimization Principles**: Kernel design patterns -3. **Prepare for Version 4**: Ultra-fused implementations +Example hotspot plot from the validated container run: -Version 4 will combine all optimizations into ultra-fused kernels that process entire transformer blocks in minimal kernel launches. +![TinyTransformer V3 hotspot summary from validated container run](../images/tinytransformer_version3_hotspots.png) -## Resources +If `rocprof-compute` is supported on the current GPU, version 3 is also a good point to inspect block-level metrics because the set of important kernels is smaller than in the baseline. 
-### Documentation -- [Triton Language Tutorial](https://triton-lang.org/main/getting-started/tutorials/index.html) -- [GPU Architecture Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/Programming-Guides.html) -- [ROCm Profiler Documentation](https://rocmdocs.amd.com/en/latest/ROCm_Tools/ROCm-Tools.html) +## Workshop note -### Papers and References -- [Flash Attention Paper](https://arxiv.org/abs/2205.14135) -- [Triton: A Language for AI Kernel Programming](https://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf) -- [Roofline Model for GPU Performance](https://crd.lbl.gov/departments/computer-science/PAR/research/roofline/) +Use [`README_WORKSHOP.md`](README_WORKSHOP.md) for the short lab sequence. The staged debugging exercise under [`exercises/performance_debugging`](exercises/performance_debugging) is useful when the goal is to understand how the final optimized path was reached. -### AMD ROCm Resources -- [ROCm Documentation](https://rocmdocs.amd.com/) -- [HIP Programming Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP-GUIDE.html) -- [Performance Optimization Tips](https://rocmdocs.amd.com/en/latest/Programming_Guides/Opencl-programming-guide.html) +## References +- comparison across versions: [`../VERSION_COMPARISON.md`](../VERSION_COMPARISON.md) +- Triton tutorials: https://triton-lang.org/main/getting-started/tutorials/index.html +- Perfetto UI: https://ui.perfetto.dev/ diff --git a/MLExamples/TinyTransformer/version3_triton/README_WORKSHOP.md b/MLExamples/TinyTransformer/version3_triton/README_WORKSHOP.md index faea4dc0..fdda519c 100644 --- a/MLExamples/TinyTransformer/version3_triton/README_WORKSHOP.md +++ b/MLExamples/TinyTransformer/version3_triton/README_WORKSHOP.md @@ -1,394 +1,86 @@ +# TinyTransformer Triton Workshop Guide -# Version 3: Triton Kernel Integration - Workshop Edition +The main reference for this directory is [`README.md`](README.md). 
This note keeps the Triton version in a short lab order. -`README_WORKSHOP.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version3_triton` in the Training Examples repository +## Preparation -**Objective**: Implement custom GPU kernels using Triton for maximum performance optimization - -**Actual Performance**: **5.5x speedup** over baseline, **46% memory reduction** - -**Learning Focus**: GPU kernel programming, performance debugging, hybrid optimization strategies - ---- - -## Quick Start (5 minutes) +Load the required modules: ```bash -cd version3_triton/ - -# Run the optimized version -python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Expected output: -# Loss: 7.0108 (correct!) -# Speed: 2065.0 samples/sec (5.5x faster than V1!) -# Memory: 281.8 MB (46% less than V1's 522 MB!) -``` - ---- - -## Performance Results - -### Actual Measurements (AMD MI325X, ROCm 6.4.4) - -**Test Configuration:** Batch=8, SeqLen=128, Hidden=512, Layers=8, Heads=8 - -| Metric | V1 Baseline | V3 Optimized | Improvement | -|--------|-------------|--------------|-------------| -| **Training Speed** | 372.9 samples/sec | **2065.0 samples/sec** | **5.5x faster** | -| **Batch Time** | 21.7 ms | **3.9 ms** | **5.6x faster** | -| **Forward Pass** | 10.8 ms | **3.2 ms** | **3.4x faster** | -| **Backward Pass** | 9.2 ms | **0.3 ms** | **30x faster** | -| **Memory Usage** | 522.3 MB | **281.8 MB** | **46% reduction** | -| **Throughput** | 47,735 tokens/sec | **264,320 tokens/sec** | **5.5x faster** | - ---- - -## Key Concepts - -### What is Triton? - -**Triton** is a Python-based GPU programming language that makes it easy to write high-performance GPU kernels without dealing with low-level CUDA/HIP complexity. 
- -**Why Use Triton?** -- Python-like syntax (easier than CUDA/HIP) -- Automatic memory coalescing and optimization -- Works on both NVIDIA and AMD GPUs -- Great for memory-bound operations and fusion - -**When NOT to Use Triton?** -- Large matrix multiplications (use PyTorch/rocBLAS instead) -- Operations already well-optimized in PyTorch -- Compute-bound ops where BLAS libraries excel - ---- - -## Optimizations Applied in V3 - -### 1. Flash Attention (Triton Kernel) -**What it does:** Memory-efficient attention using online softmax - -**PyTorch Standard Attention:** -```python -# Materializes full attention matrix: O(N²) memory -scores = Q @ K.T # [batch, heads, seq, seq] - HUGE! -attn = softmax(scores) -output = attn @ V -``` - -**Flash Attention:** -```python -# Online computation: O(N) memory -# Processes attention in blocks, never materializes full matrix -# Uses tiled computation with recomputation for backward pass -``` - -**Result:** - -- 46% memory reduction (282 MB vs 522 MB) -- Enables longer sequences -- Slightly faster forward pass - -### 2. RMSNorm (Triton Kernel) -**What it does:** Fused variance computation + normalization - -**Before (PyTorch):** 3 separate kernels -```python -variance = x.pow(2).mean(-1, keepdim=True) # Kernel 1 -rstd = torch.rsqrt(variance + eps) # Kernel 2 -output = (x * rstd) * weight # Kernel 3 +module load pytorch rocm triton ``` -**After (Triton):** Single fused kernel -```python -# All operations in one kernel launch -# Variance computed in registers -# Immediate normalization and scaling -``` - -**Result:** - -- 3x fewer kernel launches -- Better cache utilization -- Reduced memory bandwidth +Run: -### 3. Hybrid SwiGLU Strategy -**Critical Lesson:** Don't use custom kernels for everything! - -**Initial (Broken) Approach:** -```python -# Used Triton kernel for matrix multiply - BAD IDEA! -# Launched 2,097,152 threads (batch × seq × d_ff) -# Each thread did manual reduction - VERY SLOW! 
-# Result: 25.5ms forward pass (2.4x SLOWER than V1!) -``` - -**Optimized (Hybrid) Approach:** -```python -# Use PyTorch for matrix multiplies (rocBLAS optimized) -gate = self.gate_proj(x) # rocBLAS -up = self.up_proj(x) # rocBLAS - -# Use PyTorch for activation (already fused) -gate_activated = F.silu(gate) * up - -# Use PyTorch for final projection -output = self.down_proj(intermediate) # rocBLAS -``` - -**Result:** -- 8x forward pass speedup (25.5ms → 3.2ms) -- **Key insight:** Use the best tool for each operation - -### 4. Tensor Contiguity (Critical!) -**The Bug:** Non-contiguous tensors after `repeat_interleave` for GQA - -**Before:** -```python -k = k.repeat_interleave(n_rep, dim=1) # Creates non-contiguous tensor! -v = v.repeat_interleave(n_rep, dim=1) # Bad memory layout for Triton! -``` - -**After:** -```python -k = k.repeat_interleave(n_rep, dim=1).contiguous() # Fix memory layout -v = v.repeat_interleave(n_rep, dim=1).contiguous() # Now Triton-friendly! +```bash +python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 10 ``` -**Result:** +From one validated run, the reference numbers were: -- 20x speedup! (15.2 → 310.8 samples/sec) -- Triton kernels depend on contiguous memory for efficient access -- Always check tensor contiguity before passing to custom kernels +- `829.9 samples/sec` +- `9.6 ms` average batch time +- `193.8 MB` peak memory -### 5. Proper Weight Initialization -**The Bug:** Default `nn.Embedding` uses `Normal(0, 1)` - too large! +## Exercise 1: Compare against the baseline -**Before:** -```python -# No weight initialization -# Embedding weight ~ Normal(0, 1) -# With dim=1024, logits have std ≈ √1024 ≈ 32 -# Result: Logits explode to hundreds, loss = 942! 
-``` - -**After:** -```python -def _init_weights(self): - for module in self.modules(): - if isinstance(module, nn.Embedding): - torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) -``` +Place the version 3 numbers next to the version 1 baseline before you start profiling. That comparison is the main point of the exercise. -**Result:** -- Loss: 942 → 7.0 -- Critical for tied weights (embedding + lm_head) -- Small std prevents exploding gradients +## Exercise 2: Hotspot list ---- - -## Performance Debugging Exercise - -Want to see the complete optimization journey? Try our hands-on debugging exercise: +Run: ```bash -cd exercises/performance_debugging/ - -# Read the guide -cat README.md - -# Run all 5 stages of optimization with profiling -./run_all_stages.sh - -# This shows the complete journey: -# Stage 1: Broken (loss=942) - missing weight init -# Stage 2: Slow (15 samp/s) - non-contiguous tensors -# Stage 3: Better (311 samp/s) - added .contiguous() -# Stage 4: Same (306 samp/s) - accurate timing revealed issue -# Stage 5: Optimal (2065 samp/s) - hybrid kernel strategy! +./get_hotspots.sh ``` -**What you'll learn:** - -- How to diagnose incorrect model behavior (exploding loss) -- How to identify performance bottlenecks with profiling -- When to use custom kernels vs. optimized libraries -- How memory layout affects GPU performance -- Systematic debugging methodology +Use this to check whether the dominant kernel set is smaller and more concentrated than in the baseline. ---- +## Exercise 3: Runtime trace - - -## Key Learnings - -### 1. Correctness First, Performance Second - -- Stage 1 had broken loss (942 instead of 7) -- No point optimizing a broken model! -- Always validate correctness before optimizing - -### 2. Memory Layout Matters - -- Non-contiguous tensors killed performance (20x slower!) -- Always `.contiguous()` before Triton kernels -- Check with `tensor.is_contiguous()` +Ask: -### 3. 
Hybrid Optimization Wins +- does the step consist of fewer, heavier kernels +- is the attention region easier to recognize +- are there fewer visible launch gaps -- Don't write custom kernels for everything -- Use Triton for: memory-bound ops, fusion opportunities -- Use PyTorch/BLAS for: large matrix multiplies -- Profile to decide! +## Exercise 4: Full kernel trace -### 4. Measure Accurately +Run: -- GPU operations are asynchronous -- Always `torch.cuda.synchronize()` for accurate timing -- Without sync, timings are meaningless - -### 5. Iterative Debugging - -- Fix one issue at a time -- Re-measure after each fix -- Profile to identify next bottleneck -- Repeat until optimal - ---- - -## Files Overview - -``` -version3_triton/ - README_WORKSHOP.md # This file - tiny_llama_v3.py # Main optimized model - exercises/ - performance_debugging/ # Hands-on debugging exercise - README.md # Complete optimization journey - run_all_stages.sh # Run all 5 stages with profiling - WORKSHOP_GUIDE.md # Quick reference guide - exercise1_triton_basics.md # Triton fundamentals - exercise2_swiglu_optimization.md # SwiGLU deep dive - exercise3_flash_attention.md # Flash Attention implementation - triton_profiles/ # Generated profiling data -``` - ---- - -## Next Steps - -### After Running V3 - -1. **Compare with V1:** ```bash -# Run V1 for comparison -cd ../version1_pytorch_baseline/ -python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Compare outputs -# V1: 372.9 samp/s, 522.3 MB -# V3: 2065.0 samp/s, 281.8 MB (5.5x faster, 46% less memory!) +./get_counters.sh ``` -2. **Try V4 (Ultra-Fused):** -```bash -cd ../version4_pytorch_sdpa/ -python tiny_llama_v4.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Expected: ~8x faster than V1! -``` - -3. 
**Deep Dive into Profiling:** -```bash -cd exercises/performance_debugging/ -./run_all_stages.sh +Record: -# Analyze the profiling CSV files -# Compare kernel execution times -# Understand the optimization journey -``` - ---- +- dispatch count +- number of unique kernels +- top kernels by total time -## Common Issues and Solutions +## Exercise 5: Performance debugging path -### Issue 1: ImportError: No module named 'triton' -```bash -pip install triton -``` +If time permits, run the staged exercise: -### Issue 2: RuntimeError: CUDA not available ```bash -# Verify ROCm installation -rocminfo - -# Check PyTorch sees GPU -python -c "import torch; print(torch.cuda.is_available())" +cd exercises/performance_debugging +./run_all_stages.sh ``` -### Issue 3: Loss is not ~7.0 - -- Check weight initialization is enabled -- Verify model architecture matches V1 -- Check for tensor shape mismatches - -### Issue 4: Performance slower than expected - -- Ensure tensors are contiguous: `.contiguous()` -- Check CUDA synchronization for accurate timing -- Profile to identify bottleneck kernel -- Verify using optimized SwiGLU (hybrid approach) - ---- - -## Additional Resources - -- **Triton Documentation:** https://triton-lang.org/ -- **Flash Attention Paper:** https://arxiv.org/abs/2205.14135 -- **ROCm Profiling Guide:** https://rocm.docs.amd.com/projects/rocprofiler/ -- **Performance Debugging Guide:** exercises/performance_debugging/README.md - ---- - -## Summary - -**V3 achieves 5.5x speedup through:** - -1. Flash Attention (Triton) - 46% memory reduction -2. RMSNorm (Triton) - Fused kernel -3. Hybrid SwiGLU - Use rocBLAS for matmul -4. Tensor contiguity - Critical for Triton performance -5. Proper initialization - Correctness first! - -**Key insight:** Best performance comes from using the right tool for each operation - not from using custom kernels everywhere! 
- -**Ready to debug?** Start with `cd exercises/performance_debugging/` +This is useful because it shows that the final performance comes from a sequence of implementation changes, not from a single switch. +## Closing note +Version 3 is usually the clearest point in the progression to discuss why kernel specialization changes both performance and profiler output. For a short lab, Exercises 1 through 4 are enough. diff --git a/MLExamples/TinyTransformer/version3_triton/exercises/exercise1_triton_basics.md b/MLExamples/TinyTransformer/version3_triton/exercises/exercise1_triton_basics.md index 9ceb42ab..9c23cc48 100644 --- a/MLExamples/TinyTransformer/version3_triton/exercises/exercise1_triton_basics.md +++ b/MLExamples/TinyTransformer/version3_triton/exercises/exercise1_triton_basics.md @@ -1,30 +1,12 @@ - ## Exercise 1: Understanding Triton Kernel Basics -`exercise1_triton_basics.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version3_triton/exercises` in the Training Examples repository - -**Objective**: Learn the fundamentals of Triton GPU programming and analyze basic kernel performance. - -**Time**: 45 minutes - -**Prerequisites**: Completed Version 1 and Version 2 exercises - -### Background +**Objective**: Learn Triton GPU programming fundamentals and analyze basic kernel performance. -Triton is a language and compiler for writing custom GPU kernels. It provides: +**Time**: 45 minutes | **Prerequisites**: Completed Version 1 and Version 2 exercises -- Python-like syntax for GPU programming -- Automatic memory coalescing and optimization -- Block-level programming model -- Integration with PyTorch +### Part A: Kernel Structure Analysis -In this exercise, you'll analyze the basic structure of Triton kernels and understand their performance characteristics. 
- -### Part A: Kernel Structure Analysis (15 minutes) - -#### Step 1: Examine the RMSNorm Kernel - -Open `tiny_llama_v3.py` and locate the `rmsnorm_kernel` function: +Examine the `rmsnorm_kernel` in `tiny_llama_v3.py`: ```python @triton.jit @@ -36,212 +18,48 @@ def rmsnorm_kernel( ): ``` -**Questions to Answer:** - -1. **Pointer Management**: How does Triton handle memory pointers compared to CUDA? -2. **Block Processing**: What is the role of `BLOCK_SIZE` in this kernel? -3. **Constexpr Usage**: Why are `eps` and `BLOCK_SIZE` marked as `tl.constexpr`? -4. **Memory Access Pattern**: How does the kernel ensure coalesced memory access? - -#### Step 2: Analyze Memory Access Patterns - -Look at the variance computation loop: - -```python -for i in range(0, n_elements, BLOCK_SIZE): - offsets = i + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x_vals = tl.load(x_ptr + row_idx * n_elements + offsets, mask=mask, other=0.0) - variance += tl.sum(x_vals * x_vals, axis=0) -``` - -**Analysis Tasks:** - -1. **Memory Coalescing**: Explain how the `offsets` calculation ensures coalesced memory access -2. **Boundary Handling**: What does the `mask` parameter accomplish? -3. **Reduction Pattern**: How does this implement an efficient parallel reduction? - -#### Step 3: Compare with PyTorch Implementation - -Compare the Triton RMSNorm with the PyTorch version: - -```python -def pytorch_rmsnorm(x): - variance = x.pow(2).mean(dim=-1, keepdim=True) - x = x * torch.rsqrt(variance + eps) - return x * weight -``` - -**Discussion Points:** +**Questions:** +1. How does Triton handle memory pointers compared to CUDA? +2. What is the role of `BLOCK_SIZE`? +3. Why are `eps` and `BLOCK_SIZE` marked as `tl.constexpr`? -1. **Kernel Fusion**: How does Triton fuse operations that PyTorch keeps separate? -2. **Memory Efficiency**: What memory advantages does the Triton version have? -3. **Numerical Precision**: Are there any precision considerations? 
+### Part B: Performance Profiling -### Part B: Performance Profiling (20 minutes) - -#### Step 4: Run Basic Profiling - -Execute the Triton profiling script: +Run the Triton profiling script: ```bash cd version3_triton/ python3 run_triton_profiling.py ``` -**Expected Output Analysis:** - -``` -=== Triton Kernel Performance Analysis === - -1. RMSNorm Kernel Profiling - Triton RMSNorm: X.XXX ms - PyTorch RMSNorm: Y.YYY ms - Speedup: Z.ZZx - Max error: E.EEe-XX -``` - -**Performance Questions:** - -1. **Speedup Analysis**: What speedup did you achieve? Is it consistent with expectations? -2. **Accuracy Check**: What is the maximum error between implementations? Is this acceptable? -3. **Memory Usage**: How does memory usage compare between the implementations? - -#### Step 5: Analyze ROCProfiler Results - -Run the ROCProfiler analysis: +Run ROCProfiler analysis: ```bash -chmod +x run_rocprof_triton.sh ./run_rocprof_triton.sh -``` - -Examine the generated results: - -```bash -ls rocprof_results/ cat rocprof_results/triton_analysis_summary.md ``` -**Profiling Analysis:** - -1. **Kernel Launch Overhead**: What is the launch overhead for Triton kernels? -2. **Memory Bandwidth**: What memory bandwidth utilization are you achieving? -3. **GPU Utilization**: How well are you utilizing the available compute units? - -### Part C: Block Size Optimization (10 minutes) - -#### Step 6: Experiment with Block Sizes - -Modify the `rmsnorm_kernel` call in `TritonRMSNorm.forward()`: - -```python -# Try different block sizes -for block_size in [64, 128, 256, 512, 1024]: - rmsnorm_kernel[grid]( - x_reshaped, self.weight, output, - dim, self.eps, BLOCK_SIZE=block_size - ) -``` - -**Optimization Tasks:** - -1. **Performance Testing**: Measure execution time for each block size -2. **Memory Analysis**: How does block size affect memory access patterns? -3. **Occupancy Impact**: What's the relationship between block size and GPU occupancy? 
- -#### Step 7: Memory Access Analysis - -Create a simple memory access pattern analyzer: - -```python -def analyze_memory_pattern(): - # Simulate memory access pattern - dim = 2048 - block_sizes = [64, 128, 256, 512] - - for block_size in block_sizes: - total_blocks = (dim + block_size - 1) // block_size - print(f"Block size {block_size}: {total_blocks} blocks") - - # Analyze memory transactions - elements_per_transaction = min(block_size, 32) # Typical coalescing width - transactions = (block_size + elements_per_transaction - 1) // elements_per_transaction - print(f" Memory transactions per block: {transactions}") - print(f" Total transactions: {total_blocks * transactions}") -``` - -**Memory Analysis Questions:** +### Part C: Block Size Optimization -1. **Coalescing Efficiency**: Which block size provides the best memory coalescing? -2. **Transaction Overhead**: How does the number of memory transactions scale? -3. **Cache Utilization**: What's the impact on L1/L2 cache utilization? +Experiment with block sizes (64, 128, 256, 512, 1024) and measure: +- Execution time +- Memory transactions +- GPU occupancy -### Exercise Results - -Document your findings: - -#### Performance Results Table +### Results Template | Metric | Triton RMSNorm | PyTorch RMSNorm | Speedup | |--------|----------------|------------------|---------| | Execution Time (ms) | | | | | Memory Usage (MB) | | | | -| Bandwidth (GB/s) | | | | - -#### Block Size Analysis - -| Block Size | Execution Time (ms) | Memory Transactions | GPU Occupancy | -|------------|-------------------|-------------------|---------------| -| 64 | | | | -| 128 | | | | -| 256 | | | | -| 512 | | | | -| 1024 | | | | - -#### Key Insights - -1. **Best Block Size**: _____ -2. **Primary Performance Bottleneck**: _____ -3. **Memory Efficiency**: _____ -4. **Optimization Opportunities**: _____ - -### Discussion Questions - -1. **Triton vs CUDA**: How does Triton kernel development compare to writing CUDA kernels? -2. 
**Automatic Optimizations**: What optimizations does Triton perform automatically? +### Common Issues -3. **Performance Portability**: How portable are Triton kernels across different GPU architectures? +- **Compilation Errors**: Check tensor shapes and constexpr values +- **Performance Regression**: Verify block size tuning and proper warmup +- **Numerical Differences**: Small FP precision differences are normal -4. **Integration Complexity**: What are the challenges of integrating Triton kernels into PyTorch models? - -### Next Steps - -In Exercise 2, you'll dive deeper into the SwiGLU kernel implementation and learn about: -- Multi-dimensional memory access patterns -- Kernel fusion strategies -- Advanced optimization techniques -- Debugging Triton kernels - -### Common Issues and Solutions - -#### Issue 1: Compilation Errors -**Problem**: Triton kernel fails to compile -**Solution**: Check that all tensor shapes are compatible and constexpr values are properly defined - -#### Issue 2: Performance Regression -**Problem**: Triton kernel is slower than PyTorch -**Solution**: Verify block size tuning and memory access patterns; ensure proper warmup - -#### Issue 3: Numerical Differences -**Problem**: Results don't match PyTorch exactly -**Solution**: Check floating-point precision and reduction order; small differences are normal - -### Additional Resources +### Resources - [Triton Documentation](https://triton-lang.org/main/index.html) - [Triton Tutorials](https://triton-lang.org/main/getting-started/tutorials/index.html) -- [GPU Memory Coalescing Guide](https://developer.nvidia.com/blog/how-access-global-memory-efficiently-cuda-c-kernels/) -- [ROCm Performance Guidelines](https://rocmdocs.amd.com/) - diff --git a/MLExamples/TinyTransformer/version3_triton/exercises/exercise2_swiglu_optimization.md b/MLExamples/TinyTransformer/version3_triton/exercises/exercise2_swiglu_optimization.md index 0607ab3e..7eca4afc 100644 --- 
a/MLExamples/TinyTransformer/version3_triton/exercises/exercise2_swiglu_optimization.md +++ b/MLExamples/TinyTransformer/version3_triton/exercises/exercise2_swiglu_optimization.md @@ -1,30 +1,16 @@ - ## Exercise 2: SwiGLU Kernel Optimization -`exercise2_swiglu_optimization.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version3_triton` in the Training Examples repository - -**Objective**: Master advanced Triton kernel development through SwiGLU optimization and learn multi-dimensional memory access patterns. - -**Time**: 60 minutes +**Objective**: Master advanced Triton kernel development through SwiGLU optimization. -**Prerequisites**: Completed Exercise 1 +**Time**: 60 minutes | **Prerequisites**: Completed Exercise 1 ### Background -The SwiGLU (Swish-Gated Linear Unit) is a key component in modern transformer architectures. It combines: - -- Gate projection with SiLU activation -- Up projection -- Element-wise multiplication -- Down projection - -Traditional implementations require multiple kernel launches and intermediate storage. Our Triton kernel fuses the gate and up projections with activation, reducing memory traffic and improving performance. +SwiGLU (Swish-Gated Linear Unit) combines gate projection with SiLU activation, up projection, element-wise multiplication, and down projection. Our Triton kernel fuses the gate and up projections with activation. -### Part A: SwiGLU Kernel Deep Dive (20 minutes) +### Part A: SwiGLU Kernel Analysis -#### Step 1: Analyze the Kernel Structure - -Examine the `swiglu_kernel` in `tiny_llama_v3.py`: +Examine `swiglu_kernel` in `tiny_llama_v3.py`: ```python @triton.jit @@ -37,425 +23,40 @@ def swiglu_kernel( ): ``` -**Analysis Questions:** - -1. **Multi-dimensional Blocking**: Why does this kernel use three different block sizes? -2. **Memory Layout**: How are the tensors laid out in memory (batch, sequence, feature dimensions)? -3. **Compute Intensity**: What is the arithmetic intensity of this kernel? 
- -#### Step 2: Understand the Computation Flow - -Follow the kernel execution: - -```python -# Load input -input_offset = batch_idx * seq_len * d_model + seq_idx * d_model -x_block = tl.load(x_ptr + input_offset + tl.arange(0, d_model)) - -# Compute projections -for i in range(0, d_model, BLOCK_SIZE_D): - x_vals = tl.load(x_ptr + input_offset + i + tl.arange(0, BLOCK_SIZE_D)) - gate_weights = tl.load(gate_weight_ptr + d_idx * d_model + i + tl.arange(0, BLOCK_SIZE_D)) - up_weights = tl.load(up_weight_ptr + d_idx * d_model + i + tl.arange(0, BLOCK_SIZE_D)) - - gate_sum += tl.sum(x_vals * gate_weights) - up_sum += tl.sum(x_vals * up_weights) - -# Apply activation -gate_activated = gate_sum / (1.0 + tl.exp(-gate_sum)) -result = gate_activated * up_sum -``` - -**Computation Analysis:** - -1. **Memory Reuse**: How does the kernel maximize input data reuse? -2. **Reduction Pattern**: Explain the dot product computation strategy -3. **Activation Fusion**: How is the SiLU activation integrated efficiently? 
- -#### Step 3: Memory Access Pattern Visualization - -Create a visualization tool for memory access patterns: - -```python -def visualize_swiglu_access_pattern(): - """Visualize memory access patterns for SwiGLU kernel.""" - - # Example dimensions - batch_size, seq_len, d_model, d_ff = 2, 4, 8, 12 - - print("SwiGLU Memory Access Pattern Analysis") - print("=" * 50) - - print(f"Tensor shapes:") - print(f" Input (x): [{batch_size}, {seq_len}, {d_model}]") - print(f" Gate weights: [{d_ff}, {d_model}]") - print(f" Up weights: [{d_ff}, {d_model}]") - print(f" Output: [{batch_size}, {seq_len}, {d_ff}]") - - print(f"\nTotal elements:") - print(f" Input: {batch_size * seq_len * d_model}") - print(f" Weights: {2 * d_ff * d_model}") - print(f" Output: {batch_size * seq_len * d_ff}") - - # Analyze memory traffic - input_reads = batch_size * seq_len * d_model * d_ff # Each input element read d_ff times - weight_reads = 2 * d_ff * d_model * batch_size * seq_len # Weight reuse across batch/seq - output_writes = batch_size * seq_len * d_ff - - total_bytes = (input_reads + weight_reads + output_writes) * 4 # float32 - - print(f"\nMemory traffic analysis:") - print(f" Input reads: {input_reads}") - print(f" Weight reads: {weight_reads}") - print(f" Output writes: {output_writes}") - print(f" Total memory traffic: {total_bytes / 1e6:.2f} MB") - - # Compute to memory ratio - flops = 2 * batch_size * seq_len * d_model * d_ff * 2 # 2 projections, 2 ops per MAC - arithmetic_intensity = flops / total_bytes * 4 # ops per byte - - print(f" FLOPs: {flops}") - print(f" Arithmetic intensity: {arithmetic_intensity:.2f} ops/byte") - -# Run the analysis -visualize_swiglu_access_pattern() -``` - -### Part B: Performance Optimization (25 minutes) - -#### Step 4: Block Size Tuning - -Create a systematic block size tuning script: - -```python -import time -import torch -from tiny_llama_v3 import TritonSwiGLU - -def tune_swiglu_block_sizes(): - """Tune block sizes for optimal SwiGLU performance.""" 
- - device = torch.device('cuda') - batch_size, seq_len, d_model = 4, 512, 2048 - hidden_dim = int(2.67 * d_model) - - # Test different block size combinations - block_configs = [ - (1, 1, 32), # Small blocks - (1, 1, 64), # Medium blocks - (1, 1, 128), # Large blocks - (1, 2, 64), # Sequence blocking - (2, 1, 64), # Batch blocking - (1, 1, 256), # Extra large feature blocks - ] - - x = torch.randn(batch_size, seq_len, d_model, device=device) - - results = [] - - for b_block, s_block, d_block in block_configs: - print(f"\nTesting block configuration: B={b_block}, S={s_block}, D={d_block}") - - # Create modified SwiGLU with specific block sizes - swiglu = TritonSwiGLU(d_model, hidden_dim).to(device) - - # Warmup - for _ in range(10): - _ = swiglu(x) - torch.cuda.synchronize() - - # Benchmark - start_time = time.time() - for _ in range(100): - output = swiglu(x) - torch.cuda.synchronize() - - avg_time = (time.time() - start_time) / 100 - - results.append({ - 'config': (b_block, s_block, d_block), - 'time_ms': avg_time * 1000, - 'throughput': batch_size * seq_len / avg_time - }) - - print(f" Time: {avg_time*1000:.3f} ms") - print(f" Throughput: {batch_size * seq_len / avg_time:.0f} tokens/s") - - # Find best configuration - best_result = min(results, key=lambda x: x['time_ms']) - print(f"\nBest configuration: {best_result['config']}") - print(f"Best time: {best_result['time_ms']:.3f} ms") - - return results - -# Run block size tuning -block_results = tune_swiglu_block_sizes() -``` - -#### Step 5: Memory Layout Optimization - -Experiment with different memory layouts: - -```python -def analyze_memory_layouts(): - """Analyze impact of different memory layouts on performance.""" - - device = torch.device('cuda') - batch_size, seq_len, d_model = 4, 512, 2048 - hidden_dim = int(2.67 * d_model) - - # Test different weight layouts - layouts = ['row_major', 'column_major', 'transposed'] - - x = torch.randn(batch_size, seq_len, d_model, device=device) - - for layout in 
layouts: - print(f"\nTesting {layout} weight layout:") - - swiglu = TritonSwiGLU(d_model, hidden_dim).to(device) - - if layout == 'column_major': - # Transpose weights for column-major access - swiglu.gate_proj.weight.data = swiglu.gate_proj.weight.data.t().contiguous().t() - swiglu.up_proj.weight.data = swiglu.up_proj.weight.data.t().contiguous().t() - elif layout == 'transposed': - # Use transposed weights - swiglu.gate_proj.weight.data = swiglu.gate_proj.weight.data.t().contiguous() - swiglu.up_proj.weight.data = swiglu.up_proj.weight.data.t().contiguous() - - # Benchmark - torch.cuda.synchronize() - start_time = time.time() - - for _ in range(50): - output = swiglu(x) - - torch.cuda.synchronize() - avg_time = (time.time() - start_time) / 50 - - print(f" Average time: {avg_time*1000:.3f} ms") - print(f" Memory bandwidth: {estimate_bandwidth(x, swiglu, avg_time):.1f} GB/s") - -def estimate_bandwidth(x, swiglu, exec_time): - """Estimate memory bandwidth utilization.""" - - # Calculate memory footprint - input_size = x.numel() * 4 # float32 - weight_size = (swiglu.gate_proj.weight.numel() + swiglu.up_proj.weight.numel()) * 4 - output_size = x.shape[0] * x.shape[1] * swiglu.gate_proj.out_features * 4 - - total_bytes = input_size + weight_size + output_size - bandwidth = total_bytes / exec_time / 1e9 - - return bandwidth - -# Run memory layout analysis -analyze_memory_layouts() -``` - -#### Step 6: Arithmetic Intensity Analysis - -Calculate and optimize arithmetic intensity: - -```python -def analyze_arithmetic_intensity(): - """Analyze arithmetic intensity and roofline performance.""" - - batch_size, seq_len, d_model = 4, 512, 2048 - hidden_dim = int(2.67 * d_model) - - # Calculate FLOPs - # Gate projection: batch_size * seq_len * d_model * hidden_dim * 2 (MAC) - gate_flops = batch_size * seq_len * d_model * hidden_dim * 2 - - # Up projection: same as gate - up_flops = gate_flops - - # SiLU activation: ~4 FLOPs per element (exp, add, div, mul) - silu_flops = 
batch_size * seq_len * hidden_dim * 4 - - # Element-wise multiply: 1 FLOP per element - multiply_flops = batch_size * seq_len * hidden_dim +**Questions:** +1. Why does this kernel use three different block sizes? +2. How are tensors laid out in memory? +3. What is the arithmetic intensity? - total_flops = gate_flops + up_flops + silu_flops + multiply_flops +### Part B: Performance Optimization - # Calculate memory traffic - input_bytes = batch_size * seq_len * d_model * 4 - gate_weight_bytes = d_model * hidden_dim * 4 - up_weight_bytes = d_model * hidden_dim * 4 - output_bytes = batch_size * seq_len * hidden_dim * 4 +Test different block size combinations: +- (1, 1, 32), (1, 1, 64), (1, 1, 128) +- (1, 2, 64), (2, 1, 64), (1, 1, 256) - total_bytes = input_bytes + gate_weight_bytes + up_weight_bytes + output_bytes +### Part C: Arithmetic Intensity Analysis - arithmetic_intensity = total_flops / total_bytes +Calculate for batch_size=4, seq_len=512, d_model=2048: +- Total FLOPs (gate + up projections + activation) +- Total memory traffic +- Arithmetic intensity (FLOPs/byte) - print("SwiGLU Arithmetic Intensity Analysis") - print("=" * 40) - print(f"Problem size: {batch_size}x{seq_len}x{d_model} -> {hidden_dim}") - print(f"Total FLOPs: {total_flops/1e9:.2f} GFLOPs") - print(f"Total memory: {total_bytes/1e6:.2f} MB") - print(f"Arithmetic intensity: {arithmetic_intensity:.2f} FLOPs/byte") +Determine if kernel is compute-bound or memory-bound using roofline analysis. 
- # Roofline analysis - peak_flops = 200e12 # Example: 200 TFLOPS (MI250X) - peak_bandwidth = 1600e9 # Example: 1.6 TB/s +### Results Template - compute_bound_intensity = peak_flops / peak_bandwidth - - print(f"\nRoofline Analysis:") - print(f"Peak compute: {peak_flops/1e12:.0f} TFLOPS") - print(f"Peak bandwidth: {peak_bandwidth/1e9:.0f} GB/s") - print(f"Compute-bound threshold: {compute_bound_intensity:.2f} FLOPs/byte") - - if arithmetic_intensity > compute_bound_intensity: - print("Kernel is compute-bound - optimize arithmetic operations") - bottleneck = "compute" - else: - print("Kernel is memory-bound - optimize memory access") - bottleneck = "memory" - - return { - 'arithmetic_intensity': arithmetic_intensity, - 'total_flops': total_flops, - 'total_bytes': total_bytes, - 'bottleneck': bottleneck - } - -# Run arithmetic intensity analysis -intensity_results = analyze_arithmetic_intensity() -``` - -### Part C: Advanced Optimization Techniques (15 minutes) - -#### Step 7: Implement Kernel Variants - -Create optimized kernel variants: - -```python -# Version 1: Basic implementation (current) -# Version 2: Optimized for memory-bound workloads -# Version 3: Optimized for compute-bound workloads - -@triton.jit -def swiglu_kernel_optimized_memory( - x_ptr, gate_weight_ptr, up_weight_ptr, output_ptr, - batch_size, seq_len, d_model, d_ff, - BLOCK_SIZE_D: tl.constexpr, -): - """Memory-optimized SwiGLU kernel with better data reuse.""" +| Configuration | Time (ms) | Speedup vs PyTorch | Bandwidth (GB/s) | +|---------------|-----------|-------------------|------------------| +| Block Size (1,1,64) | | | | +| Block Size (1,1,128) | | | | - # Single thread processes entire token - batch_idx = tl.program_id(0) - seq_idx = tl.program_id(1) - - input_offset = batch_idx * seq_len * d_model + seq_idx * d_model - - # Process all outputs for this token - for d_out in range(0, d_ff, BLOCK_SIZE_D): - gate_sum = tl.zeros((BLOCK_SIZE_D,), dtype=tl.float32) - up_sum = 
tl.zeros((BLOCK_SIZE_D,), dtype=tl.float32) - - # Load output indices - d_indices = d_out + tl.arange(0, BLOCK_SIZE_D) - d_mask = d_indices < d_ff - - # Compute projections - for d_in in range(d_model): - x_val = tl.load(x_ptr + input_offset + d_in) - - gate_weights = tl.load(gate_weight_ptr + d_indices * d_model + d_in, mask=d_mask) - up_weights = tl.load(up_weight_ptr + d_indices * d_model + d_in, mask=d_mask) - - gate_sum += x_val * gate_weights - up_sum += x_val * up_weights - - # Apply SiLU and multiply - gate_activated = gate_sum / (1.0 + tl.exp(-gate_sum)) - result = gate_activated * up_sum - - # Store results - output_offset = batch_idx * seq_len * d_ff + seq_idx * d_ff + d_indices - tl.store(output_ptr + output_offset, result, mask=d_mask) - - -def benchmark_kernel_variants(): - """Benchmark different kernel implementations.""" - - device = torch.device('cuda') - batch_size, seq_len, d_model = 4, 512, 2048 - hidden_dim = int(2.67 * d_model) - - x = torch.randn(batch_size, seq_len, d_model, device=device) - - variants = [ - ('Original', TritonSwiGLU(d_model, hidden_dim)), - # Add other variants here - ] - - for name, swiglu in variants: - swiglu = swiglu.to(device) - - # Warmup - for _ in range(10): - _ = swiglu(x) - torch.cuda.synchronize() - - # Benchmark - start_time = time.time() - for _ in range(100): - output = swiglu(x) - torch.cuda.synchronize() - - avg_time = (time.time() - start_time) / 100 - print(f"{name}: {avg_time*1000:.3f} ms") - -# Run variant benchmarks -benchmark_kernel_variants() -``` - -### Exercise Results - -#### Performance Comparison Table - -| Configuration | Time (ms) | Speedup vs PyTorch | Memory Usage | Bandwidth (GB/s) | -|---------------|-----------|-------------------|--------------|------------------| -| Original SwiGLU | | | | | -| Block Size (1,1,32) | | | | | -| Block Size (1,1,64) | | | | | -| Block Size (1,1,128) | | | | | -| Memory Optimized | | | | | - -#### Arithmetic Intensity Analysis - -- **Total FLOPs**: _____ 
GFLOPs -- **Memory Traffic**: _____ MB -- **Arithmetic Intensity**: _____ FLOPs/byte -- **Performance Bottleneck**: _____ (compute/memory) -- **Optimization Strategy**: _____ - -#### Key Findings +### Key Findings 1. **Optimal Block Size**: _____ 2. **Memory Layout Impact**: _____ -3. **Arithmetic Intensity**: _____ -4. **Performance Bottleneck**: _____ - -### Discussion Questions - -1. **Multi-dimensional Blocking**: How do you choose optimal block sizes for multi-dimensional problems? - -2. **Memory vs Compute Optimization**: When should you optimize for memory bandwidth vs computational throughput? - -3. **Kernel Fusion Trade-offs**: What are the trade-offs between kernel fusion and memory usage? - -4. **Scalability**: How do these optimizations scale with different problem sizes? - -### Next Steps - -Exercise 3 will cover Flash Attention implementation, focusing on: +3. **Performance Bottleneck**: _____ (compute/memory) -- Memory-efficient attention patterns -- Tiling strategies for large sequences -- Numerical stability in custom kernels -- Advanced debugging techniques +### Resources +- Arithmetic intensity and roofline model concepts +- Memory coalescing patterns for multi-dimensional data diff --git a/MLExamples/TinyTransformer/version3_triton/exercises/exercise3_flash_attention.md b/MLExamples/TinyTransformer/version3_triton/exercises/exercise3_flash_attention.md index 0b8a9045..84131bc5 100644 --- a/MLExamples/TinyTransformer/version3_triton/exercises/exercise3_flash_attention.md +++ b/MLExamples/TinyTransformer/version3_triton/exercises/exercise3_flash_attention.md @@ -1,30 +1,16 @@ +## Exercise 3: Flash Attention Implementation -## Exercise 3: Flash Attention Implementation and Optimization +**Objective**: Master memory-efficient attention patterns and Flash Attention in Triton. 
-`exercise3_flash_attention.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version3_triton/exercises` in the Training Examples repository - -**Objective**: Master advanced memory-efficient attention patterns and understand the Flash Attention algorithm implementation in Triton. - -**Time**: 75 minutes - -**Prerequisites**: Completed Exercises 1 and 2 +**Time**: 75 minutes | **Prerequisites**: Completed Exercises 1 and 2 ### Background -Flash Attention is a memory-efficient implementation of scaled dot-product attention that: - -- Reduces memory complexity from O(N^2) to O(N) -- Uses tiling to fit computations in SRAM -- Maintains numerical stability through online statistics -- Achieves significant speedups for long sequences - -This exercise explores the Triton implementation and optimization strategies. +Flash Attention reduces memory complexity from O(N²) to O(N) using tiling and online statistics, enabling significant speedups for long sequences. -### Part A: Flash Attention Algorithm Understanding (25 minutes) +### Part A: Algorithm Understanding -#### Step 1: Analyze the Algorithm Structure - -Examine the `flash_attention_kernel` in `tiny_llama_v3.py`: +Examine `flash_attention_kernel` in `tiny_llama_v3.py`: ```python @triton.jit @@ -37,546 +23,41 @@ def flash_attention_kernel( ): ``` -**Key Components Analysis:** - -1. **Tiling Strategy**: How does the algorithm tile the attention matrix? -2. **Online Statistics**: How are max values and sum exponentials maintained? -3. **Numerical Stability**: What prevents overflow in the softmax computation? 
- -#### Step 2: Understand the Core Loop - -Analyze the main computation loop: - -```python -# Initialize output accumulators -output_acc = tl.zeros((BLOCK_SIZE_Q, head_dim), dtype=tl.float32) -max_scores = tl.full((BLOCK_SIZE_Q,), -float('inf'), dtype=tl.float32) -sum_exp = tl.zeros((BLOCK_SIZE_Q,), dtype=tl.float32) - -# Process K,V blocks -for k_block_start in range(0, seq_len, BLOCK_SIZE_K): - # Compute attention scores - scores = tl.zeros((BLOCK_SIZE_Q, BLOCK_SIZE_K), dtype=tl.float32) - - # Update running statistics - block_max = tl.max(scores, axis=1) - new_max = tl.maximum(max_scores, block_max) - exp_scores = tl.exp(scores - new_max[:, None]) - - # Update accumulated values - decay = tl.exp(max_scores - new_max) - sum_exp = sum_exp * decay + tl.sum(exp_scores, axis=1) - max_scores = new_max -``` - -**Algorithm Questions:** - -1. **Memory Complexity**: How does this achieve O(N) memory complexity? -2. **Numerical Stability**: Why subtract the maximum before exponentiation? -3. **Online Updates**: How are the running statistics updated correctly? 
- -#### Step 3: Compare with Standard Attention - -Create a comparison analysis: - -```python -def compare_attention_algorithms(): - """Compare Flash Attention with standard attention implementation.""" - - print("Attention Algorithm Comparison") - print("=" * 40) - - # Example sequence lengths - seq_lengths = [128, 256, 512, 1024, 2048, 4096] - head_dim = 64 - - for seq_len in seq_lengths: - # Standard attention memory - attention_matrix_size = seq_len * seq_len * 4 # float32 - qkv_size = 3 * seq_len * head_dim * 4 - output_size = seq_len * head_dim * 4 - - standard_memory = attention_matrix_size + qkv_size + output_size - - # Flash attention memory (tiled) - block_size = 64 # Typical block size - tile_size = block_size * block_size * 4 - flash_memory = tile_size + qkv_size + output_size - - memory_ratio = standard_memory / flash_memory - - print(f"Seq len {seq_len:4d}: Standard {standard_memory/1e6:6.2f} MB, " - f"Flash {flash_memory/1e6:6.2f} MB, " - f"Ratio: {memory_ratio:5.1f}x") - - return seq_lengths, [standard_memory, flash_memory] - -# Run comparison -compare_attention_algorithms() -``` - -#### Step 4: Analyze Causal Masking - -Understand how causal masking is implemented: - -```python -# Apply causal mask -causal_mask = q_offsets[:, None] >= k_offsets[None, :] -scores = tl.where(causal_mask, scores, -float('inf')) -``` - -**Masking Analysis:** - -1. **Mask Generation**: How is the causal mask computed efficiently? -2. **Memory Impact**: What's the memory overhead of masking? -3. **Alternative Strategies**: What other masking approaches exist? 
- -### Part B: Performance Analysis and Optimization (30 minutes) - -#### Step 5: Benchmark Flash Attention Performance - -Create a comprehensive benchmark: - -```python -import time -import torch -import torch.nn.functional as F -from tiny_llama_v3 import TritonAttention - -def benchmark_attention_implementations(): - """Benchmark Flash Attention vs standard PyTorch attention.""" - - device = torch.device('cuda') - - # Test configurations - configs = [ - (1, 8, 128, 64), # Small - (2, 16, 256, 64), # Medium - (4, 32, 512, 64), # Large - (2, 16, 1024, 64), # Long sequence - (1, 8, 2048, 64), # Very long - ] - - results = [] - - for batch_size, num_heads, seq_len, head_dim in configs: - print(f"\nTesting: B={batch_size}, H={num_heads}, S={seq_len}, D={head_dim}") - - dim = num_heads * head_dim - - # Create input - x = torch.randn(batch_size, seq_len, dim, device=device) - - # Flash Attention (Triton) - flash_attn = TritonAttention(dim, num_heads).to(device) - - # Standard PyTorch Attention - class StandardAttention(torch.nn.Module): - def __init__(self, dim, num_heads): - super().__init__() - self.num_heads = num_heads - self.head_dim = dim // num_heads - self.scale = 1.0 / (self.head_dim ** 0.5) - - self.q_proj = torch.nn.Linear(dim, dim, bias=False) - self.k_proj = torch.nn.Linear(dim, dim, bias=False) - self.v_proj = torch.nn.Linear(dim, dim, bias=False) - self.o_proj = torch.nn.Linear(dim, dim, bias=False) - - def forward(self, x): - B, T, C = x.shape - - q = self.q_proj(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2) - k = self.k_proj(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2) - v = self.v_proj(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2) - - # Standard attention - scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale - - # Causal mask - mask = torch.tril(torch.ones(T, T, device=x.device)) - scores = scores.masked_fill(mask == 0, float('-inf')) - - attn = F.softmax(scores, dim=-1) - out = torch.matmul(attn, v) 
- - out = out.transpose(1, 2).contiguous().view(B, T, C) - return self.o_proj(out) - - standard_attn = StandardAttention(dim, num_heads).to(device) - - # Copy weights for fair comparison - standard_attn.q_proj.weight.data.copy_(flash_attn.q_proj.weight.data) - standard_attn.k_proj.weight.data.copy_(flash_attn.k_proj.weight.data) - standard_attn.v_proj.weight.data.copy_(flash_attn.v_proj.weight.data) - standard_attn.o_proj.weight.data.copy_(flash_attn.o_proj.weight.data) - - # Benchmark Flash Attention - torch.cuda.synchronize() - start_time = time.time() - - for _ in range(20): - flash_output = flash_attn(x) - - torch.cuda.synchronize() - flash_time = (time.time() - start_time) / 20 - - # Benchmark Standard Attention - torch.cuda.synchronize() - start_time = time.time() - - for _ in range(20): - standard_output = standard_attn(x) - - torch.cuda.synchronize() - standard_time = (time.time() - start_time) / 20 - - # Memory usage - torch.cuda.reset_peak_memory_stats() - _ = flash_attn(x) - flash_memory = torch.cuda.max_memory_allocated() - - torch.cuda.reset_peak_memory_stats() - _ = standard_attn(x) - standard_memory = torch.cuda.max_memory_allocated() - - # Calculate metrics - speedup = standard_time / flash_time - memory_ratio = standard_memory / flash_memory - throughput = batch_size * seq_len / flash_time - - result = { - 'config': (batch_size, num_heads, seq_len, head_dim), - 'flash_time_ms': flash_time * 1000, - 'standard_time_ms': standard_time * 1000, - 'speedup': speedup, - 'flash_memory_mb': flash_memory / 1e6, - 'standard_memory_mb': standard_memory / 1e6, - 'memory_ratio': memory_ratio, - 'throughput': throughput - } - - results.append(result) - - print(f" Flash Attention: {flash_time*1000:.2f} ms, {flash_memory/1e6:.1f} MB") - print(f" Standard Attention: {standard_time*1000:.2f} ms, {standard_memory/1e6:.1f} MB") - print(f" Speedup: {speedup:.2f}x, Memory reduction: {memory_ratio:.2f}x") - print(f" Throughput: {throughput:.0f} tokens/s") - - return 
results - -# Run attention benchmarks -attention_results = benchmark_attention_implementations() -``` - -#### Step 6: Block Size Optimization - -Optimize block sizes for different sequence lengths: - -```python -def optimize_flash_attention_blocks(): - """Find optimal block sizes for Flash Attention.""" +**Key Questions:** +1. How does tiling achieve O(N) memory complexity? +2. Why subtract the maximum before exponentiation? +3. How are running statistics updated correctly? - device = torch.device('cuda') +### Part B: Performance Analysis - # Test different block size combinations - block_configs = [ - (32, 32), # Small blocks - (64, 64), # Medium blocks - (128, 128), # Large blocks - (64, 32), # Asymmetric 1 - (32, 64), # Asymmetric 2 - (128, 64), # Asymmetric 3 - ] +Benchmark configurations: +- (1, 8, 128, 64), (2, 16, 256, 64), (4, 32, 512, 64) +- (2, 16, 1024, 64), (1, 8, 2048, 64) - # Test on different sequence lengths - seq_lengths = [256, 512, 1024] +Compare Flash Attention vs standard PyTorch attention: +- Execution time +- Memory usage +- Speedup and memory reduction - batch_size, num_heads, head_dim = 2, 16, 64 - dim = num_heads * head_dim +### Part C: Block Size Optimization - for seq_len in seq_lengths: - print(f"\nOptimizing for sequence length: {seq_len}") +Test block sizes: (32,32), (64,64), (128,128), (64,32), (32,64), (128,64) - x = torch.randn(batch_size, seq_len, dim, device=device) +### Results Template - best_time = float('inf') - best_config = None - - for block_q, block_k in block_configs: - # Skip if blocks are too large for sequence - if block_q > seq_len or block_k > seq_len: - continue - - print(f" Testing blocks: Q={block_q}, K={block_k}") - - # Create attention with specific block sizes - # Note: This requires modifying the kernel call - flash_attn = TritonAttention(dim, num_heads).to(device) - - # Warmup - for _ in range(5): - _ = flash_attn(x) - torch.cuda.synchronize() - - # Benchmark - start_time = time.time() - for _ in range(20): 
- _ = flash_attn(x) - torch.cuda.synchronize() - - avg_time = (time.time() - start_time) / 20 - - print(f" Time: {avg_time*1000:.3f} ms") - - if avg_time < best_time: - best_time = avg_time - best_config = (block_q, block_k) - - print(f" Best configuration: Q={best_config[0]}, K={best_config[1]}") - print(f" Best time: {best_time*1000:.3f} ms") - -# Run block size optimization -optimize_flash_attention_blocks() -``` - -#### Step 7: Memory Pattern Analysis - -Analyze memory access patterns: - -```python -def analyze_flash_attention_memory(): - """Analyze memory access patterns in Flash Attention.""" - - print("Flash Attention Memory Pattern Analysis") - print("=" * 45) - - # Example configuration - batch_size, num_heads, seq_len, head_dim = 2, 16, 1024, 64 - block_q, block_k = 64, 64 - - print(f"Configuration: B={batch_size}, H={num_heads}, S={seq_len}, D={head_dim}") - print(f"Block sizes: Q={block_q}, K={block_k}") - - # Calculate memory accesses - num_q_blocks = (seq_len + block_q - 1) // block_q - num_k_blocks = (seq_len + block_k - 1) // block_k - - print(f"\nTiling information:") - print(f" Q blocks: {num_q_blocks}") - print(f" K blocks: {num_k_blocks}") - print(f" Total block pairs: {num_q_blocks * num_k_blocks}") - - # Memory per block - q_block_size = block_q * head_dim * 4 # float32 - k_block_size = block_k * head_dim * 4 - v_block_size = block_k * head_dim * 4 - scores_size = block_q * block_k * 4 - - print(f"\nMemory per block:") - print(f" Q block: {q_block_size/1e3:.1f} KB") - print(f" K block: {k_block_size/1e3:.1f} KB") - print(f" V block: {v_block_size/1e3:.1f} KB") - print(f" Scores: {scores_size/1e3:.1f} KB") - print(f" Total per iteration: {(q_block_size + k_block_size + v_block_size + scores_size)/1e3:.1f} KB") - - # Total memory traffic - q_reads = num_q_blocks * q_block_size * num_k_blocks # Q reused across K blocks - k_reads = num_k_blocks * k_block_size * num_q_blocks # K reused across Q blocks - v_reads = num_k_blocks * v_block_size * 
num_q_blocks # V same as K - output_writes = seq_len * head_dim * 4 - - total_traffic = q_reads + k_reads + v_reads + output_writes - - print(f"\nTotal memory traffic:") - print(f" Q reads: {q_reads/1e6:.2f} MB") - print(f" K reads: {k_reads/1e6:.2f} MB") - print(f" V reads: {v_reads/1e6:.2f} MB") - print(f" Output writes: {output_writes/1e6:.2f} MB") - print(f" Total: {total_traffic/1e6:.2f} MB") - - # Compare with standard attention - standard_traffic = ( - 3 * seq_len * head_dim * 4 + # Q, K, V - seq_len * seq_len * 4 + # Attention matrix - seq_len * head_dim * 4 # Output - ) - - print(f"\nStandard attention traffic: {standard_traffic/1e6:.2f} MB") - print(f"Flash attention reduction: {standard_traffic/total_traffic:.2f}x") - - return { - 'flash_traffic_mb': total_traffic / 1e6, - 'standard_traffic_mb': standard_traffic / 1e6, - 'reduction_ratio': standard_traffic / total_traffic - } - -# Run memory analysis -memory_analysis = analyze_flash_attention_memory() -``` - -### Part C: Advanced Optimizations and Debugging (20 minutes) - -#### Step 8: Numerical Stability Testing - -Test numerical stability across different conditions: - -```python -def test_numerical_stability(): - """Test numerical stability of Flash Attention implementation.""" - - device = torch.device('cuda') - - # Test conditions - test_cases = [ - ("normal", 1.0, 0.0), - ("large_values", 10.0, 0.0), - ("small_values", 0.1, 0.0), - ("extreme_large", 100.0, 0.0), - ("with_noise", 1.0, 0.1), - ] - - batch_size, num_heads, seq_len, head_dim = 2, 8, 256, 64 - dim = num_heads * head_dim - - flash_attn = TritonAttention(dim, num_heads).to(device) - - for name, scale, noise in test_cases: - print(f"\nTesting {name} (scale={scale}, noise={noise}):") - - # Generate test input - x = torch.randn(batch_size, seq_len, dim, device=device) * scale - if noise > 0: - x += torch.randn_like(x) * noise - - try: - output = flash_attn(x) - - # Check for NaN/Inf - has_nan = torch.isnan(output).any() - has_inf = 
torch.isinf(output).any() - - print(f" Input range: [{x.min():.3f}, {x.max():.3f}]") - print(f" Output range: [{output.min():.3f}, {output.max():.3f}]") - print(f" Has NaN: {has_nan}") - print(f" Has Inf: {has_inf}") - - if has_nan or has_inf: - print(" WARNING: Numerical instability detected!") - else: - print(" PASS Numerically stable") - - except Exception as e: - print(f" FAIL Error: {e}") - -# Run stability tests -test_numerical_stability() -``` - -#### Step 9: Performance Profiling Integration - -Integrate with ROCProfiler for detailed analysis: - -```python -def create_flash_attention_profile(): - """Create focused profiling for Flash Attention kernels.""" - - # Create ROCProfiler configuration for Flash Attention - profile_config = """ -# Flash Attention Kernel Profiling Configuration -pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts -pmc : VALUUtilization FlatVMemUtilization MemUnitBusy L2CacheHit -pmc : WriteUnitStalled ALUStalledByLDS LDSBankConflict -range: 0x1000000000000:0x2000000000000 -gpu: 0 -kernel: flash_attention_kernel -""" - - with open("flash_attention_profile.txt", "w") as f: - f.write(profile_config) - - print("Created Flash Attention profiling configuration") - print("Run with: rocprof --input flash_attention_profile.txt python3 tiny_llama_v3.py") - -# Create profiling configuration -create_flash_attention_profile() -``` - -### Exercise Results - -#### Performance Summary Table - -| Sequence Length | Flash Attention (ms) | Standard Attention (ms) | Speedup | Memory Reduction | -|----------------|---------------------|------------------------|---------|------------------| +| Sequence Length | Flash (ms) | Standard (ms) | Speedup | Memory Reduction | +|----------------|------------|---------------|---------|------------------| | 128 | | | | | -| 256 | | | | | | 512 | | | | | | 1024 | | | | | -| 2048 | | | | | - -#### Block Size Optimization Results - -| Sequence Length | Optimal Q Block | Optimal K Block | Best Time 
(ms) | Notes | -|----------------|----------------|----------------|----------------|-------| -| 256 | | | | | -| 512 | | | | | -| 1024 | | | | | - -#### Memory Analysis Results - -- **Flash Attention Memory**: _____ MB -- **Standard Attention Memory**: _____ MB -- **Memory Reduction**: _____x -- **Arithmetic Intensity**: _____ FLOPs/byte - -#### Key Insights - -1. **Performance Scaling**: How does Flash Attention performance scale with sequence length? -2. **Memory Efficiency**: What's the memory reduction at different sequence lengths? -3. **Optimal Block Sizes**: What patterns emerge in optimal block size selection? -4. **Numerical Stability**: Are there any stability concerns with the implementation? - -### Discussion Questions - -1. **Algorithm Trade-offs**: What are the trade-offs between memory efficiency and computational complexity in Flash Attention? - -2. **Implementation Challenges**: What are the main challenges in implementing Flash Attention in Triton vs CUDA? - -3. **Sequence Length Scaling**: How does the algorithm's efficiency change with very long sequences (8K, 16K tokens)? - -4. **Hardware Considerations**: How might different GPU architectures affect Flash Attention performance? - -### Next Steps - -With Version 3 complete, you've learned: -- Advanced Triton kernel development -- Memory-efficient algorithm implementation -- Performance optimization strategies -- Numerical stability considerations - -Version 4 will cover ultra-fused implementations combining all optimizations into a single, highly optimized kernel suite. - -### Troubleshooting Guide - -#### Common Issues - -1. **Kernel Compilation Errors** - - Check tensor dimension compatibility - - Verify block sizes don't exceed hardware limits - - Ensure proper constexpr usage -2. **Performance Regression** - - Verify block sizes are optimal for your sequence length - - Check memory access patterns - - Ensure proper warmup before benchmarking +### Troubleshooting -3. 
**Numerical Instability** - - Monitor for overflow in softmax computation - - Check running statistics update logic - - Verify causal mask application +- **Kernel Compilation**: Check dimension compatibility and block size limits +- **Performance Regression**: Verify block sizes are optimal for sequence length +- **Numerical Instability**: Monitor overflow in softmax, check running statistics -4. **Memory Issues** - - Reduce block sizes if running out of memory - - Check for memory leaks in repeated runs - - Monitor peak memory usage during profiling +### Resources +- [Flash Attention Paper](https://arxiv.org/abs/2205.14135) +- Online softmax algorithm diff --git a/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/README.md b/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/README.md index 2149bda5..dfd49d3c 100644 --- a/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/README.md +++ b/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/README.md @@ -2,290 +2,87 @@ ## Overview -This exercise demonstrates the systematic debugging and optimization process for V3 Triton kernels. You'll learn how to: +This exercise demonstrates systematic debugging and optimization for V3 Triton kernels: 1. **Diagnose incorrect model behavior** (wrong loss values) 2. **Fix correctness issues** (weight initialization) 3. **Profile and identify performance bottlenecks** 4. 
**Systematically optimize for performance** -## The Problem - -Initial V3 implementation showed: -- **Loss = 942** (should be ~7 like V1/V2) -- **Fake timing** (reported 4ms but actually much slower) -- **6.4x slower than baseline** after initial fixes - ## Exercise Progression -Each file represents a stage in the debugging process: - ### Stage 1: Broken Loss (`v3_stage1_broken_loss.py`) **Problem:** Loss = 942 instead of ~7 **Root Cause:** Missing weight initialization -**What to Learn:** -- How to add diagnostic logging -- How to trace values through the model -- How exploding logits break training -**Run:** ```bash python v3_stage1_broken_loss.py --batch-size 8 --seq-len 128 --num-steps 20 ``` -**Expected Output:** -``` -Loss: 942.8047 # WRONG! -Logits stats: min=-161, max=1025, std=43.79 # Exploding values -``` - ---- - ### Stage 2: Fixed Loss, Terrible Performance (`v3_stage2_slow_performance.py`) **Problem:** Loss fixed (7.0) but only 15.2 samples/sec (vs V1's 97 samples/sec) **Root Cause:** Non-contiguous tensors after `repeat_interleave` for GQA -**What to Learn:** -- How memory layout affects Triton kernel performance -- Why `.contiguous()` matters for GPU kernels -- How to identify stride-related issues -**Run:** ```bash python v3_stage2_slow_performance.py --batch-size 8 --seq-len 128 --num-steps 20 ``` -**Expected Output:** -``` -Loss: 7.0108 # CORRECT! -Speed: 15.2 samples/sec # TERRIBLE! 
(V1 = 97 samples/sec) -Time: 526ms per batch -``` - ---- - ### Stage 3: Better Performance, Wrong Timing (`v3_stage3_fake_timing.py`) **Problem:** Improved to 310 samples/sec but timing breakdown is wrong -**Root Cause:** Missing CUDA synchronization for individual operation timing -**What to Learn:** -- GPU operations are asynchronous -- How to properly measure GPU kernel timing -- Why you need `torch.cuda.synchronize()` +**Root Cause:** Missing CUDA synchronization for timing -**Run:** ```bash python v3_stage3_fake_timing.py --batch-size 8 --seq-len 128 --num-steps 20 ``` -**Expected Output:** -``` -Loss: 7.0108 # CORRECT! -Speed: 310.8 samples/sec # GOOD! -Forward: 3.2ms # Seems reasonable -Backward: 0.2ms # WRONG! Too fast! -Total: 25.7ms # Doesn't add up! (3.2 + 0.2 + 0.2 ≠ 25.7) -``` - ---- - ### Stage 4: Accurate Timing, Slow Kernels (`v3_stage4_slow_kernels.py`) -**Problem:** Accurate timing shows forward pass is 25.5ms (2.4x slower than V1's 10.8ms) +**Problem:** Forward pass is 25.5ms (2.4x slower than V1's 10.8ms) **Root Cause:** Inefficient Triton SwiGLU kernel doing manual matrix multiplication -**What to Learn:** -- How to identify kernel bottlenecks -- When NOT to use custom kernels (for large matrix ops) -- Why PyTorch BLAS is faster than naive Triton implementations -**Run:** ```bash python v3_stage4_slow_kernels.py --batch-size 8 --seq-len 128 --num-steps 20 ``` -**Expected Output:** -``` -Loss: 7.0108 # CORRECT! -Speed: 305.9 samples/sec # STILL SLOWER THAN V1! -Forward: 25.5ms # TOO SLOW! (V1 = 10.8ms) -Backward: 0.3ms -Total: 26.2ms -``` - -**Profiling Analysis:** -- SwiGLU kernel launches 2,097,152 threads (batch × seq × d_ff = 8 × 128 × 2048) -- Each thread does manual reduction over 512 dimensions -- PyTorch's optimized BLAS would be much faster - ---- - ### Stage 5: Final Optimized (`../tiny_llama_v3.py`) **Solution:** Use PyTorch for matrix multiplies, Triton only for element-wise fusion **Result:** 2065 samples/sec (5.5x faster than V1!) 
-**What to Learn:** -- Hybrid optimization: use the best tool for each operation -- When to use Triton (memory-bound ops, fusion opportunities) -- When to use PyTorch (compute-bound large matrix ops) -**Run:** ```bash cd .. && python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 20 ``` -**Expected Output:** -``` -Loss: 7.0108 # CORRECT! -Speed: 2065.0 samples/sec # EXCELLENT! (5.5x faster than V1!) -Forward: 3.2ms # Fast! -Backward: 0.3ms -Total: 3.9ms # Dramatic improvement! -Memory: 281.8 MB # 46% less than V1's 522.3 MB -``` - ---- - -## Profiling with ROCm Tools - -### Using rocprof to Profile Each Stage - -For each stage, you can generate detailed profiling traces: - -```bash -# Stage 1: Broken Loss (short run to see the issue) -rocprof --stats -o stage1_broken.csv python v3_stage1_broken_loss.py --batch-size 8 --seq-len 128 --num-steps 5 - -# Stage 2: Slow Performance -rocprof --stats -o stage2_slow.csv python v3_stage2_slow_performance.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Stage 4: Slow Kernels (shows SwiGLU bottleneck) -rocprof --stats -o stage4_kernels.csv python v3_stage4_slow_kernels.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Stage 5: Final Optimized -rocprof --stats -o stage5_optimized.csv python ../tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 20 -``` - -### What to Look for in Traces +## Summary Table -**Stage 2 (Slow Performance):** -- Look for non-coalesced memory accesses in Flash Attention kernel -- High L2 cache miss rate -- Memory stalls +| Stage | Loss | Speed (samples/sec) | Issue | Fix | +|-------|------|---------------------|-------|-----| +| 1 | 942 | N/A | Missing weight init | Add `_init_weights()` | +| 2 | 7.0 | 15.2 | Non-contiguous tensors | Add `.contiguous()` | +| 3 | 7.0 | 310.8 | Wrong timing | Add CUDA sync | +| 4 | 7.0 | 305.9 | Slow Triton SwiGLU | Use PyTorch matmul | +| 5 | 7.0 | 2065.0 | **OPTIMIZED!** | Hybrid approach | -**Stage 4 (Slow Kernels):** -- SwiGLU kernel shows: - - 
2M+ kernel launches - - Low occupancy (< 25%) - - High kernel launch overhead -- Compare to PyTorch matmul: - - Uses rocBLAS (optimized) - - High throughput (90%+ of peak) +**Baseline (V1):** 372.9 samples/sec | **Final Speedup:** 5.5x faster, 46% less memory -**Stage 5 (Optimized):** -- Flash Attention: High occupancy, good memory throughput -- RMSNorm: Fused operations, low latency -- Matrix ops: Delegated to rocBLAS (optimal) +## Key Learnings -### Analyzing with rocprofv2 +1. **Correctness First**: Validate loss/accuracy before optimizing +2. **Tensor Contiguity**: Always `.contiguous()` before Triton kernels +3. **Accurate Timing**: Use `torch.cuda.synchronize()` for GPU timing +4. **Hybrid Approach**: Triton for memory-bound ops, PyTorch BLAS for matrix ops -For more detailed analysis: +## Profiling Commands ```bash -# Profile with kernel trace -rocprofv2 --kernel-trace -o stage4_trace.json python v3_stage4_slow_kernels.py --batch-size 8 --seq-len 128 --num-steps 10 +# Basic profiling +rocprof --stats python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 20 -# View in Perfetto UI -# Upload stage4_trace.json to https://ui.perfetto.dev +# Detailed kernel trace +rocprofv2 --kernel-trace -o trace.json python tiny_llama_v3.py ... +# View at https://ui.perfetto.dev ``` -**What to observe:** -- Kernel timeline showing SwiGLU dominating execution -- Memory transfer patterns -- Kernel duration vs. compute capability - ---- - -## Key Learnings - -### 1. Correctness First, Performance Second -- Stage 1 shows why: broken model can't be optimized -- Always validate loss/accuracy before optimizing - -### 2. Systematic Debugging -- Add diagnostic logging (Stage 1) -- Measure accurately (Stage 3) -- Profile to identify bottlenecks (Stage 4) -- Fix one issue at a time - -### 3. 
Know Your Tools -- **Triton**: Memory-bound ops, element-wise fusion, Flash Attention -- **PyTorch/BLAS**: Compute-bound matrix operations -- **Profilers**: rocprof for GPU metrics, timing for coarse analysis - -### 4. Common Performance Pitfalls -- **Tensor contiguity**: Always `.contiguous()` before Triton kernels -- **CUDA synchronization**: Required for accurate GPU timing -- **Kernel granularity**: Avoid launching millions of tiny kernels -- **Use optimized libraries**: Don't reimplement BLAS in Triton - -### 5. Optimization is Iterative -- V1 baseline: 372.9 samples/sec -- Stage 2 (correct): 15.2 samples/sec (40x SLOWER!) -- Stage 3 (contiguous): 310.8 samples/sec (0.83x baseline) -- **Stage 5 (optimized): 2065.0 samples/sec (5.5x FASTER!)** - ---- - -## Exercises - -### Exercise 1: Diagnose Stage 1 -Run `v3_stage1_broken_loss.py` and: -1. Uncomment the diagnostic logging -2. Identify which layer produces exploding values -3. Explain why default weight initialization causes this - -### Exercise 2: Profile Stage 2 -1. Run with rocprof: `rocprof --stats python v3_stage2_slow_performance.py ...` -2. Find the Flash Attention kernel in the trace -3. Look at memory metrics - what's wrong? - -### Exercise 3: Compare Stage 4 vs Stage 5 -1. Profile both versions with rocprof -2. Compare SwiGLU execution time -3. Explain the 8x speedup in the forward pass - -### Exercise 4: Design Your Own Optimization -1. Look at the RMSNorm kernel implementation -2. Can you further optimize it? -3. What profiling metrics would validate your optimization? - ---- - -## Next Steps - -After completing this exercise: - -1. **Apply to V4**: V4 has similar issues - can you fix them? -2. **Custom Kernels**: Try writing your own Triton kernel for a simple operation -3. **Advanced Profiling**: Learn rocprofv2 for detailed analysis -4. 
**Production Deployment**: Consider hybrid Triton+PyTorch approaches - ---- - -## Additional Resources - -- **Triton Documentation**: https://triton-lang.org/ -- **ROCm Profiling Guide**: https://rocm.docs.amd.com/projects/rocprofiler/en/latest/ -- **Flash Attention Paper**: https://arxiv.org/abs/2205.14135 -- **PyTorch Profiler**: https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html - ---- - -## Summary Table - -| Stage | Loss | Speed (samples/sec) | Issue | Fix | -|-------|------|---------------------|-------|-----| -| 1 | 942 | N/A | Missing weight init | Add `_init_weights()` | -| 2 | 7.0 | 15.2 | Non-contiguous tensors | Add `.contiguous()` | -| 3 | 7.0 | 310.8 | Wrong timing | Add CUDA sync | -| 4 | 7.0 | 305.9 | Slow Triton SwiGLU | Use PyTorch matmul | -| 5 | 7.0 | 2065.0 | **OPTIMIZED!** | Hybrid approach | +## Resources -**Baseline (V1):** 372.9 samples/sec -**Final Speedup:** 5.5x faster, 46% less memory +- [Triton Documentation](https://triton-lang.org/) +- [ROCm Profiling Guide](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/) +- [Flash Attention Paper](https://arxiv.org/abs/2205.14135) diff --git a/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/WORKSHOP_GUIDE.md b/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/WORKSHOP_GUIDE.md index 2b95de0e..e27478bb 100644 --- a/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/WORKSHOP_GUIDE.md +++ b/MLExamples/TinyTransformer/version3_triton/exercises/performance_debugging/WORKSHOP_GUIDE.md @@ -3,145 +3,52 @@ ## Quick Start ```bash -cd /workspace/version3_triton/exercises/performance_debugging +cd version3_triton/exercises/performance_debugging # Read the comprehensive guide cat README.md -# Note: Individual stage files (v3_stage1_broken_loss.py, etc.) are symbolic links -# to the main tiny_llama_v3.py with modifications applied at runtime or via -# configuration flags. 
This keeps the exercise files manageable. - -# Run all stages with automatic profiling and comparison +# Run all stages with automatic profiling ./run_all_stages.sh - -# Results will be saved to results/ directory with: -# - stage*_output.log: Full training outputs -# - stage*_profile.csv: rocprof profiling data -# - Performance comparison summary ``` ## What This Exercise Teaches -This is a **realistic performance debugging scenario** that mirrors real-world optimization work: - -### 1. **Correctness Before Performance** (Stage 1) -- Shows how subtle bugs (missing weight init) can completely break training -- Demonstrates diagnostic logging techniques -- Loss goes from 942 → 7.0 after one-line fix - -### 2. **Memory Layout Matters** (Stage 2→3) -- Non-contiguous tensors after `repeat_interleave` killed performance -- Adding `.contiguous()` gave 20x speedup (15 → 310 samples/sec) -- Critical lesson for GPU kernel developers - -### 3. **Measure Accurately** (Stage 3→4) -- GPU operations are asynchronous -- Without `torch.cuda.synchronize()`, timings are meaningless -- Same performance, but now we can see WHERE the time is spent - -### 4. **Know When NOT to Use Custom Kernels** (Stage 4→5) -- Triton SwiGLU kernel was launching 2M+ threads -- Each doing naive matrix multiplication -- PyTorch's rocBLAS is orders of magnitude faster -- Result: 8x forward pass speedup (25.5ms → 3.2ms) - -### 5. **Hybrid Optimization Wins** -- Final version: 2065 samples/sec (5.5x faster than V1 baseline!) -- Uses Triton for: Flash Attention, RMSNorm (memory-bound ops) -- Uses PyTorch for: Matrix multiplies (compute-bound ops) -- **Best of both worlds** - -## For Workshop Participants - -### Beginner Level -1. Run `./run_all_stages.sh` and observe the progression -2. Read the output logs to understand what changed each stage -3. Focus on the "Key Observations" in the comparison summary - -### Intermediate Level -1. Examine the profiling CSV files in `results/` -2. 
Compare kernel execution times between stages -3. Try modifying block sizes in Flash Attention kernel -4. Re-run and observe impact on performance - -### Advanced Level -1. Use `rocprofv2 --kernel-trace` for detailed timeline analysis -2. Identify memory bandwidth bottlenecks -3. Experiment with different Triton kernel implementations -4. Write a custom kernel for RoPE application +### 1. Correctness Before Performance (Stage 1) +Missing weight init → Loss 942 → 7.0 after one-line fix + +### 2. Memory Layout Matters (Stage 2→3) +Non-contiguous tensors → 20x speedup with `.contiguous()` + +### 3. Measure Accurately (Stage 3→4) +GPU ops are async → `torch.cuda.synchronize()` required + +### 4. Know When NOT to Use Custom Kernels (Stage 4→5) +Triton SwiGLU 2M+ threads → PyTorch rocBLAS 8x faster + +### 5. Hybrid Optimization Wins +Final: 2065 samples/sec (5.5x faster than V1!) ## Key Takeaways -| Metric | Stage 1 | Stage 2 | Stage 3 | Stage 4 | Stage 5 | -|--------|---------|---------|---------|---------|---------| -| **Loss** | 942 | 7.0 | 7.0 | 7.0 | 7.0 | -| **Speed** | N/A | 15 samp/s | 311 samp/s | 306 samp/s | **2065 samp/s** | -| **vs Baseline** | N/A | 0.04x | 0.83x | 0.82x | **5.5x** | -| **Key Issue** | No weight init | Non-contig tensors | Wrong timing | Slow SwiGLU | **OPTIMAL** | -| **Memory** | N/A | ~282 MB | ~282 MB | ~282 MB | **~282 MB** | +| Stage | Loss | Speed | vs Baseline | Key Issue | +|-------|------|-------|-------------|-----------| +| 1 | 942 | N/A | N/A | No weight init | +| 2 | 7.0 | 15 samp/s | 0.04x | Non-contig tensors | +| 3 | 7.0 | 311 samp/s | 0.83x | Wrong timing | +| 4 | 7.0 | 306 samp/s | 0.82x | Slow SwiGLU | +| 5 | 7.0 | **2065 samp/s** | **5.5x** | **OPTIMAL** | **Baseline (V1):** 372.9 samples/sec, 522.3 MB -## Profiling Commands Reference +## Profiling Commands ```bash -# Basic profiling with rocprof rocprof --stats python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 20 - -# Detailed kernel trace rocprofv2 
--kernel-trace -o trace.json python tiny_llama_v3.py ... - -# View trace in Perfetto -# Upload trace.json to https://ui.perfetto.dev - -# Compare two stages -diff results/stage2_profile.csv results/stage5_profile.csv - -# Find slowest kernels -sort -t',' -k4 -nr results/stage4_profile.csv | head -20 ``` -## Common Questions - -**Q: Why not just use the final optimized version?** -A: Understanding the journey is more valuable than the destination. Each stage teaches a critical lesson about GPU programming and performance debugging. - -**Q: Can I apply these techniques to my own models?** -A: Absolutely! The debugging methodology is universal: - 1. Ensure correctness first - 2. Add accurate timing/profiling - 3. Identify bottlenecks with profilers - 4. Fix one issue at a time - 5. Re-measure and validate - -**Q: Should I always use Triton for custom kernels?** -A: No! As Stage 5 shows, hybrid approaches work best: - - Use Triton for memory-bound, fusion opportunities (Flash Attention, layer norm) - - Use PyTorch/BLAS for compute-bound matrix ops - - Profile to verify your assumptions - -**Q: Why is memory usage the same across all stages?** -A: The memory footprint is determined by model architecture (activations, weights, gradients), not by the kernel implementations. The performance gains come from faster computation, not lower memory usage. Flash Attention provides memory savings by avoiding materialization of the full attention matrix. - -## Next Steps - -After completing this exercise: - -1. **Apply to V4**: The ultra-fused version has similar issues - try fixing them yourself -2. **Explore ROCm Tools**: Deep dive into rocprofv2, rocprof, omniperf -3. **Custom Kernels**: Write your own Triton kernel for a simple operation -4. 
**Production Deployment**: Consider trade-offs between development time and performance gains - -## Additional Resources - -- **Triton Tutorials**: https://triton-lang.org/main/getting-started/tutorials/index.html -- **Flash Attention**: https://github.com/Dao-AILab/flash-attention -- **ROCm Profiling**: https://rocm.docs.amd.com/projects/rocprofiler/en/latest/ -- **PyTorch Profiler**: https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html - ---- +## Resources -**Exercise Created**: October 2025 -**Target Hardware**: AMD MI325X with ROCm 6.4.4 -**Framework**: PyTorch 2.7.1 + Triton +- [Triton Tutorials](https://triton-lang.org/main/getting-started/tutorials/index.html) +- [ROCm Profiling](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/) diff --git a/MLExamples/TinyTransformer/version3_triton/get_counters.sh b/MLExamples/TinyTransformer/version3_triton/get_counters.sh new file mode 100755 index 00000000..4d013665 --- /dev/null +++ b/MLExamples/TinyTransformer/version3_triton/get_counters.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Collect kernel trace data for TinyTransformer V3 with rocprofv3. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v3.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v3" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +OUTPUT_DIR="$(make_output_dir counters)" + +echo "Starting rocprofv3 kernel trace for TinyTransformer V3..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! 
Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" +echo "To analyze results:" + +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_trace.csv")" +AGENT_INFO_FILE="" + +if [ -n "$CSV_FILE" ]; then + CSV_PREFIX="${CSV_FILE%_kernel_trace.csv}" + MATCHING_AGENT_INFO="${CSV_PREFIX}_agent_info.csv" + if [ -f "$MATCHING_AGENT_INFO" ]; then + AGENT_INFO_FILE="$MATCHING_AGENT_INFO" + fi +fi + +if [ -z "$AGENT_INFO_FILE" ]; then + AGENT_INFO_FILE="$(select_largest_match "$OUTPUT_DIR" "*_agent_info.csv")" +fi + +if [ -n "$CSV_FILE" ]; then + echo " Kernel trace CSV: $CSV_FILE" +fi +if [ -n "$AGENT_INFO_FILE" ]; then + echo " Agent info CSV: $AGENT_INFO_FILE" +fi +if [ -n "$DB_FILE" ]; then + echo " SQLite database: $DB_FILE" + echo "" + echo " Export to CSV:" + echo " rocpd2csv -i \"$DB_FILE\" -o kernel_stats.csv" + echo "" + echo " Get kernel summary:" + echo " rocpd summary -i \"$DB_FILE\" --region-categories KERNEL" +fi +if [ -z "$CSV_FILE" ] && [ -z "$DB_FILE" ]; then + echo " WARNING: No ROCm profiler output file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version3_triton/get_hotspots.sh b/MLExamples/TinyTransformer/version3_triton/get_hotspots.sh new file mode 100755 index 00000000..e0a4921c --- /dev/null +++ b/MLExamples/TinyTransformer/version3_triton/get_hotspots.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a quick hotspot summary for TinyTransformer V3 with rocprofv3 --stats. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v3.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v3" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir hotspots)" + +echo "Starting rocprofv3 hotspot summary for TinyTransformer V3..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --stats \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_stats.csv")" +if [ -z "$CSV_FILE" ]; then + CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_domain_stats.csv")" +fi +if [ -n "$CSV_FILE" ]; then + echo "Top rows from $CSV_FILE:" + head -11 "$CSV_FILE" +else + echo "WARNING: No hotspot CSV file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version3_triton/get_rocprof_compute.sh b/MLExamples/TinyTransformer/version3_triton/get_rocprof_compute.sh new file mode 100755 index 00000000..c56eb51f --- /dev/null +++ b/MLExamples/TinyTransformer/version3_triton/get_rocprof_compute.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Collect hardware metrics for TinyTransformer V3 with rocprof-compute. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v3.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v3" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-compute +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +MODE="${1:-no-roof}" +GPU_ARCH="$(detect_gpu_arch)" +SUPPORTED_ARCH_REGEX='^(gfx908|gfx90a|gfx940|gfx941|gfx942)$' + +if [ -n "$GPU_ARCH" ] && ! echo "$GPU_ARCH" | grep -Eq "$SUPPORTED_ARCH_REGEX"; then + echo "Skipping rocprof-compute profiling for TinyTransformer V3..." + echo "Detected GPU architecture: $GPU_ARCH" + echo "rocprof-compute hardware-counter collection currently requires a supported Instinct GPU" + echo "(for example gfx908, gfx90a, gfx940, gfx941, or gfx942)." + echo "Use get_trace.sh, get_hotspots.sh, or get_counters.sh on this system instead." + exit 0 +fi + +OUTPUT_DIR="$(make_output_dir rocprof_compute)" +PROFILE_ROOT="$OUTPUT_DIR/$WORKLOAD_NAME" + +case "$MODE" in + full) + PROFILE_ARGS=(--kernel-names) + MODE_DESCRIPTION="full profile (counters plus roofline stage)" + ;; + roof-only) + PROFILE_ARGS=(--roof-only --kernel-names) + MODE_DESCRIPTION="roofline-only profile" + ;; + no-roof) + PROFILE_ARGS=(--no-roof --kernel-names) + MODE_DESCRIPTION="counter-only profile without roofline collection" + ;; + *) + echo "Usage: $0 [no-roof|full|roof-only]" >&2 + echo " no-roof collect counters only and skip the roofline stage" >&2 + echo " full collect the default counter set and roofline data" >&2 + echo " roof-only collect roofline data only and label roofline kernels" >&2 + exit 1 + ;; +esac + +echo "Starting rocprof-compute hardware metrics for TinyTransformer V3..." 
+if [ -n "$GPU_ARCH" ]; then + echo "Detected GPU architecture: $GPU_ARCH" +fi +echo "Mode: $MODE_DESCRIPTION" +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprof-compute profile \ + --name "$WORKLOAD_NAME" \ + --path "$PROFILE_ROOT" \ + "${PROFILE_ARGS[@]}" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "To analyze results:" + +ANALYZE_PATH="" +for marker in pmc_perf.csv roofline.csv sysinfo.csv; do + MARKER_FILE="$(find "$PROFILE_ROOT" -name "$marker" 2>/dev/null | head -1)" + if [ -n "$MARKER_FILE" ]; then + ANALYZE_PATH="$(dirname "$MARKER_FILE")" + break + fi +done + +if [ -n "$ANALYZE_PATH" ]; then + echo " Raw data directory: $ANALYZE_PATH" + echo "" + echo " 1. List detected kernels and dispatches:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --list-stats" + if [ "$MODE" != "roof-only" ]; then + echo "" + echo " 2. Inspect one dispatch in the default report:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch " + echo "" + echo " 3. Check occupancy and LDS-related limits:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 2.1.15 6.2.7" + echo "" + echo " 4. Check L1/L2 memory speed-of-light metrics:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 16.1 17.1" + else + echo "" + echo " Roofline-only mode does not collect the full counter set." + echo " Re-run with '$0 full' or '$0 no-roof' for detailed block analysis." + fi +else + echo " WARNING: Could not detect the rocprof-compute raw data directory under $PROFILE_ROOT" + echo " Inspect the generated workload tree and use that path with 'rocprof-compute analyze -p'." 
+fi diff --git a/MLExamples/TinyTransformer/version3_triton/get_rocprof_sys.sh b/MLExamples/TinyTransformer/version3_triton/get_rocprof_sys.sh new file mode 100755 index 00000000..0f17b9bf --- /dev/null +++ b/MLExamples/TinyTransformer/version3_triton/get_rocprof_sys.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a system trace for TinyTransformer V3 with rocprof-sys. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v3.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v3" +TINYTRANSFORMER_DEFAULT_NUM_STEPS=2 +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-sys-run +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir rocprof_sys)" + +echo "Starting rocprof-sys trace for TinyTransformer V3..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +pushd "$OUTPUT_DIR" >/dev/null +rocprof-sys-run \ + --profile \ + --trace \ + -- "${BENCHMARK_CMD[@]}" +popd >/dev/null + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "Open the trace in Perfetto:" +PROTO_FILE="$(select_largest_match "$OUTPUT_DIR" "*.proto")" +if [ -n "$PROTO_FILE" ]; then + echo " Perfetto trace file: $PROTO_FILE" + echo " Open it in Perfetto UI: https://ui.perfetto.dev/" +else + echo " WARNING: No .proto file was found under $OUTPUT_DIR" + echo " Inspect the output tree and open the generated trace in Perfetto UI if present." +fi diff --git a/MLExamples/TinyTransformer/version3_triton/get_trace.sh b/MLExamples/TinyTransformer/version3_triton/get_trace.sh new file mode 100755 index 00000000..064df7bc --- /dev/null +++ b/MLExamples/TinyTransformer/version3_triton/get_trace.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Collect a runtime trace for TinyTransformer V3 with rocprofv3. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v3.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v3" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +ROCM_MAJOR="$(rocm_major_from_version "$ROCM_VERSION")" +OUTPUT_DIR="$(make_output_dir trace)" + +echo "Starting rocprofv3 runtime trace for TinyTransformer V3..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary + +TRACE_CMD=(rocprofv3 --runtime-trace --output-directory "$OUTPUT_DIR") +if [ "$ROCM_MAJOR" = "6" ] || [ "$ROCM_MAJOR" = "7" ]; then + TRACE_CMD+=(--output-format pftrace) +fi + +echo "" +"${TRACE_CMD[@]}" -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +PFTRACE_FILE="$(select_largest_match "$OUTPUT_DIR" "*.pftrace")" +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" + +if [ -n "$PFTRACE_FILE" ]; then + echo "Perfetto trace file: $PFTRACE_FILE" + echo "Open it in Perfetto UI: https://ui.perfetto.dev/" +elif [ -n "$DB_FILE" ]; then + echo "SQLite database found: $DB_FILE" + echo "Convert it to Perfetto format with:" + echo " rocpd2pftrace -i \"$DB_FILE\" -o trace.pftrace" +else + echo "WARNING: No .pftrace or .db file was found under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version3_triton/test_rocpd.sh b/MLExamples/TinyTransformer/version3_triton/test_rocpd.sh new file mode 100755 index 00000000..50ac7c3f --- /dev/null +++ b/MLExamples/TinyTransformer/version3_triton/test_rocpd.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Test rocpd (ROCm Profiling Daemon) for continuous profiling +# + +set -e + +echo 
"==========================================" +echo "rocpd Test - Version 3" +echo "==========================================" +echo "" + +# Check if rocpd is available +if ! command -v rocpd &> /dev/null; then + echo "[ERROR] rocpd not found in PATH" + echo "rocpd may not be installed or available in this ROCm version" + exit 1 +fi + +echo "rocpd location: $(which rocpd)" +echo "" + +OUTPUT_DIR="./rocpd/rocpd_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$OUTPUT_DIR" + +echo "Output directory: $OUTPUT_DIR" +echo "" + +# Start rocpd in background +echo "Starting rocpd daemon..." +rocpd --output-dir "$OUTPUT_DIR" & +ROCPD_PID=$! +echo "rocpd running with PID: $ROCPD_PID" +echo "" + +# Give rocpd time to initialize +sleep 2 + +# Run workload +echo "Running workload: python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 10" +python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 10 +WORKLOAD_EXIT=$? +echo "" + +# Stop rocpd +echo "Stopping rocpd daemon..." +kill $ROCPD_PID 2>/dev/null || true +wait $ROCPD_PID 2>/dev/null || true +echo "" + +if [ $WORKLOAD_EXIT -eq 0 ]; then + echo "[SUCCESS] Workload completed" +else + echo "[FAILED] Workload failed with exit code $WORKLOAD_EXIT" +fi +echo "" + +echo "Generated files in $OUTPUT_DIR:" +ls -lh "$OUTPUT_DIR" +echo "" + +echo "rocpd output is a SQLite3 database file" +echo "" +echo "To view the database:" +echo " - Use VS Code SQLite Viewer extension" +echo " - rocprof-compute and rocprof-systems can consume it directly" +echo " - No official CLI tool is provided for viewing" +echo "" +echo "rocpd provides continuous profiling with minimal overhead" +echo "" diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md b/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md index 441f52d1..337eccbb 100644 --- a/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md @@ -1,1037 +1,59 @@ +# TinyTransformer Version 4: SDPA Path -# 
Version 4: Ultra-Fused Triton Implementation +Version 4 keeps the later fused structure but swaps the attention path to PyTorch SDPA. The main use of this directory is to compare a framework-maintained attention implementation against the custom Triton path in version 3 while keeping the workload fixed. -README.md from `HPCTrainingExamples/MLExamples/TinyTransformer/version4_pytorch_sdpa` in the Training Examples repository +## What changed -**Objective**: Achieve maximum performance through ultra-fusion techniques and state-of-the-art optimization +Relative to version 3, this version: -**Expected Performance**: 3.5-5.0x speedup over baseline, 85-98% memory reduction +- uses PyTorch SDPA for attention +- keeps the later fused model structure +- keeps the same profiling workflow as the rest of the progression -**Learning Focus**: Advanced kernel fusion, performance engineering, optimization limits +## Baseline run -## Overview - -Version 4 represents the pinnacle of GPU optimization for transformer models. 
It implements ultra-fused kernels that process entire transformer blocks in single kernel launches, achieving unprecedented efficiency through: - -- **Complete Block Fusion**: Entire transformer blocks in one kernel -- **Advanced Memory Management**: Optimal register and cache utilization -- **Cross-Layer Optimization**: Optimization across multiple computational layers -- **State-of-the-Art Techniques**: Latest advances in GPU performance engineering - -### Revolutionary Changes - -``` -Version 1: 12+ kernels per transformer block -Version 2: ~8 kernels per transformer block (basic fusion) -Version 3: ~4 kernels per transformer block (Triton kernels) -Version 4: 1 kernel per transformer block (ultra-fusion) -``` - -### Performance Achievements - -- **Kernel Launch Overhead**: Reduced by 90-95% -- **Memory Traffic**: Reduced by 85-98% -- **Cache Efficiency**: Maximized through optimal data reuse -- **Register Utilization**: Optimal balance of parallelism and resource usage - -## Architecture Innovations and Ultra-Fusion Techniques - -### Mathematical Foundation of Ultra-Fusion - -Ultra-fusion represents the theoretical limit of kernel fusion, combining entire transformer blocks into single GPU kernels. For complete mathematical foundations, see [TINY_LLAMA_ARCHITECTURE.md](../TINY_LLAMA_ARCHITECTURE.md). 
- -#### Ultra-Fusion Efficiency Analysis - -**Kernel Launch Overhead Elimination:** - -$$\begin{aligned} -\text{Baseline Kernel Count} &: K_{\text{base}} = 12 \text{ kernels per block} \\ -\text{Ultra-Fused Count} &: K_{\text{ultra}} = 1 \text{ kernel per block} \\ -\text{Overhead Reduction} &: \frac{K_{\text{base}} - K_{\text{ultra}}}{K_{\text{base}}} = \frac{11}{12} = 91.7\% \\ -\text{Latency Savings} &: 11 \times T_{\text{launch}} \text{ per block} -\end{aligned}$$ - -**Memory Bandwidth Optimization:** - -$$\begin{aligned} -\text{Baseline Memory Access} &: \sum_{i=1}^{12} (\text{Input}_i + \text{Output}_i) \\ -\text{Ultra-Fused Access} &: \text{Input}_{\text{block}} + \text{Output}_{\text{block}} \\ -\text{Bandwidth Reduction} &: \frac{\text{Baseline} - \text{Ultra-Fused}}{\text{Baseline}} \approx 85-95\% -\end{aligned}$$ - -### 1. Ultra-Fused Transformer Block Implementation - -#### Complete Mathematical Flow - -**Single-Kernel Transformer Block:** - -$$\begin{aligned} -\text{Input:} \quad & x \in \mathbb{R}^{B \times S \times D} \\ -\text{Attention Block:} \quad & \text{attn\_out} = x + \text{Attention}(\text{RMSNorm}(x)) \\ -\text{FFN Block:} \quad & \text{output} = \text{attn\_out} + \text{SwiGLU}(\text{RMSNorm}(\text{attn\_out})) \\ -\text{All in One Kernel!} \quad & \text{Eliminates } 11 \text{ intermediate memory operations} -\end{aligned}$$ - -#### Ultra-Fused Kernel Implementation - -```python -@triton.jit -def ultra_fused_transformer_block_kernel( - # Input/Output pointers - x_ptr, output_ptr, - # Attention weights - attn_norm_weight_ptr, qkv_weight_ptr, attn_out_weight_ptr, - # FFN weights - ffn_norm_weight_ptr, gate_weight_ptr, up_weight_ptr, down_weight_ptr, - # Dimensions - batch_size, seq_len, hidden_dim, num_heads, intermediate_dim, - # Block sizes (auto-tuned) - BLOCK_SIZE_B: tl.constexpr, - BLOCK_SIZE_S: tl.constexpr, - BLOCK_SIZE_D: tl.constexpr -): - """ - Ultra-fused transformer block - entire block in single kernel. 
- - Fusion Strategy: - 1. Load input once into shared memory - 2. Compute attention norm + QKV + attention + output in registers - 3. Add residual connection in registers - 4. Compute FFN norm + gate/up + SiLU + down in registers - 5. Add final residual and write output once - - Memory Optimization: - - Input read: 1x per block - - Weight reads: Streamed through cache - - Intermediate results: Kept in registers/shared memory - - Output write: 1x per block - """ - - # Thread block coordinates - batch_idx = tl.program_id(0) - seq_block_idx = tl.program_id(1) - dim_block_idx = tl.program_id(2) - - # Compute global indices - seq_offset = seq_block_idx * BLOCK_SIZE_S + tl.arange(0, BLOCK_SIZE_S) - dim_offset = dim_block_idx * BLOCK_SIZE_D + tl.arange(0, BLOCK_SIZE_D) - - # Bounds checking - seq_mask = seq_offset < seq_len - dim_mask = dim_offset < hidden_dim - - # PHASE 1: Load input data (single global memory read) - input_ptr_offset = ( - batch_idx * seq_len * hidden_dim + - seq_offset[:, None] * hidden_dim + - dim_offset[None, :] - ) - - x_block = tl.load( - x_ptr + input_ptr_offset, - mask=seq_mask[:, None] & dim_mask[None, :], - other=0.0 - ) - - # Store original input for residual connections - residual_1 = x_block # Stored in registers! 
- - # PHASE 2: Attention normalization (fused with attention) - # RMSNorm computation in registers - variance = tl.sum(x_block * x_block, axis=1, keepdims=True) / hidden_dim - rstd = 1.0 / tl.sqrt(variance + 1e-6) - - # Load attention norm weights and apply - attn_norm_weight = tl.load( - attn_norm_weight_ptr + dim_offset, - mask=dim_mask - ) - x_normed = x_block * rstd * attn_norm_weight[None, :] - - # PHASE 3: Ultra-fused attention computation - # This would include QKV projection, attention, and output projection - # (Simplified for brevity - full implementation would include all attention logic) - attn_output = ultra_fused_attention_computation( - x_normed, qkv_weight_ptr, attn_out_weight_ptr, - seq_offset, dim_offset, num_heads - ) - - # First residual connection (in registers) - post_attn = residual_1 + attn_output - - # PHASE 4: FFN normalization (fused with FFN) - variance_2 = tl.sum(post_attn * post_attn, axis=1, keepdims=True) / hidden_dim - rstd_2 = 1.0 / tl.sqrt(variance_2 + 1e-6) - - ffn_norm_weight = tl.load( - ffn_norm_weight_ptr + dim_offset, - mask=dim_mask - ) - ffn_input = post_attn * rstd_2 * ffn_norm_weight[None, :] - - # PHASE 5: Ultra-fused SwiGLU computation - ffn_output = ultra_fused_swiglu_computation( - ffn_input, gate_weight_ptr, up_weight_ptr, down_weight_ptr, - seq_offset, dim_offset, intermediate_dim - ) - - # Final residual connection (in registers) - final_output = post_attn + ffn_output - - # PHASE 6: Single global memory write - output_ptr_offset = ( - batch_idx * seq_len * hidden_dim + - seq_offset[:, None] * hidden_dim + - dim_offset[None, :] - ) - - tl.store( - output_ptr + output_ptr_offset, - final_output, - mask=seq_mask[:, None] & dim_mask[None, :] - ) - -@triton.jit -def ultra_fused_attention_computation( - x_normed, qkv_weight_ptr, attn_out_weight_ptr, - seq_offset, dim_offset, num_heads -): - """ - Ultra-fused attention computation within transformer block kernel. 
- """ - # QKV projection with register reuse - head_dim = hidden_dim // num_heads - - # Compute Q, K, V in parallel using register blocking - # (Implementation details for space efficiency) - - # Flash attention computation with optimal memory access - # (Using techniques from Version 3 but within ultra-fused context) - - # Return attention output (kept in registers) - return attention_result - -@triton.jit -def ultra_fused_swiglu_computation( - ffn_input, gate_weight_ptr, up_weight_ptr, down_weight_ptr, - seq_offset, dim_offset, intermediate_dim -): - """ - Ultra-fused SwiGLU computation within transformer block kernel. - """ - # Gate and up projections with register reuse - # SiLU activation fused with element-wise multiply - # Down projection with output accumulation - - # All operations optimized for register usage - return swiglu_result -``` - -#### Memory Access Pattern Analysis - -```python -ULTRA_FUSION_MEMORY_ANALYSIS = { - 'baseline_transformer_block': { - 'memory_reads': { - 'input_tensor': 12, # Read 12 times across operations - 'weight_matrices': 12, # Various weight reads - 'intermediate_tensors': 22, # Multiple intermediate results - 'total_memory_ops': 46 - }, - 'memory_writes': { - 'intermediate_results': 11, # 11 intermediate tensors stored - 'final_output': 1, - 'total_writes': 12 - } - }, - 'ultra_fused_block': { - 'memory_reads': { - 'input_tensor': 1, # Single read at start - 'weight_matrices': 7, # Streamed weight access - 'intermediate_tensors': 0, # Kept in registers! - 'total_memory_ops': 8 - }, - 'memory_writes': { - 'intermediate_results': 0, # No intermediate storage - 'final_output': 1, - 'total_writes': 1 - } - }, - 'memory_bandwidth_reduction': '83% fewer memory operations', - 'register_utilization': '95% of available register file' -} -``` - -### 2. 
Advanced Memory Hierarchy Management - -#### Register File Optimization - -```python -class UltraOptimizedRegisterManagement: - """ - Sophisticated register allocation for ultra-fused kernels. - """ - - def __init__(self, gpu_arch): - self.register_file_size = gpu_arch.register_file_size # e.g., 64KB per SM - self.max_threads_per_block = gpu_arch.max_threads_per_block - self.register_allocation_strategy = self._optimize_register_allocation() - - def _optimize_register_allocation(self): - """ - Optimize register allocation for maximum occupancy. - - Trade-off Analysis: - - More registers per thread → Better performance per thread - - Fewer registers per thread → Higher occupancy - - Optimal Point: Maximum (threads × performance_per_thread) - """ - - optimization_space = { - 'high_occupancy': { - 'registers_per_thread': 32, - 'threads_per_block': 256, - 'occupancy': '100%', - 'performance_per_thread': '85%' - }, - 'high_performance': { - 'registers_per_thread': 64, - 'threads_per_block': 128, - 'occupancy': '50%', - 'performance_per_thread': '120%' - }, - 'optimal_balance': { - 'registers_per_thread': 48, - 'threads_per_block': 192, - 'occupancy': '75%', - 'performance_per_thread': '105%', - 'total_performance': '78.75% (optimal)' - } - } - - return optimization_space['optimal_balance'] -``` - -#### Cache Hierarchy Optimization - -```python -# L1 Cache optimization (32KB per SM) -L1_CACHE_STRATEGY = { - 'temporal_locality': { - 'weight_reuse': 'Keep frequently accessed weights in L1', - 'activation_reuse': 'Reuse activations across attention heads', - 'pattern': 'Block-wise computation to maximize reuse' - }, - 'spatial_locality': { - 'memory_coalescing': 'Ensure consecutive threads access consecutive memory', - 'cache_line_utilization': 'Full 128-byte cache line usage', - 'stride_optimization': 'Minimize memory stride patterns' - } -} - -# L2 Cache optimization (8MB shared across CUs) -L2_CACHE_STRATEGY = { - 'weight_streaming': { - 'pattern': 'Stream weights 
through L2 for multiple attention heads', - 'prefetching': 'Prefetch next weight blocks during computation', - 'retention': 'Keep frequently accessed weights in L2' - }, - 'activation_sharing': { - 'cross_head_sharing': 'Share activations across attention heads', - 'batch_sharing': 'Share activations across batch elements', - 'temporal_reuse': 'Optimize for temporal reuse patterns' - } -} -``` - -### 3. Intelligent Compilation and Auto-Tuning System - -#### Hardware-Adaptive Compilation - -```python -class UltraFusedCompiler: - """ - Intelligent compilation system for ultra-fused kernels. - """ - - def __init__(self, target_gpu): - self.gpu_arch = self._detect_gpu_architecture(target_gpu) - self.optimization_parameters = self._derive_optimal_parameters() - self.kernel_cache = {} - - def _detect_gpu_architecture(self, target_gpu): - """ - Detect GPU architecture and capabilities. - """ - gpu_specs = { - 'gfx906': { # MI50 - 'compute_units': 60, - 'register_file_per_cu': 64 * 1024, # 64KB - 'shared_memory_per_cu': 64 * 1024, # 64KB - 'memory_bandwidth': 1024, # GB/s - 'peak_flops_fp32': 6.7e12 # FLOPS - }, - 'gfx908': { # MI100 - 'compute_units': 120, - 'register_file_per_cu': 64 * 1024, - 'shared_memory_per_cu': 64 * 1024, - 'memory_bandwidth': 1200, - 'peak_flops_fp32': 11.5e12 - }, - 'gfx90a': { # MI200 series - 'compute_units': 110, - 'register_file_per_cu': 64 * 1024, - 'shared_memory_per_cu': 64 * 1024, - 'memory_bandwidth': 1600, - 'peak_flops_fp32': 23e12 - } - } - - return gpu_specs.get(target_gpu, gpu_specs['gfx90a']) - - def _derive_optimal_parameters(self): - """ - Derive optimal kernel parameters based on hardware characteristics. 
- """ - # Roofline analysis for optimal block sizes - arithmetic_intensity_target = self.gpu_arch['peak_flops_fp32'] / self.gpu_arch['memory_bandwidth'] - - # Optimize for memory hierarchy - l1_cache_size = 32 * 1024 # 32KB L1 cache - optimal_working_set = l1_cache_size * 0.8 # 80% utilization - - # Derive block sizes - block_size_optimization = { - 'BLOCK_SIZE_B': self._optimize_batch_blocking(), - 'BLOCK_SIZE_S': self._optimize_sequence_blocking(), - 'BLOCK_SIZE_D': self._optimize_feature_blocking(), - 'BLOCK_SIZE_H': self._optimize_head_blocking() - } - - return block_size_optimization - - def _optimize_batch_blocking(self): - """Optimize batch dimension blocking.""" - # Consider memory coalescing and occupancy - optimal_batch_block = 4 # Empirically determined - return optimal_batch_block - - def _optimize_sequence_blocking(self): - """Optimize sequence dimension blocking.""" - # Balance between cache utilization and parallelism - sequence_block_candidates = [32, 64, 128, 256] - optimal_seq_block = 64 # Based on cache analysis - return optimal_seq_block - - def _optimize_feature_blocking(self): - """Optimize feature dimension blocking.""" - # Vectorization and memory coalescing - feature_block_candidates = [64, 128, 256] - optimal_feature_block = 128 # Optimal for most architectures - return optimal_feature_block - - def _optimize_head_blocking(self): - """Optimize attention head blocking.""" - # Balance between register usage and parallelism - head_block_candidates = [1, 2, 4, 8] - optimal_head_block = 2 # Good balance for register pressure - return optimal_head_block - - def compile_ultra_kernel(self, kernel_signature): - """ - Compile ultra-fused kernel with optimal parameters. 
- """ - if kernel_signature in self.kernel_cache: - return self.kernel_cache[kernel_signature] - - # Generate kernel with optimal parameters - compiled_kernel = self._generate_optimized_kernel( - kernel_signature, - self.optimization_parameters - ) - - # Cache for reuse - self.kernel_cache[kernel_signature] = compiled_kernel - - return compiled_kernel -``` - -#### Auto-Tuning Framework - -```python -class UltraFusedAutoTuner: - """ - Automatic tuning system for ultra-fused kernels. - """ - - def __init__(self, search_space, evaluation_metric='throughput'): - self.search_space = search_space - self.evaluation_metric = evaluation_metric - self.tuning_history = [] - - def tune_kernel_parameters(self, model, test_inputs, max_iterations=100): - """ - Auto-tune kernel parameters for optimal performance. - """ - - # Define search space - parameter_space = { - 'block_sizes': { - 'BLOCK_SIZE_B': [1, 2, 4, 8], - 'BLOCK_SIZE_S': [32, 64, 128, 256], - 'BLOCK_SIZE_D': [64, 128, 256], - 'BLOCK_SIZE_H': [1, 2, 4] - }, - 'memory_optimization': { - 'use_shared_memory': [True, False], - 'vectorization_factor': [1, 2, 4], - 'prefetch_distance': [0, 1, 2] - }, - 'compute_optimization': { - 'unroll_factor': [1, 2, 4, 8], - 'pipeline_stages': [1, 2, 3], - 'register_allocation_strategy': ['high_occupancy', 'high_performance'] - } - } - - # Bayesian optimization for efficient parameter search - best_params, best_performance = self._bayesian_optimization( - parameter_space, model, test_inputs, max_iterations - ) - - return best_params, best_performance - - def _bayesian_optimization(self, param_space, model, inputs, max_iter): - """Bayesian optimization for parameter tuning.""" - # Efficient parameter space exploration - # (Simplified implementation) - - best_params = None - best_performance = 0 - - for iteration in range(max_iter): - # Sample parameters from posterior distribution - params = self._sample_parameters(param_space) - - # Evaluate performance - performance = 
self._evaluate_performance(model, inputs, params) - - # Update best configuration - if performance > best_performance: - best_performance = performance - best_params = params - - # Update posterior distribution - self._update_posterior(params, performance) - - return best_params, best_performance -``` - -## Files and Structure - -``` -version4_pytorch_sdpa/ -├── README.md # This file -├── tiny_llama_v4.py # Ultra-fused implementation -├── run_ultra_profiling.py # Advanced profiling suite -├── exercises/ -│ └── exercise1_ultra_fusion.md # Ultra-fusion deep dive -└── results/ # Generated analysis results -``` - -### Performance Engineering Principles - -#### Roofline Model Integration - -```python -class UltraFusedRooflineAnalysis: - """ - Roofline model analysis for ultra-fused kernels. - """ - - def __init__(self, gpu_specifications): - self.peak_compute = gpu_specifications['peak_flops_fp32'] # FLOPS/second - self.peak_bandwidth = gpu_specifications['memory_bandwidth'] # Bytes/second - self.ridge_point = self.peak_compute / self.peak_bandwidth # FLOPS/byte - - def analyze_kernel_performance(self, kernel_name, flops, bytes_accessed): - """ - Analyze kernel performance using roofline model. 
- """ - arithmetic_intensity = flops / bytes_accessed - - if arithmetic_intensity < self.ridge_point: - # Memory-bound operation - theoretical_performance = arithmetic_intensity * self.peak_bandwidth - bottleneck = 'memory_bandwidth' - optimization_strategy = 'reduce_memory_access' - else: - # Compute-bound operation - theoretical_performance = self.peak_compute - bottleneck = 'compute_throughput' - optimization_strategy = 'increase_arithmetic_intensity' - - analysis_result = { - 'kernel': kernel_name, - 'arithmetic_intensity': arithmetic_intensity, - 'ridge_point': self.ridge_point, - 'bottleneck': bottleneck, - 'theoretical_peak': theoretical_performance, - 'optimization_strategy': optimization_strategy - } - - return analysis_result - -# Example roofline analysis for ultra-fused transformer block -TRANSFORMER_BLOCK_ROOFLINE = { - 'ultra_fused_block': { - 'total_flops': 4 * batch_size * seq_len * hidden_dim * (hidden_dim + intermediate_dim), - 'memory_bytes': batch_size * seq_len * hidden_dim * 8, # Input + output only! - 'arithmetic_intensity': 'total_flops / memory_bytes', - 'expected_intensity': '~500 FLOPS/byte (highly compute-bound)', - 'performance_regime': 'compute_bound (good for GPUs)' - }, - 'baseline_comparison': { - 'baseline_arithmetic_intensity': '~50 FLOPS/byte', - 'ultra_fused_intensity': '~500 FLOPS/byte', - 'improvement': '10x better arithmetic intensity' - } -} -``` - -#### Advanced Memory Optimization Techniques - -```python -class UltraMemoryOptimizer: - """ - Advanced memory optimization for ultra-fused kernels. 
- """ - - def __init__(self, gpu_memory_hierarchy): - self.memory_hierarchy = gpu_memory_hierarchy - self.optimization_strategies = self._initialize_strategies() - - def _initialize_strategies(self): - return { - 'register_optimization': { - 'vectorization': 'Use float4 for 4x memory throughput', - 'register_blocking': 'Tile data to fit in register file', - 'spill_minimization': 'Careful variable lifetime management' - }, - 'shared_memory_optimization': { - 'bank_conflict_avoidance': 'Pad data structures to avoid conflicts', - 'coalesced_loading': 'Ensure optimal memory access patterns', - 'double_buffering': 'Overlap computation with memory access' - }, - 'global_memory_optimization': { - 'prefetching': 'Prefetch next data blocks during computation', - 'streaming': 'Stream large data through memory hierarchy', - 'compression': 'Use mixed precision to reduce bandwidth' - } - } - - def optimize_memory_access_pattern(self, kernel_specification): - """ - Optimize memory access patterns for ultra-fused kernels. 
- """ - - optimizations = { - 'coalescing_optimization': { - 'thread_mapping': 'Map consecutive threads to consecutive memory', - 'memory_stride': 'Ensure stride-1 access patterns', - 'alignment': 'Align data to cache line boundaries' - }, - 'cache_optimization': { - 'temporal_locality': 'Reuse data while in cache', - 'spatial_locality': 'Access nearby memory locations', - 'cache_blocking': 'Tile computations to fit in cache' - }, - 'bandwidth_optimization': { - 'vectorized_loads': 'Use SIMD memory instructions', - 'memory_pipelining': 'Overlap memory with computation', - 'bandwidth_balancing': 'Balance read/write bandwidth usage' - } - } - - return optimizations -``` - -## Key Components Deep Dive - -### Ultra-Fused Transformer Block - -**Input Processing:** -```python -# Single token, entire transformer block -residual_1 = x_token -# Attention norm → QKV → Attention → Output → Residual -# FFN norm → Gate/Up → SiLU → Down → Residual -final_output = residual_2 + ffn_output -``` - -**Memory Efficiency:** - -- **Register Reuse**: Maximizes data kept in fast registers -- **Memory Coalescing**: Optimal access patterns for global memory -- **Cache Optimization**: Designed for L1/L2 cache efficiency - -### Advanced Performance Features - -**1. Adaptive Block Sizing:** -```python -BLOCK_SIZE_B: tl.constexpr, # Batch dimension blocking -BLOCK_SIZE_S: tl.constexpr, # Sequence dimension blocking -BLOCK_SIZE_D: tl.constexpr, # Feature dimension blocking -BLOCK_SIZE_H: tl.constexpr, # Head dimension blocking -``` - -**2. Ultra-Mode Toggle:** -```python -model.enable_ultra_mode(True) # Maximum performance -model.enable_ultra_mode(False) # Fallback for debugging -``` - -**3. Performance Prediction:** -```python -# Built-in performance modeling -predicted_time = predict_performance(batch_size, seq_len, d_model) -``` - -## Quick Start - -### 1. 
Run Ultra-Fused Model - -```bash -cd version4_pytorch_sdpa/ -python3 tiny_llama_v4.py -``` - -**Expected Output:** -``` -Compiling ultra-fused kernels... -Ultra-fused kernels compiled successfully! - -=== Ultra-Fused Model Benchmark === -Testing: batch_size=1, seq_len=128 - Ultra-fused: XX.XX ms - Standard: YY.YY ms - Speedup: Z.ZZx - Throughput: XXXX tokens/s - Memory: X.XX GB - -Average speedup: X.XXx -Maximum speedup: Y.YYx -Peak throughput: ZZZZ tokens/s -``` - - - -## Performance Analysis - -### Expected Performance Gains - -| Metric | Baseline | Version 2 | Version 3 | Version 4 | V4 Total Gain | -|--------|----------|-----------|-----------|-----------|---------------| -| Execution Time | 100% | 50-70% | 30-45% | **20-30%** | **3.3-5.0x** | -| Memory Usage | 100% | 40-60% | 20-35% | **10-20%** | **5.0-10x** | -| Kernel Launches | 100% | 30-50% | 15-25% | **8-12%** | **8.3-12.5x** | -| Cache Efficiency | 100% | 120-140% | 150-180% | **200-250%** | **2.0-2.5x** | - -### Scaling Characteristics - -**Sequence Length Scaling:** - -- **Short sequences (≤256)**: 4.0-5.0x speedup -- **Medium sequences (512)**: 3.5-4.5x speedup -- **Long sequences (1024+)**: 3.0-4.0x speedup - -**Batch Size Scaling:** - -- **Single batch**: 3.5-4.5x speedup -- **Small batches (2-4)**: 4.0-5.0x speedup -- **Large batches (8+)**: 3.5-4.5x speedup - -**Model Size Scaling:** - -- **Small models**: 4.5-5.0x speedup -- **Medium models**: 4.0-4.5x speedup -- **Large models**: 3.5-4.0x speedup - -## Advanced Features - -### 1. Performance Engineering - -**Roofline Model Integration:** -```python -arithmetic_intensity = total_flops / total_bytes -if arithmetic_intensity > compute_bound_threshold: - # Optimize for compute efficiency -else: - # Optimize for memory bandwidth -``` - -**Register Pressure Management:** -```python -# Intelligent register allocation -# Float4 vectorization -# Optimal loop unrolling -# Compiler hint optimization -``` - -### 2. 
Memory Hierarchy Optimization - -**L1 Cache Optimization:** - -- Temporal locality maximization -- Spatial locality optimization -- Cache line utilization - -**L2 Cache Strategy:** - -- Weight reuse patterns -- Prefetching optimization -- Bank conflict avoidance - -**Global Memory Efficiency:** - -- Coalescing optimization -- Bandwidth utilization -- Access pattern optimization - -### 3. Adaptive Optimization - -**Hardware Detection:** -```python -# Automatic GPU architecture detection -# Optimal kernel parameter selection -# Performance characteristic adaptation -``` - -**Dynamic Configuration:** -```python -# Runtime performance optimization -# Adaptive block size selection -# Memory configuration tuning +python tiny_llama_v4.py --batch-size 8 --seq-len 128 --num-steps 10 ``` -## Hands-on Exercises - -### Exercise 1: Ultra-Fusion Architecture (90 minutes) - -**Focus Areas:** - -- Ultra-fusion architecture analysis -- Advanced memory management -- Performance engineering deep dive -- Roofline model application - -**Key Learning Objectives:** - -1. Understand ultra-fusion principles and trade-offs -2. Analyze advanced memory hierarchy optimization -3. Apply performance engineering techniques -4. Master roofline model analysis - -## Advanced Topics - -### Performance Engineering Principles - -1. **Kernel Fusion Strategies** - - Identify fusion opportunities - - Balance register pressure vs parallelism - - Optimize memory access patterns - -2. **Memory Hierarchy Mastery** - - Register allocation optimization - - Cache utilization maximization - - Global memory bandwidth efficiency - -3. **Hardware-Specific Optimization** - - GPU architecture adaptation - - Instruction-level optimization - - Memory subsystem tuning - -### Optimization Methodology - -1. **Profile-Guided Optimization** - ```bash - # Profile → Analyze → Optimize → Validate - # Identify bottlenecks - # Apply targeted optimizations - # Measure improvements - ``` - -2. 
**Performance Modeling** - ```python - # Predict performance for new configurations - # Guide optimization decisions - # Validate theoretical vs actual performance - ``` - -3. **Iterative Refinement** - ```python - # Continuous optimization cycle - # A/B testing of optimizations - # Performance regression detection - ``` - - - -### Performance Metrics - -**Key Metrics to Monitor:** - -1. **Kernel Efficiency**: Execution time, occupancy, utilization -2. **Memory Performance**: Bandwidth, cache hit rates, access patterns -3. **System Integration**: CPU-GPU coordination, data transfer efficiency - -## Production Considerations - -### Deployment Optimization - -1. **Model Compilation** - ```python - # Precompile for target hardware - # Cache compiled kernels - # Version management - ``` - -2. **Runtime Optimization** - ```python - # Dynamic adaptation - # Performance monitoring - # Fallback strategies - ``` - -3. **Scalability** - ```python - # Multi-GPU scaling - # Memory management - # Load balancing - ``` - -### Monitoring and Debugging - -1. **Performance Monitoring** - - Real-time performance metrics - - Trend analysis - - Anomaly detection - -2. **Debugging Tools** - - Kernel-level debugging - - Memory access visualization - - Performance bottleneck identification - -## Limitations and Trade-offs - -### Current Limitations - -1. **Hardware Dependency**: Optimized for specific GPU architectures -2. **Complexity**: Increased development and maintenance complexity -3. **Debugging Difficulty**: More challenging to debug fused kernels -4. 
**Portability**: May require adaptation for different hardware - -### Trade-off Analysis - -| Aspect | Benefit | Cost | -|--------|---------|------| -| Performance | 3.5-5.0x speedup | Development complexity | -| Memory Efficiency | 85-98% reduction | Debugging difficulty | -| Kernel Fusion | Minimal launches | Hardware dependency | -| Optimization | Maximum efficiency | Maintenance overhead | - -## Future Directions - -### Emerging Techniques - -1. **AI-Guided Optimization** - - ML-based kernel optimization - - Automated parameter tuning - - Performance prediction - -2. **Hardware Co-design** - - Kernel-hardware co-optimization - - Custom instruction utilization - - Memory hierarchy adaptation - -3. **Cross-Layer Optimization** - - Model-kernel co-design - - End-to-end optimization - - System-level efficiency - -### Research Opportunities - -1. **Automatic Fusion** - - Compiler-driven optimization - - Pattern recognition - - Optimization space exploration - -2. **Adaptive Optimization** - - Runtime adaptation - - Workload-specific tuning - - Dynamic reconfiguration -## Conclusion +On the validated container, version 4 landed very close to version 3. That is the main comparison to keep in mind when profiling this directory. -Version 4 represents the state-of-the-art in GPU optimization for transformer models. Through ultra-fusion techniques, it achieves: +## Profiling workflow -- **Maximum Performance**: 3.5-5.0x speedup over baseline -- **Optimal Efficiency**: 85-98% memory reduction -- **Advanced Techniques**: State-of-the-art optimization methods -- **Production Ready**: Robust, scalable implementation +Use the same scripts as the earlier versions: -This implementation demonstrates the pinnacle of what's possible with current GPU optimization techniques while providing a foundation for future advances. 
+- `./get_hotspots.sh` +- `./get_trace.sh` +- `./get_counters.sh` +- `./get_rocprof_compute.sh` +- `./get_rocprof_sys.sh` -## Resources +Start with `./get_hotspots.sh` and `./get_trace.sh`. The main question is whether the attention region and dominant kernel set change materially relative to version 3, even when the overall step time remains similar. -### Technical Documentation -- [Triton Advanced Programming Guide](https://triton-lang.org/main/programming-guide/index.html) -- [AMD GPU Architecture](https://rocmdocs.amd.com/en/latest/Programming_Guides/Programming-Guides.html) -- [Performance Optimization Best Practices](https://rocmdocs.amd.com/en/latest/Programming_Guides/Performance_optimization.html) +## Comparison target -### Research Papers -- [FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https://arxiv.org/abs/2205.14135) -- [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](https://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf) -- [The Roofline Model: A Tool for Performance Analysis](https://crd.lbl.gov/departments/computer-science/PAR/research/roofline/) +Compare this directory directly against [`../version3_triton`](../version3_triton). The interesting result is not whether version 4 is slightly faster or slower on one machine. It is whether the optimized behavior is preserved while relying on a framework path. 
-### Community Resources -- [AMD ROCm Community](https://github.com/RadeonOpenCompute/ROCm) -- [Triton Community](https://github.com/openai/triton) -- [GPU Optimization Forums](https://developer.amd.com/community/) +## References +- comparison across versions: [`../VERSION_COMPARISON.md`](../VERSION_COMPARISON.md) +- PyTorch SDPA overview: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html +- Perfetto UI: https://ui.perfetto.dev/ diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/exercises/exercise1_ultra_fusion.md b/MLExamples/TinyTransformer/version4_pytorch_sdpa/exercises/exercise1_ultra_fusion.md index 760496c1..f3c6e31c 100644 --- a/MLExamples/TinyTransformer/version4_pytorch_sdpa/exercises/exercise1_ultra_fusion.md +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/exercises/exercise1_ultra_fusion.md @@ -1,525 +1,78 @@ - ## Exercise 1: Ultra-Fusion Architecture and Design -`exercise1_ultra_fusion.md` from `HPCTrainingExamples/MLExamples/TinyTransformer/version4_pytorch_sdpa/exercises` in the Training Examples repository - -**Objective**: Understand ultra-fusion principles and analyze the most advanced optimization techniques in GPU kernel development. - -**Time**: 90 minutes +**Objective**: Understand ultra-fusion principles and analyze advanced GPU kernel optimization techniques. -**Prerequisites**: Completed all exercises in Versions 1-3 +**Time**: 90 minutes | **Prerequisites**: Completed all exercises in Versions 1-3 ### Background -Ultra-fusion represents the pinnacle of GPU optimization, where entire transformer blocks are processed in single kernel launches with minimal memory traffic. 
This exercise explores the advanced techniques used to achieve maximum performance: - -- Cross-layer kernel fusion -- Advanced memory hierarchy optimization -- Ultra-efficient data flow patterns -- State-of-the-art performance engineering - -### Part A: Ultra-Fusion Architecture Analysis (30 minutes) +Ultra-fusion represents the pinnacle of GPU optimization, where entire transformer blocks are processed in single kernel launches with minimal memory traffic. -#### Step 1: Understand the Ultra-Fused Transformer Block +### Part A: Ultra-Fusion Architecture Analysis -Examine the `ultra_fused_transformer_block_kernel` in `tiny_llama_v4.py`: +Examine `ultra_fused_transformer_block_kernel` in `tiny_llama_v4.py`: ```python @triton.jit def ultra_fused_transformer_block_kernel( - # Input and output tensors x_ptr, output_ptr, - # All weights (attention + FFN + norms) q_weight_ptr, k_weight_ptr, v_weight_ptr, o_weight_ptr, gate_weight_ptr, up_weight_ptr, down_weight_ptr, attn_norm_weight_ptr, ffn_norm_weight_ptr, - # Dimensions and constants batch_size, seq_len, d_model, n_heads, d_ff, head_dim, scale, norm_eps, - # Advanced block sizing BLOCK_SIZE_B, BLOCK_SIZE_S, BLOCK_SIZE_D, BLOCK_SIZE_H, ): ``` -**Architecture Analysis Questions:** - -1. **Fusion Scope**: What operations are fused together in this single kernel? -2. **Memory Efficiency**: How does this kernel minimize memory traffic compared to Version 3? -3. **Computational Overlap**: How are different computations overlapped for efficiency? -4. **Register Usage**: How is register pressure managed with so many operations? - -#### Step 2: Analyze the Computation Flow - -Follow the ultra-fused execution pattern: - -```python -# Store original input for residual -residual_1 = x_token - -# === ATTENTION LAYER NORM === -variance = tl.sum(x_token * x_token) / d_model -inv_std = 1.0 / tl.sqrt(variance + norm_eps) -x_normed = x_token * inv_std * attn_norm_weights - -# === ULTRA-FUSED ATTENTION === -# Parallel QKV computation... 
- -# === FIRST RESIDUAL CONNECTION === -x_token = residual_1 + attn_output -residual_2 = x_token - -# === FFN LAYER NORM === -# === ULTRA-FUSED SWIGLU FFN === -# === FINAL RESIDUAL CONNECTION === -``` - -**Flow Analysis Tasks:** - -1. **Data Dependencies**: Map out all data dependencies in the computation -2. **Memory Reuse**: Identify opportunities for register and shared memory reuse -3. **Parallelization**: Analyze how different operations can be parallelized -4. **Critical Path**: Identify the critical path through the computation - -#### Step 3: Compare with Previous Versions - -Create a comparison table of kernel launches: - -| Operation | Version 1 | Version 2 | Version 3 | Version 4 | -|-----------|-----------|-----------|-----------|-----------| -| Input Layer Norm | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| Q Projection | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| K Projection | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| V Projection | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| Attention Compute | Multiple | Fused | 1 kernel | **Fused** | -| Output Projection | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| Residual Add | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| FFN Layer Norm | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| Gate Projection | 1 kernel | Fused | 1 kernel | **Fused** | -| Up Projection | 1 kernel | Fused | 1 kernel | **Fused** | -| SiLU Activation | 1 kernel | Fused | 1 kernel | **Fused** | -| Down Projection | 1 kernel | 1 kernel | 1 kernel | **Fused** | -| Final Residual | 1 kernel | 1 kernel | 1 kernel | **Fused** | +**Analysis Questions:** +1. What operations are fused in this single kernel? +2. How does this minimize memory traffic vs Version 3? +3. How is register pressure managed? 
+ +### Part B: Kernel Launch Comparison + +| Operation | V1 | V2 | V3 | V4 | +|-----------|----|----|----|----| +| Input Layer Norm | 1 | 1 | 1 | **Fused** | +| QKV Projections | 3 | 3 | 3 | **Fused** | +| Attention Compute | Multi | Fused | 1 | **Fused** | +| Output Projection | 1 | 1 | 1 | **Fused** | +| FFN (Gate/Up/Down) | 3 | Fused | 3 | **Fused** | +| Residual Adds | 2 | 2 | 2 | **Fused** | | **Total Kernels** | **~12** | **~8** | **~4** | **1** | -**Performance Implications:** - -1. **Launch Overhead**: Calculate the kernel launch overhead savings -2. **Memory Bandwidth**: Estimate memory bandwidth reduction -3. **Cache Efficiency**: Analyze L1/L2 cache utilization improvements - -### Part B: Advanced Memory Management Analysis (35 minutes) - -#### Step 4: Memory Hierarchy Optimization - -Analyze how the ultra-fused kernel optimizes memory usage: - -```python -def analyze_memory_hierarchy(): - """Analyze memory usage patterns in ultra-fused kernel.""" - - # Model configuration - batch_size, seq_len, d_model = 4, 512, 2048 - n_heads = 32 - head_dim = d_model // n_heads - d_ff = int(2.67 * d_model) - - print("Ultra-Fused Memory Hierarchy Analysis") - print("=" * 45) - - # Register usage analysis - registers_per_token = ( - d_model + # Input token - d_model + # Residual 1 - d_model + # Normed input - n_heads * head_dim + # Q projections - n_heads * head_dim + # K projections - n_heads * head_dim + # V projections - d_model + # Attention output - d_model + # Residual 2 - d_ff + # FFN intermediate - d_model # Final output - ) - - print(f"Estimated register usage per token: {registers_per_token}") - print(f"Register pressure: {registers_per_token * 4 / 1024:.1f} KB per token") - - # Global memory access patterns - input_reads = batch_size * seq_len * d_model * 4 # Read input once - weight_reads = ( - # Attention weights (read once per token) - 4 * d_model * d_model * 4 + # Q, K, V, O weights - # FFN weights (read once per token) - 3 * d_model * d_ff * 4 + # Gate, 
Up, Down weights - # Norm weights (read once per token) - 2 * d_model * 4 # Attention + FFN norms - ) * batch_size * seq_len - - output_writes = batch_size * seq_len * d_model * 4 # Write output once - - total_memory_traffic = input_reads + weight_reads + output_writes - - print(f"\nMemory Traffic Analysis:") - print(f" Input reads: {input_reads / 1e6:.2f} MB") - print(f" Weight reads: {weight_reads / 1e6:.2f} MB") - print(f" Output writes: {output_writes / 1e6:.2f} MB") - print(f" Total: {total_memory_traffic / 1e6:.2f} MB") +### Part C: Roofline Analysis - # Compare with previous versions - version3_memory = ( - input_reads * 4 + # Read input 4 times (each kernel) - weight_reads * 1.5 + # Some weight reuse - output_writes * 4 # Multiple intermediate writes - ) +For batch_size=4, seq_len=512, d_model=2048: +- Calculate total FLOPs (attention + FFN + norms) +- Calculate total memory traffic (input + weights + output) +- Compute arithmetic intensity (FLOPs/byte) +- Determine if compute-bound or memory-bound - memory_reduction = (version3_memory - total_memory_traffic) / version3_memory - print(f"\nMemory traffic reduction vs Version 3: {memory_reduction * 100:.1f}%") +### Results Template - return { - 'register_usage': registers_per_token, - 'total_memory_mb': total_memory_traffic / 1e6, - 'memory_reduction': memory_reduction - } +| Metric | Value | +|--------|-------| +| Register usage per token | | +| Memory traffic reduction | % | +| Arithmetic intensity | FLOPs/byte | +| Performance bottleneck | (compute/memory) | +| Kernel count reduction | x | -# Run memory analysis -memory_analysis = analyze_memory_hierarchy() -``` - -#### Step 5: Cache Optimization Strategies - -Examine cache optimization techniques: - -```python -def analyze_cache_optimization(): - """Analyze cache optimization in ultra-fused kernels.""" - - print("\nCache Optimization Analysis") - print("=" * 35) - - # L1 cache utilization - l1_cache_size = 128 * 1024 # 128KB typical L1 cache - 
l2_cache_size = 8 * 1024 * 1024 # 8MB typical L2 cache - - # Data reuse analysis - d_model = 2048 - seq_len = 512 - - # Input token reuse - input_reuse_factor = 4 # Used in norm, Q, K, V projections - print(f"Input data reuse factor: {input_reuse_factor}x") - - # Weight reuse patterns - attention_weight_reuse = seq_len # Each weight used for all tokens - ffn_weight_reuse = seq_len # FFN weights reused across sequence - - print(f"Attention weight reuse: {attention_weight_reuse}x") - print(f"FFN weight reuse: {ffn_weight_reuse}x") - - # Cache hit rate estimation - working_set_size = d_model * 4 * 4 # Input + weights for one token - l1_hit_rate = min(1.0, l1_cache_size / working_set_size) - - print(f"Estimated L1 cache hit rate: {l1_hit_rate * 100:.1f}%") - - # Temporal locality analysis - temporal_locality_score = ( - input_reuse_factor + - attention_weight_reuse / seq_len + - ffn_weight_reuse / seq_len - ) / 3 - - print(f"Temporal locality score: {temporal_locality_score:.2f}") - - return { - 'l1_hit_rate': l1_hit_rate, - 'temporal_locality': temporal_locality_score, - 'working_set_mb': working_set_size / 1e6 - } - -# Run cache analysis -cache_analysis = analyze_cache_optimization() -``` - -#### Step 6: Register Pressure Management - -Analyze register usage optimization: - -```python -def analyze_register_pressure(): - """Analyze register pressure and management strategies.""" - - print("\nRegister Pressure Analysis") - print("=" * 30) - - # GPU specifications (example for MI250X) - registers_per_cu = 65536 # 64K registers per CU - max_threads_per_cu = 2048 - registers_per_thread_max = registers_per_cu // max_threads_per_cu - - print(f"Max registers per thread: {registers_per_thread_max}") - - # Estimate register usage in ultra-fused kernel - d_model = 2048 - n_heads = 32 - head_dim = d_model // n_heads - - registers_needed = ( - d_model // 4 + # Input token (float4 packing) - d_model // 4 + # Residual storage - n_heads + # Attention accumulators - head_dim + # Head 
computation temp - 64 + # Loop counters, indices, etc. - 32 # Compiler temporaries - ) - - print(f"Estimated registers needed: {registers_needed}") - print(f"Register utilization: {registers_needed / registers_per_thread_max * 100:.1f}%") - - # Occupancy impact - max_threads_with_registers = registers_per_cu // registers_needed - occupancy = min(max_threads_with_registers / max_threads_per_cu, 1.0) - - print(f"Theoretical occupancy: {occupancy * 100:.1f}%") - - # Register optimization strategies - print(f"\nOptimization Strategies:") - print(f"1. Float4 vectorization reduces registers by 4x") - print(f"2. Loop unrolling vs register pressure trade-off") - print(f"3. Shared memory for intermediate results") - print(f"4. Careful compiler hint placement") - - return { - 'registers_needed': registers_needed, - 'occupancy': occupancy, - 'utilization_percent': registers_needed / registers_per_thread_max * 100 - } - -# Run register analysis -register_analysis = analyze_register_pressure() -``` - -### Part C: Performance Engineering Deep Dive (25 minutes) - -#### Step 7: Roofline Model Analysis - -Apply roofline analysis to ultra-fused kernels: - -```python -def roofline_analysis(): - """Perform roofline model analysis for ultra-fused kernel.""" - - print("\nRoofline Model Analysis") - print("=" * 25) - - # Problem size - batch_size, seq_len, d_model = 4, 512, 2048 - n_heads = 32 - d_ff = int(2.67 * d_model) - - # Calculate FLOPs for entire transformer block - # Attention FLOPs - qkv_flops = 3 * batch_size * seq_len * d_model * d_model * 2 # Q, K, V projections - attn_flops = batch_size * n_heads * seq_len * seq_len * d_model // n_heads * 2 # Attention matrix - o_proj_flops = batch_size * seq_len * d_model * d_model * 2 # Output projection - - attention_total_flops = qkv_flops + attn_flops + o_proj_flops - - # FFN FLOPs - gate_up_flops = 2 * batch_size * seq_len * d_model * d_ff * 2 # Gate + Up projections - silu_flops = batch_size * seq_len * d_ff * 4 # SiLU activation (~4 
ops) - down_flops = batch_size * seq_len * d_ff * d_model * 2 # Down projection - - ffn_total_flops = gate_up_flops + silu_flops + down_flops - - # Layer norm FLOPs (2 layer norms) - norm_flops = 2 * batch_size * seq_len * d_model * 8 # Variance + normalization - - total_flops = attention_total_flops + ffn_total_flops + norm_flops - - # Memory traffic (ultra-optimized) - input_bytes = batch_size * seq_len * d_model * 4 - weight_bytes = (4 * d_model * d_model + 3 * d_model * d_ff + 2 * d_model) * 4 - output_bytes = batch_size * seq_len * d_model * 4 - - total_bytes = input_bytes + weight_bytes + output_bytes - - # Arithmetic intensity - arithmetic_intensity = total_flops / total_bytes - - print(f"Problem size: {batch_size}x{seq_len}x{d_model}") - print(f"Total FLOPs: {total_flops / 1e9:.2f} GFLOPs") - print(f"Total memory: {total_bytes / 1e6:.2f} MB") - print(f"Arithmetic intensity: {arithmetic_intensity:.2f} FLOPs/byte") - - # GPU specifications (MI250X example) - peak_flops = 47.9e12 # 47.9 TFLOPS FP32 - peak_bandwidth = 1638e9 # 1.638 TB/s - - # Roofline analysis - compute_bound_threshold = peak_flops / peak_bandwidth - - print(f"\nGPU Specifications:") - print(f"Peak compute: {peak_flops / 1e12:.1f} TFLOPS") - print(f"Peak bandwidth: {peak_bandwidth / 1e9:.0f} GB/s") - print(f"Compute-bound threshold: {compute_bound_threshold:.2f} FLOPs/byte") - - if arithmetic_intensity > compute_bound_threshold: - print(f"PASS Kernel is compute-bound (good for GPU utilization)") - bottleneck = "compute" - theoretical_performance = peak_flops - else: - print(f"WARNING: Kernel is memory-bound (optimize memory access)") - bottleneck = "memory" - theoretical_performance = arithmetic_intensity * peak_bandwidth - - # Performance potential - performance_potential = theoretical_performance / 1e12 - - print(f"Theoretical peak performance: {performance_potential:.1f} TFLOPS") - - return { - 'arithmetic_intensity': arithmetic_intensity, - 'bottleneck': bottleneck, - 
'performance_potential_tflops': performance_potential, - 'compute_bound': arithmetic_intensity > compute_bound_threshold - } - -# Run roofline analysis -roofline_results = roofline_analysis() -``` - -#### Step 8: Performance Prediction Model - -Create a performance prediction model: - -```python -def performance_prediction_model(): - """Create performance prediction model for different configurations.""" - - print("\nPerformance Prediction Model") - print("=" * 32) - - # Base performance characteristics - base_config = { - 'batch_size': 4, - 'seq_len': 512, - 'd_model': 2048, - 'measured_time_ms': 15.0 # Example measured time - } - - def predict_performance(batch_size, seq_len, d_model): - """Predict performance for given configuration.""" - - # Scaling factors based on algorithmic complexity - batch_scale = batch_size / base_config['batch_size'] - seq_scale = (seq_len / base_config['seq_len']) ** 1.8 # Slightly sub-quadratic due to optimizations - model_scale = (d_model / base_config['d_model']) ** 2.5 # Between O(n^2) and O(n^3) - - # Memory bandwidth limiting factor - memory_factor = max(1.0, (batch_size * seq_len * d_model) / (4 * 512 * 2048) * 0.8) - - predicted_time = ( - base_config['measured_time_ms'] * - batch_scale * seq_scale * model_scale * memory_factor - ) - - return predicted_time - - # Test predictions - test_configs = [ - (1, 128, 1024), - (2, 256, 1536), - (4, 512, 2048), - (8, 512, 2048), - (4, 1024, 2048), - (4, 512, 4096) - ] - - print("Performance Predictions:") - print("| Batch | Seq Len | Model Dim | Predicted Time (ms) | Throughput (tokens/s) |") - print("|-------|---------|-----------|--------------------|-----------------------|") - - for batch_size, seq_len, d_model in test_configs: - predicted_time = predict_performance(batch_size, seq_len, d_model) - throughput = batch_size * seq_len / (predicted_time / 1000) - - print(f"| {batch_size:5d} | {seq_len:7d} | {d_model:9d} | {predicted_time:18.2f} | {throughput:21.0f} |") - - return 
test_configs - -# Run performance predictions -performance_predictions = performance_prediction_model() -``` - -### Exercise Results - -#### Ultra-Fusion Analysis Summary - -Fill in your analysis results: - -**Memory Efficiency:** - -- Register usage per token: _____ -- Memory traffic reduction: _____% -- L1 cache hit rate: _____% - -**Performance Characteristics:** - -- Arithmetic intensity: _____ FLOPs/byte -- Performance bottleneck: _____ (compute/memory) -- Theoretical peak: _____ TFLOPS - -**Optimization Impact:** - -- Kernel count reduction: _____x -- Memory bandwidth savings: _____% -- Register utilization: _____% - -#### Key Insights +### Key Insights 1. **Most Critical Optimization**: _____ -2. **Biggest Performance Bottleneck**: _____ -3. **Next Optimization Opportunity**: _____ -4. **Scalability Limitations**: _____ +2. **Biggest Bottleneck**: _____ +3. **Scalability Limitation**: _____ ### Discussion Questions -1. **Ultra-Fusion Trade-offs**: What are the main trade-offs of ultra-fusion (complexity, maintainability, portability)? - -2. **Hardware Dependencies**: How do ultra-fused kernels depend on specific GPU architectures? - -3. **Optimization Limits**: What are the theoretical limits of kernel fusion optimization? - -4. **Development Complexity**: How does ultra-fusion impact development time and debugging complexity? - -5. **Future Directions**: What future GPU architecture features would enable even better ultra-fusion? - -### Advanced Challenges - -#### Challenge 1: Register Optimization -Redesign a portion of the ultra-fused kernel to reduce register pressure while maintaining performance. - -#### Challenge 2: Memory Pattern Analysis -Implement a tool to visualize memory access patterns in the ultra-fused kernel. - -#### Challenge 3: Performance Modeling -Create a detailed performance model that predicts ultra-fused kernel performance across different GPU architectures. 
- -#### Challenge 4: Debugging Framework -Design a debugging framework for ultra-fused kernels that can isolate performance issues. - -### Next Steps - -This exercise completes your understanding of ultra-fusion techniques. In Exercise 2, you'll: - -- Compare all four versions comprehensively -- Analyze performance scaling characteristics -- Create optimization decision frameworks -- Design production deployment strategies - -### Additional Resources +1. What are the trade-offs of ultra-fusion (complexity, maintainability, portability)? +2. How do ultra-fused kernels depend on specific GPU architectures? +3. What are the theoretical limits of kernel fusion? -- [Advanced GPU Programming Patterns](https://developer.nvidia.com/blog/cuda-pro-tip-optimize-pointer-aliasing) -- [Memory Optimization Techniques](https://rocmdocs.amd.com/en/latest/Programming_Guides/Performance_optimization.html) -- [Roofline Model Deep Dive](https://crd.lbl.gov/departments/computer-science/PAR/research/roofline/) -- [Register Pressure Analysis](https://developer.nvidia.com/blog/cuda-pro-tip-understand-fat-binaries-jit-caching/) +### Resources +- [AMD Performance Optimization Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/Performance_optimization.html) +- [Roofline Model](https://crd.lbl.gov/departments/computer-science/PAR/research/roofline/) diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_counters.sh b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_counters.sh new file mode 100755 index 00000000..44c8aad8 --- /dev/null +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_counters.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Collect kernel trace data for TinyTransformer V4 with rocprofv3. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v4.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v4" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +OUTPUT_DIR="$(make_output_dir counters)" + +echo "Starting rocprofv3 kernel trace for TinyTransformer V4..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" +echo "To analyze results:" + +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_trace.csv")" +AGENT_INFO_FILE="" + +if [ -n "$CSV_FILE" ]; then + CSV_PREFIX="${CSV_FILE%_kernel_trace.csv}" + MATCHING_AGENT_INFO="${CSV_PREFIX}_agent_info.csv" + if [ -f "$MATCHING_AGENT_INFO" ]; then + AGENT_INFO_FILE="$MATCHING_AGENT_INFO" + fi +fi + +if [ -z "$AGENT_INFO_FILE" ]; then + AGENT_INFO_FILE="$(select_largest_match "$OUTPUT_DIR" "*_agent_info.csv")" +fi + +if [ -n "$CSV_FILE" ]; then + echo " Kernel trace CSV: $CSV_FILE" +fi +if [ -n "$AGENT_INFO_FILE" ]; then + echo " Agent info CSV: $AGENT_INFO_FILE" +fi +if [ -n "$DB_FILE" ]; then + echo " SQLite database: $DB_FILE" + echo "" + echo " Export to CSV:" + echo " rocpd2csv -i \"$DB_FILE\" -o kernel_stats.csv" + echo "" + echo " Get kernel summary:" + echo " rocpd summary -i \"$DB_FILE\" --region-categories KERNEL" +fi +if [ -z "$CSV_FILE" ] && [ -z "$DB_FILE" ]; then + echo " WARNING: No ROCm profiler output file was detected under $OUTPUT_DIR" +fi diff --git 
a/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_hotspots.sh b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_hotspots.sh new file mode 100755 index 00000000..8860dfbc --- /dev/null +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_hotspots.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a quick hotspot summary for TinyTransformer V4 with rocprofv3 --stats. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v4.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v4" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir hotspots)" + +echo "Starting rocprofv3 hotspot summary for TinyTransformer V4..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --stats \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_kernel_stats.csv")" +if [ -z "$CSV_FILE" ]; then + CSV_FILE="$(select_largest_match "$OUTPUT_DIR" "*_domain_stats.csv")" +fi +if [ -n "$CSV_FILE" ]; then + echo "Top rows from $CSV_FILE:" + head -11 "$CSV_FILE" +else + echo "WARNING: No hotspot CSV file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_rocprof_compute.sh b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_rocprof_compute.sh new file mode 100755 index 00000000..d4fdcb1f --- /dev/null +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_rocprof_compute.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Collect hardware metrics for TinyTransformer V4 with rocprof-compute. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v4.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v4" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-compute +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +MODE="${1:-no-roof}" +GPU_ARCH="$(detect_gpu_arch)" +SUPPORTED_ARCH_REGEX='^(gfx908|gfx90a|gfx940|gfx941|gfx942)$' + +if [ -n "$GPU_ARCH" ] && ! echo "$GPU_ARCH" | grep -Eq "$SUPPORTED_ARCH_REGEX"; then + echo "Skipping rocprof-compute profiling for TinyTransformer V4..." + echo "Detected GPU architecture: $GPU_ARCH" + echo "rocprof-compute hardware-counter collection currently requires a supported Instinct GPU" + echo "(for example gfx908, gfx90a, gfx940, gfx941, or gfx942)." + echo "Use get_trace.sh, get_hotspots.sh, or get_counters.sh on this system instead." + exit 0 +fi + +OUTPUT_DIR="$(make_output_dir rocprof_compute)" +PROFILE_ROOT="$OUTPUT_DIR/$WORKLOAD_NAME" + +case "$MODE" in + full) + PROFILE_ARGS=(--kernel-names) + MODE_DESCRIPTION="full profile (counters plus roofline stage)" + ;; + roof-only) + PROFILE_ARGS=(--roof-only --kernel-names) + MODE_DESCRIPTION="roofline-only profile" + ;; + no-roof) + PROFILE_ARGS=(--no-roof --kernel-names) + MODE_DESCRIPTION="counter-only profile without roofline collection" + ;; + *) + echo "Usage: $0 [no-roof|full|roof-only]" >&2 + echo " no-roof collect counters only and skip the roofline stage" >&2 + echo " full collect the default counter set and roofline data" >&2 + echo " roof-only collect roofline data only and label roofline kernels" >&2 + exit 1 + ;; +esac + +echo "Starting rocprof-compute hardware metrics for TinyTransformer V4..." 
+if [ -n "$GPU_ARCH" ]; then
+  echo "Detected GPU architecture: $GPU_ARCH"
+fi
+echo "Mode: $MODE_DESCRIPTION"
+echo "Output directory: $OUTPUT_DIR"
+print_workload_summary
+echo ""
+
+rocprof-compute profile \
+  --name "$WORKLOAD_NAME" \
+  --path "$PROFILE_ROOT" \
+  "${PROFILE_ARGS[@]}" \
+  -- "${BENCHMARK_CMD[@]}"
+
+echo ""
+echo "Profiling complete! Results saved to: $OUTPUT_DIR"
+echo ""
+echo "Generated files:"
+print_generated_files "$OUTPUT_DIR" 4
+echo ""
+echo "To analyze results:"
+
+ANALYZE_PATH=""
+for marker in pmc_perf.csv roofline.csv sysinfo.csv; do
+  MARKER_FILE="$(find "$PROFILE_ROOT" -name "$marker" 2>/dev/null | head -1)"
+  if [ -n "$MARKER_FILE" ]; then
+    ANALYZE_PATH="$(dirname "$MARKER_FILE")"
+    break
+  fi
+done
+
+if [ -n "$ANALYZE_PATH" ]; then
+  echo "  Raw data directory: $ANALYZE_PATH"
+  echo ""
+  echo "  1. List detected kernels and dispatches:"
+  echo "     rocprof-compute analyze -p \"$ANALYZE_PATH\" --list-stats"
+  if [ "$MODE" != "roof-only" ]; then
+    echo ""
+    echo "  2. Inspect one dispatch in the default report:"
+    echo "     rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch <dispatch_id>"
+    echo ""
+    echo "  3. Check occupancy and LDS-related limits:"
+    echo "     rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch <dispatch_id> --block 2.1.15 6.2.7"
+    echo ""
+    echo "  4. Check L1/L2 memory speed-of-light metrics:"
+    echo "     rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch <dispatch_id> --block 16.1 17.1"
+  else
+    echo ""
+    echo "  Roofline-only mode does not collect the full counter set."
+    echo "  Re-run with '$0 full' or '$0 no-roof' for detailed block analysis."
+  fi
+else
+  echo "  WARNING: Could not detect the rocprof-compute raw data directory under $PROFILE_ROOT"
+  echo "  Inspect the generated workload tree and use that path with 'rocprof-compute analyze -p'."
+fi diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_rocprof_sys.sh b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_rocprof_sys.sh new file mode 100755 index 00000000..fc693ce3 --- /dev/null +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_rocprof_sys.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Collect a system trace for TinyTransformer V4 with rocprof-sys. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v4.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v4" +TINYTRANSFORMER_DEFAULT_NUM_STEPS=2 +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprof-sys-run +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir rocprof_sys)" + +echo "Starting rocprof-sys trace for TinyTransformer V4..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +pushd "$OUTPUT_DIR" >/dev/null +rocprof-sys-run \ + --profile \ + --trace \ + -- "${BENCHMARK_CMD[@]}" +popd >/dev/null + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "Open the trace in Perfetto:" +PROTO_FILE="$(select_largest_match "$OUTPUT_DIR" "*.proto")" +if [ -n "$PROTO_FILE" ]; then + echo " Perfetto trace file: $PROTO_FILE" + echo " Open it in Perfetto UI: https://ui.perfetto.dev/" +else + echo " WARNING: No .proto file was found under $OUTPUT_DIR" + echo " Inspect the output tree and open the generated trace in Perfetto UI if present." 
+fi diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_trace.sh b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_trace.sh new file mode 100755 index 00000000..7db17d6c --- /dev/null +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/get_trace.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Collect a runtime trace for TinyTransformer V4 with rocprofv3. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TINYTRANSFORMER_SCRIPT_DIR="$SCRIPT_DIR" +TINYTRANSFORMER_MODEL_SCRIPT="tiny_llama_v4.py" +TINYTRANSFORMER_WORKLOAD_NAME="tiny_llama_v4" +source "$SCRIPT_DIR/../profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +ROCM_MAJOR="$(rocm_major_from_version "$ROCM_VERSION")" +OUTPUT_DIR="$(make_output_dir trace)" + +echo "Starting rocprofv3 runtime trace for TinyTransformer V4..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary + +TRACE_CMD=(rocprofv3 --runtime-trace --output-directory "$OUTPUT_DIR") +if [ "$ROCM_MAJOR" = "6" ] || [ "$ROCM_MAJOR" = "7" ]; then + TRACE_CMD+=(--output-format pftrace) +fi + +echo "" +"${TRACE_CMD[@]}" -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! 
Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +PFTRACE_FILE="$(select_largest_match "$OUTPUT_DIR" "*.pftrace")" +DB_FILE="$(select_largest_match "$OUTPUT_DIR" "*.db")" + +if [ -n "$PFTRACE_FILE" ]; then + echo "Perfetto trace file: $PFTRACE_FILE" + echo "Open it in Perfetto UI: https://ui.perfetto.dev/" +elif [ -n "$DB_FILE" ]; then + echo "SQLite database found: $DB_FILE" + echo "Convert it to Perfetto format with:" + echo " rocpd2pftrace -i \"$DB_FILE\" -o trace.pftrace" +else + echo "WARNING: No .pftrace or .db file was found under $OUTPUT_DIR" +fi diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/test_rocpd.sh b/MLExamples/TinyTransformer/version4_pytorch_sdpa/test_rocpd.sh new file mode 100755 index 00000000..2d864165 --- /dev/null +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/test_rocpd.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Test rocpd (ROCm Profiling Daemon) for continuous profiling +# + +set -e + +echo "==========================================" +echo "rocpd Test - Version 4" +echo "==========================================" +echo "" + +# Check if rocpd is available +if ! command -v rocpd &> /dev/null; then + echo "[ERROR] rocpd not found in PATH" + echo "rocpd may not be installed or available in this ROCm version" + exit 1 +fi + +echo "rocpd location: $(which rocpd)" +echo "" + +OUTPUT_DIR="./rocpd/rocpd_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$OUTPUT_DIR" + +echo "Output directory: $OUTPUT_DIR" +echo "" + +# Start rocpd in background +echo "Starting rocpd daemon..." +rocpd --output-dir "$OUTPUT_DIR" & +ROCPD_PID=$! +echo "rocpd running with PID: $ROCPD_PID" +echo "" + +# Give rocpd time to initialize +sleep 2 + +# Run workload +echo "Running workload: python tiny_llama_v4.py --batch-size 8 --seq-len 128 --num-steps 10" +python tiny_llama_v4.py --batch-size 8 --seq-len 128 --num-steps 10 +WORKLOAD_EXIT=$? +echo "" + +# Stop rocpd +echo "Stopping rocpd daemon..." 
+kill $ROCPD_PID 2>/dev/null || true +wait $ROCPD_PID 2>/dev/null || true +echo "" + +if [ $WORKLOAD_EXIT -eq 0 ]; then + echo "[SUCCESS] Workload completed" +else + echo "[FAILED] Workload failed with exit code $WORKLOAD_EXIT" +fi +echo "" + +echo "Generated files in $OUTPUT_DIR:" +ls -lh "$OUTPUT_DIR" +echo "" + +echo "rocpd output is a SQLite3 database file" +echo "" +echo "To view the database:" +echo " - Use VS Code SQLite Viewer extension" +echo " - rocprof-compute and rocprof-systems can consume it directly" +echo " - No official CLI tool is provided for viewing" +echo "" +echo "rocpd provides continuous profiling with minimal overhead" +echo "" diff --git a/MLExamples/inference_benchmark/INFERENCE_BENCHMARK_NOTES.md b/MLExamples/inference_benchmark/INFERENCE_BENCHMARK_NOTES.md deleted file mode 100644 index bb6654ea..00000000 --- a/MLExamples/inference_benchmark/INFERENCE_BENCHMARK_NOTES.md +++ /dev/null @@ -1,147 +0,0 @@ - -# Inference Benchmark Notes - -`INFERENCE_BENCHMARK_NOTES.md` from `HPCTrainingExamples/MLExamples/inference_benchmark` in the Training Examples repository - -## Basic Inference Run - -DenseNet121 with torch.compile and mixed precision (FP16): - -```bash -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 -``` - -## Profiling - -### PyTorch Profiler (Kineto) - -Generate Chrome trace with detailed kernel timeline: - -```bash -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 --kineto --iterations 10 -``` - -Output: `trace.json` files (viewable in chrome://tracing) - -Options: - -- `--kineto`: Enable Kineto profiler (torch.profiler with Chrome trace export) -- `--iterations`: Number of iterations (profiler captures wait=1, warmup=2, active=2) - -### PyTorch Autograd Profiler (ROCTX) - -For use with ROCm profilers (rocprof): - -```bash -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 --autograd_profiler 
-``` - -Enables ROCTX markers for correlation with GPU kernel timeline in rocprof traces. - -### DeepSpeed FLOPS Profiler - -Detailed FLOPS and memory analysis: - -```bash -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --fp16 1 --flops-prof-step 10 --iterations 20 -``` - -Options: - -- `--flops-prof-step`: Iteration at which to capture profile (0-based index) -- `--iterations`: Total iterations (must be > flops-prof-step) - -Output includes: - -- FLOPS per layer and operation type -- Memory bandwidth utilization -- Parameter count and activation memory -- Theoretical vs achieved performance - -## Performance Tuning - -### MIOpen Kernel Tuning - -For optimal performance on AMD GPUs, enable MIOpen find mode: - -```bash -export MIOPEN_FIND_ENFORCE=3 -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 -``` - -First run generates performance database at `~/.config/miopen/`. Subsequent runs use cached kernels. - -### Torch Compile Modes - -Default compilation: -```bash -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 -``` - -Maximum optimization: -```bash -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 \ - --compileContext "{'mode': 'max-autotune', 'fullgraph': 'True'}" -``` - -Memory and matmul optimization: -```bash -python micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 \ - --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}" -``` - -## Multi-GPU Training - -### 4-GPU Run - -```bash -torchrun --nproc-per-node 4 micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 -``` - -### 8-GPU Run - -```bash -torchrun --nproc-per-node 8 micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1 -``` - -**Batch size behavior:** - -- `--batch-size` specifies global batch size 
across all GPUs -- Each GPU processes `batch-size / nproc-per-node` samples -- Example: `--batch-size 2048` with 4 GPUs → 512 samples/GPU - -### Multi-GPU Profiling - -#### PyTorch Profiler (Kineto) - -Profile 4-GPU run with trace export: -```bash -torchrun --nproc-per-node 4 micro_benchmarking_pytorch.py \ - --network densenet121 --batch-size 2048 --compile --fp16 1 \ - --kineto --iterations 10 -``` - -Output: `trace.json` per rank (4 files total) - -#### DeepSpeed FLOPS Profiler - -Multi-GPU FLOPS analysis: -```bash -torchrun --nproc-per-node 4 micro_benchmarking_pytorch.py \ - --network densenet121 --batch-size 2048 --fp16 1 \ - --flops-prof-step 10 --iterations 20 -``` - -Profile captures per-GPU metrics at specified iteration. - -## Metrics to Track - -- Throughput (images/sec) -- GPU memory utilization (GB) -- Training time per iteration (ms) -- FLOPS efficiency (% of peak) -- Memory bandwidth saturation (% of theoretical) -- Kernel occupancy -- Compilation overhead (first iteration vs steady state) - - diff --git a/MLExamples/inference_benchmark/INFERENCE_BENCHMARK_WORKSHOP_WALKTHROUGH.md b/MLExamples/inference_benchmark/INFERENCE_BENCHMARK_WORKSHOP_WALKTHROUGH.md deleted file mode 100644 index 50020401..00000000 --- a/MLExamples/inference_benchmark/INFERENCE_BENCHMARK_WORKSHOP_WALKTHROUGH.md +++ /dev/null @@ -1,2898 +0,0 @@ -# ROCm PyTorch Inference Benchmark Workshop -## Complete Hands-On Walkthrough Guide - ---- - -## Important Note - -**The performance numbers and metrics shown throughout this workshop are representative examples and were collected on specific hardware configurations.** Your actual results will differ based on: - -- GPU model (e.g., MI250X, MI300X, MI325X) -- ROCm version -- PyTorch version -- System configuration (CPU, memory, drivers) -- Current GPU utilization and temperature - -**Focus on the relative improvements and optimization techniques** demonstrated in each exercise rather than matching the exact numbers shown. 
The methodologies and analysis approaches are applicable across different hardware platforms. - ---- - -## Table of Contents - -1. [Introduction & Setup](#1-introduction--setup) -2. [Understanding the Benchmark Tool](#2-understanding-the-benchmark-tool) -3. [Exercise 1: Single GPU Baseline](#3-exercise-1-single-gpu-baseline) -4. [Exercise 2: Precision Comparison](#4-exercise-2-precision-comparison-fp32-vs-fp16) -5. [Exercise 3: PyTorch Profiler Integration](#5-exercise-3-pytorch-profiler-integration) -6. [Exercise 4: DeepSpeed FLOPS Profiler](#6-exercise-4-deepspeed-flops-profiler) -7. [Exercise 5: Multi-GPU Scaling](#7-exercise-5-multi-gpu-scaling) -8. [Exercise 6: PyTorch 2.0 Compilation](#8-exercise-6-pytorch-20-compilation) -9. [Exercise 7: ROCm Profiler Integration](#9-exercise-7-rocm-profiler-integration) -10. [Wrap-up & Best Practices](#10-wrap-up--best-practices) - ---- - -## 1. Introduction & Setup - -### 1.1 What is Inference? - -**Inference** is the process of using a trained neural network to make predictions on new data. 
- -**Key Differences from Training:** - -| Aspect | Training | Inference | -|--------|----------|-----------| -| **Purpose** | Learn patterns from data | Make predictions | -| **Direction** | Forward + Backward pass | Forward pass only | -| **Gradients** | Required | Not required | -| **Batch Size** | Usually larger | Often smaller (1-32) | -| **Performance Goal** | Throughput (samples/sec) | Latency (ms/sample) AND throughput | -| **Memory Usage** | High (stores activations) | Lower (no gradient storage) | - -**Why Benchmark Inference?** - -- Optimize for production deployment -- Understand hardware utilization -- Compare different models -- Justify hardware purchases -- Identify bottlenecks - -### 1.2 Workshop Goals - -By the end of this workshop, you will: - -- Run standardized inference benchmarks on AMD GPUs -- Use PyTorch Profiler to identify bottlenecks -- Understand FLOPS efficiency with DeepSpeed profiler -- Scale workloads across multiple GPUs -- Apply PyTorch 2.0 compilation optimizations -- Use ROCm profiling tools for kernel-level analysis -- Interpret performance metrics and make optimization decisions - -### 1.3 Environment Verification - -Let's verify your system is ready for the workshop. 
- -#### Step 1: Check ROCm Installation - -```bash -# Check if ROCm is installed -rocminfo | grep "Name:" -``` - -**Expected Output:** -``` - Name: gfx942 - Name: AMD Instinct MI325X -``` - -**If you see an error:** -```bash -# Check if ROCm is installed -which rocminfo - -# If not found, ROCm is not installed -# Contact your system administrator -``` - -#### Step 2: Check GPU Visibility - -```bash -# Check GPU status -rocm-smi -``` - -**Expected Output:** -``` -GPU[0] : GPU ID: 0 -GPU[0] : GPU Name: AMD Instinct MI325X -GPU[0] : Temperature: 35.0°C -GPU[0] : GPU Memory Usage: 256 MB / 196608 MB -GPU[0] : GPU Utilization: 0% -``` - -**Common Issues:** - -**Error: "Unable to detect any GPUs"** -```bash -# Check permissions -sudo usermod -aG video $USER -sudo usermod -aG render $USER - -# Logout and login again -# Then retry: rocm-smi -``` - -**Error: "Permission denied"** -```bash -# Check if you're in the right groups -groups | grep video -groups | grep render - -# If not, add yourself (requires sudo) -sudo usermod -aG video $USER -sudo usermod -aG render $USER -# Logout/login required! 
-``` - -#### Step 3: Check PyTorch + ROCm - -```bash -# Test PyTorch with ROCm -python3 -c " -import torch -print(f'PyTorch Version: {torch.__version__}') -print(f'CUDA Available: {torch.cuda.is_available()}') -if torch.cuda.is_available(): - print(f'GPU Name: {torch.cuda.get_device_name(0)}') - print(f'GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB') -else: - print('ERROR: No GPU detected!') -" -``` - -**Expected Output:** -``` -PyTorch Version: 2.7.1+rocm6.4.4 -CUDA Available: True -GPU Name: AMD Instinct MI325X -GPU Memory: 196.6 GB -``` - -**Common Issues:** - -**Error: "ModuleNotFoundError: No module named 'torch'"** -```bash -# Install PyTorch with ROCm support -pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 -``` - -**Error: "CUDA Available: False"** -```bash -# Check if ROCm-enabled PyTorch is installed -python3 -c "import torch; print(torch.__version__)" - -# Should show something like: 2.7.1+rocm6.4.4 -# If it shows 2.7.1+cpu, you have CPU-only PyTorch - -# Reinstall with ROCm support -pip uninstall torch torchvision torchaudio -pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 -``` - -#### Step 4: Verify Benchmark Script - -```bash -# Navigate to inference benchmark directory -cd inference_benchmark/ - -# List files -ls -la -``` - -**Expected Output:** -``` --rw-rw-r-- micro_benchmarking_pytorch.py --rw-rw-r-- README.md --rw-rw-r-- fp16util.py -drwxrwxr-x TorchTensorOpsBench/ -``` - - - -#### Step 5: Quick Test Run - -Let's verify everything works with a very small test: - -```bash -# Run a tiny test (should complete in ~30 seconds) -python3 micro_benchmarking_pytorch.py --network resnet18 --batch-size 16 --iterations 5 -``` - -**Expected Output:** -``` -Using network: resnet18 -Batch size: 16 -Iterations: 5 -FP16: False - -Epoch 0: Loss = 6.9088, Time = 0.125 seconds -Epoch 1: Loss = 6.9088, Time = 0.042 seconds -Epoch 2: Loss = 6.9088, 
Time = 0.041 seconds -Epoch 3: Loss = 6.9088, Time = 0.041 seconds -Epoch 4: Loss = 6.9088, Time = 0.040 seconds - -Average time per iteration: 0.041 seconds -Throughput: 390.2 samples/sec -``` - -**If you see this output, your environment is ready!** - -### 1.4 Understanding Key Metrics - -Before we begin the exercises, let's understand what we're measuring: - -#### Throughput (samples/sec or images/sec) -- **What:** Number of samples processed per second -- **Higher is better** -- **Use case:** Batch inference, data center deployments -- **Formula:** `(batch_size × num_iterations) / total_time` - -#### Latency (milliseconds) -- **What:** Time to process a single sample or batch -- **Lower is better** -- **Use case:** Real-time applications, interactive systems -- **Formula:** `total_time / num_iterations` - -#### Memory Usage (MB or GB) -- **What:** GPU memory consumed by model and data -- **Lower is better (allows larger batches)** -- **Includes:** Model weights, activations, gradients (if training) - -#### GPU Utilization (%) -- **What:** Percentage of GPU compute used -- **Higher is better (approaching 100%)** -- **Note:** Can be low if memory-bound or CPU-bound - -#### FLOPS (Floating Point Operations Per Second) -- **What:** Computational throughput -- **Higher is better** -- **Theoretical vs Achieved:** Gap indicates optimization opportunity - ---- - -## 2. Understanding the Benchmark Tool - -### 2.1 What is `micro_benchmarking_pytorch.py`? - -This is a standardized tool for benchmarking PyTorch inference on ROCm. - -**Purpose:** -- Measure inference performance across different models -- Compare hardware configurations -- Test optimization techniques -- Standardized, reproducible results - -**Features:** -- 50+ pre-configured models (ResNet, VGG, EfficientNet, ViT, etc.) 
-- FP32 and FP16 precision support -- Single and multi-GPU support -- PyTorch Profiler integration -- DeepSpeed FLOPS profiler integration -- PyTorch 2.0 compilation support - -### 2.2 Available Models - -The benchmark includes many popular vision models: - -**Classification Models:** -```python -# ResNet family (most commonly used for benchmarking) -resnet18, resnet34, resnet50, resnet101, resnet152 - -# EfficientNet family (efficient models) -efficientnet_b0, efficientnet_b1, ..., efficientnet_b7 - -# Vision Transformers (attention-based) -vit_b_16, vit_b_32, vit_l_16, vit_h_14 - -# MobileNet (mobile/edge optimized) -mobilenet_v2, mobilenet_v3_large, mobilenet_v3_small - -# VGG (classic architecture) -vgg11, vgg13, vgg16, vgg19 - -# And many more... -``` - -**Segmentation Models:** -```python -fcn_resnet50, fcn_resnet101 -deeplabv3_resnet50, deeplabv3_resnet101 -``` - -**For this workshop, we'll focus on ResNet50 because:** -- Industry-standard benchmark -- Good balance of compute and memory operations -- Well-optimized by hardware vendors -- Comparable results across papers and benchmarks - -### 2.3 Command-Line Arguments - -Let's understand the key arguments: - -#### Basic Arguments - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 # Model to benchmark - --batch-size 64 # Number of samples per batch - --iterations 20 # Number of iterations to run -``` - -#### Precision Arguments - -```bash ---fp16 1 # Use FP16 (half precision) ---amp-opt-level 2 # Use automatic mixed precision (APEX) -``` - -#### Profiling Arguments - -```bash ---autograd-profiler # Enable PyTorch autograd profiler ---kineto # Enable Kineto profiler (PyTorch 1.8+) ---flops-prof-step 10 # Enable DeepSpeed FLOPS profiler at step 10 -``` - -#### Multi-GPU Arguments - -```bash -# Option 1: Using torchrun (recommended) -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 - -# Option 2: Manual distributed setup ---distributed_dataparallel # Enable 
distributed data parallel ---device_ids 0,1 # GPUs to use ---rank 0 # Process rank ---world-size 2 # Total number of processes -``` - -#### PyTorch 2.0 Arguments - -```bash ---compile # Enable torch.compile ---compileContext "{'mode': 'max-autotune'}" # Compilation options -``` - -### 2.4 Understanding Output - -When you run the benchmark, you'll see output like this: - -``` -Using network: resnet50 -Batch size: 64 -Iterations: 20 -FP16: False - -Warming up... -Warmup complete. - -Epoch 0: Loss = 6.9088, Time = 0.145 seconds -Epoch 1: Loss = 6.9088, Time = 0.042 seconds -Epoch 2: Loss = 6.9088, Time = 0.041 seconds -... -Epoch 19: Loss = 6.9088, Time = 0.040 seconds - -======================================== -Performance Summary: -======================================== -Average time per iteration: 0.041 seconds -Throughput: 1560.9 samples/sec -Memory usage: 4523 MB -======================================== -``` - -**Let's break this down:** - -1. **Configuration Echo** - - Shows your settings - - Verify these are correct before trusting results - -2. **Warmup Phase** - - First few iterations are slower (kernel compilation, cache warming) - - Results are discarded - -3. **Timed Iterations** - - Each iteration shows loss and time - - Loss should be consistent (model is random, not trained) - -4. 
**Performance Summary** - - **Average time:** Excludes warmup, arithmetic mean - - **Throughput:** samples/sec = (batch_size × iterations) / total_time - - **Memory:** Peak GPU memory usage - -### 2.5 Creating Your Results Template - -Let's create a file to track your results throughout the workshop: - -```bash -# Create results file -cat > my_workshop_results.txt << 'EOF' -================================================================================ -ROCm PyTorch Inference Benchmark Workshop Results -================================================================================ -Name: [Your Name] -Date: [Today's Date] -GPU: [Your GPU Model from rocm-smi] -ROCm Version: [From rocminfo] -PyTorch Version: [From python -c "import torch; print(torch.__version__)"] -================================================================================ - -Exercise 1: Single GPU Baseline (ResNet50, FP32, BS=32) ------------------------------------------------------------------------- -Throughput: __________ samples/sec -Memory Usage: __________ MB -Avg Time/Iteration: __________ seconds -Notes: - - -Exercise 2: Precision Comparison (ResNet50, FP16, BS=32) ------------------------------------------------------------------------- -FP32 Throughput: __________ samples/sec -FP16 Throughput: __________ samples/sec -Speedup (FP16/FP32): __________x -Memory Reduction: __________% -Notes: - - -Exercise 3: PyTorch Profiler ------------------------------------------------------------------------- -Top 5 Slowest Operations: -1. ____________________: __________ ms -2. ____________________: __________ ms -3. ____________________: __________ ms -4. ____________________: __________ ms -5. ____________________: __________ ms -Notes: - - -[Continue for remaining exercises...] -EOF -``` - -**Open this file in a text editor and fill it out as you complete each exercise!** - ---- - -## 3. 
Exercise 1: Single GPU Baseline - -### 3.1 Objective - -Run your first benchmark and establish a baseline for comparison. - -**What you'll learn:** -- How to run the benchmark tool -- How to interpret basic output -- What "good" performance looks like -- How to verify your results - -### 3.2 Step-by-Step Instructions - -#### Step 1: Navigate to the benchmark directory - -```bash -cd ~/castille-ai-workshop-training/inference_benchmark/ -``` - -#### Step 2: Run the baseline benchmark - -```bash -# Run ResNet50 with batch size 32 for 20 iterations -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 20 -``` - - -#### Step 3: Watch the output - -You'll see output like this: - -``` -Using network: resnet50 -Batch size: 32 -Iterations: 20 -FP16: False -Device: cuda:0 - -Loading model... -Model loaded successfully. - -Warming up (5 iterations)... -Warmup iteration 0: Loss = 6.9078, Time = 0.242 seconds -Warmup iteration 1: Loss = 6.9078, Time = 0.065 seconds -Warmup iteration 2: Loss = 6.9078, Time = 0.064 seconds -Warmup iteration 3: Loss = 6.9078, Time = 0.063 seconds -Warmup iteration 4: Loss = 6.9078, Time = 0.063 seconds -Warmup complete. - -Running timed iterations... -Epoch 0: Loss = 6.9078, Time = 0.063 seconds -Epoch 1: Loss = 6.9078, Time = 0.062 seconds -Epoch 2: Loss = 6.9078, Time = 0.062 seconds -... -Epoch 19: Loss = 6.9078, Time = 0.062 seconds - -======================================== -Performance Summary: -======================================== -Network: resnet50 -Batch size: 32 -Iterations: 20 (excluding warmup) -Precision: FP32 - -Average time per iteration: 0.062 seconds -Standard deviation: 0.001 seconds -Throughput: 516.1 samples/sec -GPU Memory Usage: 4523 MB - -Images per second: 516.1 -Milliseconds per batch: 62.0 -Microseconds per sample: 1937.5 -======================================== -``` - -### 3.3 Understanding Your Results - -Let's analyze what these numbers mean: - -#### 1. 
Warmup Phase -``` -Warmup iteration 0: Loss = 6.9078, Time = 0.242 seconds ← SLOW (first run) -Warmup iteration 1: Loss = 6.9078, Time = 0.065 seconds ← Much faster -Warmup iteration 2: Loss = 6.9078, Time = 0.064 seconds ← Stable -``` - -**Why is the first iteration slow?** -- Kernel compilation (Triton, ROCm) -- GPU memory allocation -- Cache warming -- cuDNN/MIOpen autotuning - -**This is normal! Always exclude warmup from measurements.** - -#### 2. Throughput: 516.1 samples/sec - -**What does this mean?** -- Your GPU can process 516 images per second -- For batch size 32: 516.1 / 32 = 16.1 batches/second - -**Is this good?** -- For ResNet50 FP32 on MI200 series: 450-550 samples/sec is typical -- For MI300 series: 600-800 samples/sec is typical -- For older GPUs (V100, MI100): 300-400 samples/sec is typical - -#### 3. Memory Usage: 4523 MB - -**What uses this memory?** -- Model weights: ~100 MB (ResNet50 has 25.6M parameters × 4 bytes) -- Input batch: 32 × 3 × 224 × 224 × 4 bytes = ~19 MB -- Activations: ~4400 MB (intermediate feature maps) - -**Why so much for activations?** -- ResNet50 has many layers (50!) -- Each layer creates feature maps -- Feature maps are large (early layers: 32 × 64 × 112 × 112 × 4 bytes = 102 MB EACH!) - -#### 4. 
Time Consistency -``` -Standard deviation: 0.001 seconds -``` - -**This is important!** -- Low std dev (< 5% of mean): Stable, trustworthy results -- High std dev (> 10% of mean): Something is wrong (thermal throttling, system interference) - -### 3.4 Checkpoint: Verify Your Results - -Before moving on, check: - -- [ ] Throughput is between 300-800 samples/sec (depending on GPU) -- [ ] Memory usage is around 4000-5000 MB -- [ ] Standard deviation is small (< 0.005 seconds) -- [ ] All iterations show same loss (~6.9) -- [ ] No error messages - -**If all checks pass, record your results and continue!** - -**If something looks wrong:** - -**Problem:** Throughput very low (< 100 samples/sec) -```bash -# Check GPU utilization -rocm-smi - -# Should show ~100% during benchmark -# If low, check: -# 1. CPU bottleneck (increase --batch-size) -# 2. Slow storage (model loading) -# 3. System interference (close other programs) -``` - -**Problem:** Memory usage extremely high (> 10000 MB) -```bash -# Reduce batch size -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 16 --iterations 20 -``` - -**Problem:** Inconsistent results (high std dev) -```bash -# Increase iterations for better averaging -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 50 -``` - -### 3.5 Recording Your Results - -Record these values in your `my_workshop_results.txt`: - -``` -Exercise 1: Single GPU Baseline (ResNet50, FP32, BS=32) ------------------------------------------------------------------------- -Throughput: 516.1 samples/sec -Memory Usage: 4523 MB -Avg Time/Iteration: 0.062 seconds -GPU Model: AMD Instinct MI325X -Notes: -- Warmup took 5 iterations -- Results very stable (std dev 0.001s) -- Baseline for all future comparisons -``` - -### 3.6 Optional: Try Different Batch Sizes - -**Why does batch size matter?** - -Larger batches improve GPU utilization but increase memory usage. 
- -```bash -# Small batch -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 8 --iterations 20 - -# Medium batch (your baseline) -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 20 - -# Large batch -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 20 - -# Very large batch (might OOM!) -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 256 --iterations 20 -``` - -**Create a quick comparison table:** - -| Batch Size | Throughput (samples/sec) | Memory (MB) | Samples/sec per GB | -|------------|-------------------------|-------------|-------------------| -| 8 | ? | ? | ? | -| 32 | 516.1 | 4523 | 0.114 | -| 128 | ? | ? | ? | -| 256 | OOM or ? | ? | ? | - -**What do you observe?** -- Throughput increases with batch size... but not linearly -- Memory increases with batch size -- There's a sweet spot for efficiency - ---- - -## 4. Exercise 2: Precision Comparison (FP32 vs FP16) - -### 4.1 Objective - -Compare FP32 (32-bit floating point) vs FP16 (16-bit floating point) precision. - -**What you'll learn:** -- What FP16 is and why it matters -- Performance benefits of reduced precision -- Memory savings from FP16 -- When to use FP16 vs FP32 - -### 4.2 What is FP16? - -**Floating Point Precision:** - -``` -FP32 (Float32): 32 bits = 1 sign + 8 exponent + 23 mantissa - Range: ±1.4 × 10⁻⁴⁵ to ±3.4 × 10³⁸ - Precision: ~7 decimal digits - -FP16 (Float16): 16 bits = 1 sign + 5 exponent + 10 mantissa - Range: ±6.0 × 10⁻⁸ to ±6.5 × 10⁴ - Precision: ~3 decimal digits -``` - -**Benefits of FP16:** -- 2x less memory (16 bits vs 32 bits) -- 2x more data per memory transaction -- 2-4x faster compute (specialized hardware) -- Lower power consumption - -**Drawbacks of FP16:** -- Lower precision (can cause numerical issues) -- Smaller range (risk of overflow/underflow) -- Requires careful model design - -**For inference:** FP16 is usually safe and recommended! 
- -### 4.3 Running FP32 Baseline (Repeat) - -First, let's re-run FP32 to have a fresh comparison: - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 20 \ - --fp16 0 -``` - -**Record the results:** -``` -FP32 Throughput: __________ samples/sec -FP32 Memory: __________ MB -``` - -### 4.4 Running FP16 Benchmark - -Now let's run with FP16: - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 20 \ - --fp16 1 -``` - -**Expected output:** -``` -Using network: resnet50 -Batch size: 32 -Iterations: 20 -FP16: True ← Notice this! -Device: cuda:0 - -Converting model to FP16... -Model conversion complete. - -Warming up... -Warmup complete. - -Running timed iterations... -Epoch 0: Loss = 6.9062, Time = 0.031 seconds ← MUCH FASTER! -Epoch 1: Loss = 6.9062, Time = 0.030 seconds -... - -======================================== -Performance Summary: -======================================== -Network: resnet50 -Batch size: 32 -Precision: FP16 ← Notice this! - -Average time per iteration: 0.031 seconds -Throughput: 1032.3 samples/sec ← ~2x faster! -GPU Memory Usage: 2834 MB ← ~37% less memory! -======================================== -``` - -### 4.5 Analyzing the Results - -Let's compare FP32 vs FP16: - -#### Create a comparison table: - -``` -┌──────────────────────┬───────────┬───────────┬──────────────┐ -│ Metric │ FP32 │ FP16 │ Improvement │ -├──────────────────────┼───────────┼───────────┼──────────────┤ -│ Throughput (samp/s) │ 516.1 │ 1032.3 │ 2.00x faster │ -│ Memory (MB) │ 4523 │ 2834 │ 37% less │ -│ Time per batch (ms) │ 62.0 │ 31.0 │ 2.00x faster │ -│ Numerical accuracy │ Full │ Reduced │ - │ -└──────────────────────┴───────────┴───────────┴──────────────┘ -``` - -#### Why is it faster? - -1. **Less Memory Traffic:** - - FP16 tensor: half the size - - Loading weights from memory: 2x faster - - Writing activations: 2x faster - -2. 
**Specialized Hardware:** - - AMD MI200/MI300: Matrix Core FP16 instructions - - 2-4x higher TFLOPS for FP16 vs FP32 - -3. **Cache Efficiency:** - - More data fits in L2 cache - - Fewer cache misses - -#### Why less memory? - -``` -Model weights: 25.6M params × 2 bytes = 51 MB (vs 102 MB in FP32) -Activations: ~2200 MB (vs ~4400 MB in FP32) -Input batch: 32 × 3 × 224 × 224 × 2 bytes = ~9.6 MB (vs ~19 MB) -``` - -### 4.6 When to Use FP16? - -**Use FP16 when:** -- Inference only (no gradient accumulation issues) -- Large models (memory constrained) -- Throughput matters more than last-bit accuracy -- Model is not numerically sensitive - -**Avoid FP16 when:** -- Need exact numerical reproducibility -- Model has numerical instability -- Small model (no memory benefit) -- Training (use mixed precision instead) - -### 4.7 Testing Numerical Accuracy - -Let's verify FP16 doesn't hurt model accuracy significantly. - -#### Run both and compare loss: - -```bash -# FP32 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 5 --fp16 0 | grep "Epoch 4" - -# FP16 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 5 --fp16 1 | grep "Epoch 4" -``` - -**Expected output:** -``` -FP32: Epoch 4: Loss = 6.9078 -FP16: Epoch 4: Loss = 6.9062 -``` - -**Difference:** 0.0016 (0.02%) - -**This is negligible!** - -### 4.8 Checkpoint - -Before continuing: - -- [ ] FP16 is ~2x faster than FP32 -- [ ] FP16 uses ~30-40% less memory -- [ ] Loss values are very similar (~0.02% difference) -- [ ] You understand when to use FP16 - -**Record your results in `my_workshop_results.txt`!** - -### 4.9 Advanced: Maximum Batch Size - -Let's find the maximum batch size for both precisions: - -```bash -# FP32 - keep increasing until OOM -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 5 --fp16 0 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 5 --fp16 0 -python3 
micro_benchmarking_pytorch.py --network resnet50 --batch-size 256 --iterations 5 --fp16 0 - -# FP16 - should go much higher! -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 5 --fp16 1 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 256 --iterations 5 --fp16 1 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 512 --iterations 5 --fp16 1 -``` - -**Track maximum batch sizes:** -``` -FP32 max batch size: __________ (before OOM) -FP16 max batch size: __________ (before OOM) - -Ratio: FP16 supports __________x larger batches! -``` - ---- - -## 5. Exercise 3: PyTorch Profiler Integration - -### 5.1 Objective - -Use PyTorch's built-in profiler to identify performance bottlenecks. - -**What you'll learn:** -- How to enable PyTorch Profiler -- Reading profiler output -- Identifying slow operations -- Understanding CPU vs GPU time - -### 5.2 What is PyTorch Profiler? - -**PyTorch Profiler** provides detailed performance analysis: - -- **Operator-level timing:** How long each operation takes -- **CPU vs GPU time:** Distinguish CPU overhead from GPU compute -- **Memory profiling:** Track memory allocations -- **Stack traces:** See which code triggered operations -- **Kernel details:** See GPU kernel launches - -**When to use:** -- Identifying bottleneck operations -- Finding CPU overhead -- Optimizing custom operations -- Debugging slow models - -### 5.3 Running with PyTorch Profiler - -Let's modify our benchmark to use the profiler. - -#### Step 1: Run with profiler enabled - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 10 \ - --fp16 0 \ - --autograd-profiler -``` - - - -#### Step 2: Understanding the output - -You'll see LOTS of output! 
Let's focus on key sections: - -``` -======================================== -PyTorch Profiler Results: -======================================== - -Top 10 operations by total CPU time: ---------------------------- ------------ ------------ ------------ -Name Self CPU % Self CPU CPU total ---------------------------- ------------ ------------ ------------ -aten::convolution 5.23% 128.45ms 8.52s -aten::batch_norm 2.15% 52.75ms 1.32s -aten::relu_ 1.87% 45.91ms 45.91ms -aten::max_pool2d 0.95% 23.32ms 67.45ms -aten::addmm 0.78% 19.15ms 234.67ms -aten::linear 0.65% 15.95ms 250.62ms -aten::add_ 0.52% 12.78ms 12.78ms -aten::_convolution 4.87% 119.55ms 8.40s -aten::cudnn_convolution 78.23% 1.92s 1.92s -... ---------------------------- ------------ ------------ ------------ - -Top 10 operations by total CUDA time: ---------------------------- ------------ ------------ ------------ -Name Self CUDA CUDA total # of Calls ---------------------------- ------------ ------------ ------------ -void cudnn::detail::implicit_convolve_sgemm... 1.82s 1.82s 320 -void cudnn::bn_fw_tr_1C11... 234.56ms 234.56ms 160 -Memcpy HtoD (Pageable -> Device) 145.32ms 145.32ms 50 -void at::native::vectorized_elementwise... 89.45ms 89.45ms 640 -void cudnn::ops::nchwToNhwc... 67.23ms 67.23ms 160 -... ---------------------------- ------------ ------------ ------------ - -Memory Profiling: ---------------------------- ------------ ------------ ------------ -Name CPU Mem CUDA Mem # of Calls ---------------------------- ------------ ------------ ------------ -aten::convolution 0 b 3.52 Gb 320 -aten::batch_norm 0 b 834.56 Mb 160 -aten::relu_ 0 b 0 b 160 -aten::max_pool2d 0 b 256.00 Mb 32 -... ---------------------------- ------------ ------------ ------------ -``` - -### 5.4 Interpreting the Results - -#### 1. 
CPU Time vs CUDA Time - -**CPU Time:** Time spent on Python/CPU side -- Launching kernels -- Python overhead -- Data preparation - -**CUDA Time:** Time spent on GPU -- Actual computation -- Memory transfers -- Kernel execution - -**Key insight:** If CPU time >> CUDA time, you have CPU overhead! - -#### 2. Top Operations - -From the example above: - -``` -Top operation: cudnn_convolution (78.23% of CPU time) -``` - -**What this means:** -- Convolutions dominate runtime -- This is expected for ResNet50! -- Optimizing convolutions = biggest impact - -#### 3. Memory Allocation - -``` -aten::convolution: 3.52 GB CUDA memory -``` - -**What this means:** -- Convolutions use most memory -- Intermediate feature maps are large -- This is why batch size is limited - -### 5.5 Hands-On: Finding Bottlenecks - -Let's analyze YOUR profiler output: - -#### Task 1: Find the top 5 slowest operations - -Look at "Top 10 operations by total CUDA time" and write down: - -``` -1. ___________________________: ___________ ms -2. ___________________________: ___________ ms -3. ___________________________: ___________ ms -4. ___________________________: ___________ ms -5. ___________________________: ___________ ms -``` - -#### Task 2: Calculate convolution percentage - -``` -Total CUDA time: ___________ seconds -Convolution CUDA time: ___________ seconds -Percentage: (___________ / ___________) × 100 = _________% -``` - -**Is convolution the bottleneck?** -- If > 70%: Yes, convolution is the main bottleneck -- If < 50%: Other operations are significant - -#### Task 3: Check for CPU overhead - -``` -Total CPU time: ___________ seconds -Total CUDA time: ___________ seconds -Ratio: ___________ / ___________ = ___________ -``` - -**Interpretation:** -- Ratio < 1.2: Good! Low CPU overhead -- Ratio 1.2-2.0: Moderate CPU overhead -- Ratio > 2.0: High CPU overhead! 
- -### 5.6 Comparing FP32 vs FP16 Profiling - -Let's profile both precisions: - -```bash -# FP32 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 10 --fp16 0 --autograd-profiler > profile_fp32.txt - -# FP16 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 10 --fp16 1 --autograd-profiler > profile_fp16.txt -``` - -#### Compare convolution times: - -```bash -# FP32 convolution time -grep "cudnn_convolution" profile_fp32.txt | head -1 - -# FP16 convolution time -grep "cudnn_convolution" profile_fp16.txt | head -1 -``` - -**Create comparison:** -``` -FP32 convolution time: ___________ ms -FP16 convolution time: ___________ ms -Speedup: ___________ / ___________ = ___________x -``` - -### 5.7 Advanced: Chrome Trace Visualization - -PyTorch Profiler can export a Chrome trace for visual analysis. - -#### Step 1: Create a profiling script - -Create a file `profile_resnet.py`: - -```python -import torch -import torchvision -import torch.profiler - -# Load model -model = torchvision.models.resnet50().cuda() -model.eval() - -# Create dummy input -input = torch.randn(32, 3, 224, 224).cuda() - -# Warmup -with torch.no_grad(): - for _ in range(5): - model(input) - -# Profile with Chrome trace export -with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=True, - profile_memory=True, - with_stack=True, -) as prof: - with torch.no_grad(): - for _ in range(10): - model(input) - -# Print summary -print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20)) - -# Export Chrome trace -prof.export_chrome_trace("resnet50_trace.json") -print("\nChrome trace exported to: resnet50_trace.json") -print("View at: chrome://tracing") -``` - -#### Step 2: Run the script - -```bash -python3 profile_resnet.py -``` - -#### Step 3: View the trace - -1. Open Chrome browser -2. Go to `chrome://tracing` -3. Click "Load" -4. 
Select `resnet50_trace.json` - -**You'll see a timeline view:** -- X-axis: Time -- Y-axis: Different operations -- Color: Operation type - -**What to look for:** -- Long operations (bottlenecks) -- GPU idle time (gaps) -- Memory transfer time -- Kernel launch overhead - -### 5.8 Checkpoint - -Before continuing: - -- [ ] You can enable PyTorch Profiler with `--autograd-profiler` -- [ ] You can identify top operations by CUDA time -- [ ] You understand CPU time vs CUDA time -- [ ] You can compare FP32 vs FP16 performance at operation level -- [ ] You know how to export Chrome traces for visualization - -**Record your top 5 operations in `my_workshop_results.txt`!** - ---- - -## 6. Exercise 4: DeepSpeed FLOPS Profiler - -### 6.1 Objective - -Measure computational efficiency using DeepSpeed FLOPS Profiler. - -**What you'll learn:** -- What FLOPs are and why they matter -- Theoretical vs achieved FLOPS -- Computational efficiency -- Identifying compute vs memory-bound operations - -### 6.2 What are FLOPs? - -**FLOPS = Floating Point Operations Per Second** - -**Key concepts:** - -1. **Operation Count:** - - Total floating-point operations in your model - - Example: Matrix multiply (M×K) × (K×N) = 2×M×K×N FLOPs - -2. **Theoretical Peak:** - - Maximum FLOPs your hardware can achieve - - MI325X: ~653 TFLOPS (FP16), ~326 TFLOPS (FP32) - -3. **Achieved FLOPs:** - - What your model actually achieves - - Usually much lower than peak! - -4. **Efficiency:** - - (Achieved / Theoretical) × 100% - - 50%+ is very good! - - 10-20% is typical for many workloads - -### 6.3 Why Measure FLOPs? - -**FLOPs efficiency tells you:** - -- Are you **compute-bound** or **memory-bound**? - - High efficiency (>40%): Compute-bound (good!) - - Low efficiency (<20%): Memory-bound (need optimization!) - -- How much headroom for optimization? - - At 10% efficiency: 10x speedup possible! 
- - At 80% efficiency: Already well-optimized - -- Hardware utilization: - - Are you getting value from your expensive GPU? - -### 6.4 Understanding Compute vs Memory Bound - -``` -Compute-bound: -- Lots of arithmetic operations -- GPU cores fully utilized -- Examples: Matrix multiply, convolutions with large kernels -- Optimization: Use faster compute (FP16, Tensor Cores) - -Memory-bound: -- Lots of memory reads/writes -- Memory bandwidth saturated -- Examples: Element-wise operations, small convolutions, attention -- Optimization: Reduce memory traffic (fusion, better layouts) -``` - -### 6.5 Running DeepSpeed FLOPS Profiler - -#### Step 1: Install DeepSpeed - -```bash -# Install DeepSpeed -pip install deepspeed -``` - -#### Step 2: Run with FLOPS profiler - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 20 \ - --fp16 0 \ - --flops-prof-step 10 -``` - -**Note:** `--flops-prof-step 10` means profile at iteration 10 (after warmup) - - - -#### Step 3: Understanding the output - -You'll see extensive output like this: - -``` -======================================== -DeepSpeed FLOPS Profiler Output: -======================================== - --------------------------- DeepSpeed Flops Profiler -------------------------- - -Profile Summary at step 10: -Notations: -data parallel size (dp_size), model parallel size(mp_size), -number of parameters (params), number of multiply-accumulate operations(MACs), -number of floating-point operations (flops), floating-point operations per second (FLOPS), -fwd latency (forward propagation latency), bwd latency (backward propagation latency), -step (weights update latency), iter latency (sum of fwd, bwd and step latency) - -world size: 1 -data parallel size: 1 -model parallel size: 1 -batch size per GPU: 32 -params per GPU: 25.56 M -params of model = params per GPU * mp_size: 25.56 M -fwd MACs per GPU: 4.10 G -fwd FLOPs per GPU: 8.20 G -fwd FLOPs of model = fwd FLOPs per 
GPU * mp_size: 8.20 G -fwd latency: 10.52 ms -bwd latency: 21.34 ms -fwd FLOPS per GPU = fwd FLOPs per GPU / fwd latency: 779.47 GFLOPS -bwd FLOPS per GPU = 2 * fwd FLOPs per GPU / bwd latency: 768.54 GFLOPS -fwd+bwd FLOPS per GPU = 3 * fwd FLOPs per GPU / (fwd+bwd latency): 772.89 GFLOPS - ------------------------------ Aggregated Profile per GPU ----------------------------- -Top 10 modules in terms of params, MACs or fwd latency at different model depths: - -depth 0: - params | MACs | fwd latency | module - 25.56 M | 4.10 G | 10.52 ms | ResNet - -depth 1: - params | MACs | fwd latency | module - 0 | 803.16 M | 1.23 ms | conv1 - 0 | 411.04 M | 1.45 ms | layer1 - 0 | 822.08 M | 2.34 ms | layer2 - 0 | 1.64 G | 3.67 ms | layer3 - 0 | 822.08 M | 1.54 ms | layer4 - 2.05 M | 0 | 0.12 ms | fc - -Top 10 modules in terms of fwd latency: - fwd latency | module - 10.52 ms | ResNet - 3.67 ms | layer3 - 2.34 ms | layer2 - 1.54 ms | layer4 - 1.45 ms | layer1 - 1.23 ms | conv1 - 0.12 ms | fc - ------------------------------ Detailed Profile per GPU ----------------------------- - -Each module profile is listed after its name in the following order: -params, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency - -ResNet (25.56 M, 100.00%, 4.10 G, 100.00%, 10.52 ms, 100.00%) - conv1 (0, 0.00%, 803.16 M, 19.59%, 1.23 ms, 11.69%) - bn1 (0, 0.00%, 0, 0.00%, 0.34 ms, 3.23%) - relu (0, 0.00%, 0, 0.00%, 0.18 ms, 1.71%) - maxpool (0, 0.00%, 0, 0.00%, 0.23 ms, 2.19%) - layer1 (0, 0.00%, 411.04 M, 10.03%, 1.45 ms, 13.78%) - layer1.0 (0, 0.00%, 205.52 M, 5.01%, 0.73 ms, 6.94%) - layer1.0.conv1 (0, 0.00%, 51.38 M, 1.25%, 0.15 ms, 1.43%) - layer1.0.bn1 (0, 0.00%, 0, 0.00%, 0.11 ms, 1.05%) - layer1.0.relu (0, 0.00%, 0, 0.00%, 0.09 ms, 0.86%) - layer1.0.conv2 (0, 0.00%, 51.38 M, 1.25%, 0.16 ms, 1.52%) - ... 
- layer2 (0, 0.00%, 822.08 M, 20.05%, 2.34 ms, 22.24%) - layer3 (0, 0.00%, 1.64 G, 40.01%, 3.67 ms, 34.89%) - layer4 (0, 0.00%, 822.08 M, 20.05%, 1.54 ms, 14.64%) - avgpool (0, 0.00%, 0, 0.00%, 0.08 ms, 0.76%) - fc (2.05 M, 8.01%, 0, 0.00%, 0.12 ms, 1.14%) - ------------------------------------------------------------------------------- -``` - -### 6.6 Analyzing FLOPS Results - -Let's break down the key metrics: - -#### 1. FLOPs of the Model - -``` -fwd FLOPs per GPU: 8.20 G (GigaFLOPs) -``` - -**What this means:** -- One forward pass requires 8.2 billion floating-point operations -- This is fixed for ResNet50 at this batch size -- Doubling batch size doubles FLOPs - -#### 2. Forward Pass FLOPS (Throughput) - -``` -fwd FLOPS per GPU: 779.47 GFLOPS -``` - -**What this means:** -- GPU is executing 779 billion FLOPs per second during forward pass -- This is achieved performance, not theoretical - -#### 3. Efficiency Calculation - -``` -Theoretical peak (MI325X FP32): ~163,000 GFLOPS (163 TFLOPS) -Achieved: 779.47 GFLOPS -Efficiency: (779.47 / 163,000) × 100% = 0.48% -``` - -**Wait, only 0.48%?! Is this bad?** - -Not necessarily! 
Here's why: - -- **Small batch size:** BS=32 doesn't saturate the GPU -- **Mixed operations:** Not all operations are compute-intensive -- **Memory bound:** Some operations are limited by memory bandwidth, not compute - -Let's verify this with a larger batch: - -### 6.7 Batch Size Impact on Efficiency - -Run with different batch sizes: - -```bash -# Small batch -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 16 --iterations 20 --flops-prof-step 10 | grep "fwd FLOPS" - -# Medium batch -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --flops-prof-step 10 | grep "fwd FLOPS" - -# Large batch -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 20 --flops-prof-step 10 | grep "fwd FLOPS" -``` - -**Create a table:** - -| Batch Size | FLOPs per Forward (G) | Achieved GFLOPS | Efficiency (%) | -|------------|----------------------|-----------------|----------------| -| 16 | ? | ? | ? | -| 32 | 8.20 | 779.47 | 0.48% | -| 64 | ? | ? | ? | -| 128 | ? | ? | ? 
| - -**What pattern do you see?** -- Larger batches → Higher achieved GFLOPS -- FLOPs per forward increases linearly with batch size -- Efficiency improves with batch size - -### 6.8 FP16 FLOPS Comparison - -Let's see how FP16 affects FLOPs efficiency: - -```bash -# FP32 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --fp16 0 --flops-prof-step 10 | grep "fwd FLOPS" - -# FP16 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --fp16 1 --flops-prof-step 10 | grep "fwd FLOPS" -``` - -**Compare:** -``` -FP32 achieved GFLOPS: ___________ GFLOPS -FP16 achieved GFLOPS: ___________ GFLOPS - -FP32 peak (MI325X): 163 TFLOPS -FP16 peak (MI325X): 653 TFLOPS - -FP32 efficiency: (___________ / 163,000) × 100% = ___________% -FP16 efficiency: (___________ / 653,000) × 100% = ___________% -``` - -### 6.9 Identifying Bottleneck Layers - -From the detailed profile, look at "fwd latency": - -``` -Top modules by forward latency: - 10.52 ms | ResNet (total) - 3.67 ms | layer3 (34.89% of total!) - 2.34 ms | layer2 (22.24% of total) - 1.54 ms | layer4 - 1.45 ms | layer1 -``` - -**Analysis:** -- **layer3 is the bottleneck** (35% of forward time!) 
-- This makes sense: layer3 has the most FLOPs (1.64 G, 40% of total) -- Optimizing layer3 would have the biggest impact - -### 6.10 Compute vs Memory Bound Analysis - -Let's determine if ResNet50 is compute-bound or memory-bound: - -#### Arithmetic Intensity Calculation - -``` -Arithmetic Intensity = FLOPs / Bytes Transferred - -For ResNet50 forward pass: -- FLOPs: 8.20 G -- Weights: 25.56 M params × 4 bytes = 102 MB -- Activations: ~4 GB (estimated) -- Total bytes: ~4.1 GB - -Arithmetic Intensity = 8.20 G / 4.1 GB ≈ 2.0 FLOPs/byte -``` - -**Interpretation:** - -``` -Arithmetic Intensity (FLOPs/byte): -< 1: Severely memory-bound -1-10: Memory-bound (typical for ResNet) -10-50: Balanced -> 50: Compute-bound -``` - -**ResNet50 is memory-bound!** This explains the low efficiency. - -**Optimization strategies:** -- Increase batch size (amortize memory transfers) -- Use FP16 (reduce bytes transferred) -- Fuse operations (reduce intermediate tensors) -- Use better memory layouts - -### 6.11 Checkpoint - -Before continuing: - -- [ ] You understand what FLOPs and GFLOPS mean -- [ ] You can measure achieved GFLOPS with DeepSpeed profiler -- [ ] You understand efficiency = achieved / theoretical -- [ ] You know the difference between compute-bound and memory-bound -- [ ] You can identify bottleneck layers -- [ ] You understand why ResNet50 has low efficiency - -**Record your FLOPS results in `my_workshop_results.txt`!** - ---- - -## 7. Exercise 5: Multi-GPU Scaling - -### 7.1 Objective - -Scale your inference workload across multiple GPUs using distributed data parallel. - -**What you'll learn:** -- How to use `torchrun` for multi-GPU execution -- Understanding data parallelism -- Measuring scaling efficiency -- Common multi-GPU issues - -### 7.2 What is Distributed Data Parallel (DDP)? 
- -**Data Parallelism:** -- Split batch across multiple GPUs -- Each GPU has a complete copy of the model -- Process different data on each GPU in parallel -- Combine results at the end - -**Example with 2 GPUs:** -``` -Original batch: 64 samples -├── GPU 0: processes samples 0-31 -└── GPU 1: processes samples 32-63 - -Throughput: ~2x faster (ideally) -``` - -**Key concepts:** -- **World Size:** Total number of processes (= number of GPUs) -- **Rank:** ID of current process (0 to world_size-1) -- **Local Rank:** ID of GPU on current node - -### 7.3 Prerequisites: Check Available GPUs - -```bash -# Check how many GPUs you have -rocm-smi --showid - -# Should show something like: -# GPU[0] : GPU ID: 0 -# GPU[1] : GPU ID: 1 -# ... -``` - -**For this exercise, you need at least 2 GPUs.** - -If you only have 1 GPU, you can still read along and understand the concepts! - -### 7.4 Single GPU Baseline (For Comparison) - -First, establish a single-GPU baseline: - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 64 \ - --iterations 20 -``` - -**Record the throughput:** -``` -Single GPU (BS=64): ___________ samples/sec -``` - -### 7.5 Running with 2 GPUs - -Now let's scale to 2 GPUs: - -```bash -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 128 \ - --iterations 20 -``` - -**Important notes:** -- `--nproc-per-node 2`: Use 2 GPUs -- `--batch-size 128`: Total batch size (64 per GPU) -- `torchrun` automatically splits the batch - -**Expected output:** -``` -**** Launching with torchrun **** -Setting up process group... -[GPU 0] Initializing... -[GPU 1] Initializing... -Process group initialized. - -[GPU 0] Using network: resnet50 -[GPU 0] Local batch size: 64 -[GPU 0] Global batch size: 128 -[GPU 1] Using network: resnet50 -[GPU 1] Local batch size: 64 -[GPU 1] Global batch size: 128 - -Warming up... -[GPU 0] Warmup complete. -[GPU 1] Warmup complete. - -Running timed iterations... 
-[GPU 0] Epoch 0: Loss = 6.9078, Time = 0.063 seconds -[GPU 1] Epoch 0: Loss = 6.9078, Time = 0.063 seconds -... - -======================================== -Performance Summary (GPU 0): -======================================== -Global batch size: 128 -Local batch size: 64 -World size: 2 - -Average time per iteration: 0.063 seconds -Throughput: 2032.5 samples/sec (global) -Per-GPU throughput: 1016.3 samples/sec -GPU Memory Usage: 4523 MB -======================================== -``` - -### 7.6 Analyzing Multi-GPU Results - -Let's calculate scaling efficiency: - -``` -Single GPU: ___________ samples/sec (BS=64) -Two GPUs: ___________ samples/sec (BS=128) - -Ideal 2-GPU: ___________ × 2 = ___________ samples/sec -Actual 2-GPU: ___________ samples/sec - -Scaling efficiency: (Actual / Ideal) × 100% = ___________% -``` - -**Typical results:** -- **Perfect scaling (100%):** Rare! Means no overhead -- **Good scaling (90-95%):** Common for large batches -- **Moderate scaling (80-90%):** Typical for medium batches -- **Poor scaling (<80%):** Communication overhead, small batches - -### 7.7 Scaling Factors: What Affects Efficiency? - -#### 1. Batch Size Per GPU - -```bash -# Small batch per GPU (32) -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 - -# Medium batch per GPU (64) -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 20 - -# Large batch per GPU (128) -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 256 --iterations 20 -``` - -**Create a table:** - -| Batch per GPU | Total Batch | 1-GPU Throughput | 2-GPU Throughput | Scaling Efficiency | -|---------------|-------------|------------------|------------------|--------------------| -| 32 | 64 | ? | ? | ?% | -| 64 | 128 | ? | ? | ?% | -| 128 | 256 | ? | ? | ?% | - -**Pattern:** -- Larger batches → Better scaling efficiency -- Why? 
Communication overhead is amortized - -#### 2. Model Size - -```bash -# Small model (ResNet18) -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet18 --batch-size 128 --iterations 20 - -# Medium model (ResNet50) -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 20 - -# Large model (ResNet152) -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet152 --batch-size 128 --iterations 20 -``` - -**Observation:** -- Larger models scale better -- Why? More computation relative to communication - -### 7.8 Running with 4 GPUs (If Available) - -If you have 4+ GPUs: - -```bash -# 4 GPUs -torchrun --nproc-per-node 4 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 256 \ - --iterations 20 -``` - -**Scaling analysis:** - -| GPUs | Batch Size | Throughput | Ideal | Efficiency | -|------|------------|------------|-------|------------| -| 1 | 64 | ___ | ___ | 100% | -| 2 | 128 | ___ | ___ | ___% | -| 4 | 256 | ___ | ___ | ___% | - -**Typical pattern:** -- 1 → 2 GPUs: 90-95% efficiency -- 2 → 4 GPUs: 85-90% efficiency -- Efficiency decreases with more GPUs (communication overhead) - -### 7.9 Common Multi-GPU Issues - -#### Issue 1: "RuntimeError: NCCL error" - -```bash -# Solution 1: Check GPU visibility -export ROCR_VISIBLE_DEVICES=0,1 - -# Solution 2: Set NCCL debug level -export NCCL_DEBUG=INFO -``` - -#### Issue 2: "OOM on some GPUs but not others" - -**Cause:** Imbalanced workload or initialization - -```bash -# Check memory on all GPUs -rocm-smi - -# Should be similar across GPUs -``` - -#### Issue 3: "Very poor scaling (<50%)" - -**Possible causes:** -- Batch size too small per GPU -- High communication overhead -- CPU bottleneck -- Slow interconnect - -**Debug steps:** -```bash -# 1. Profile a single GPU -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 - -# 2. 
Check if single GPU is efficient -# If single GPU is slow, fix that first! - -# 3. Increase batch size per GPU -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 256 --iterations 20 -``` - -#### Issue 4: "Hangs at initialization" - -```bash -# Check if processes can communicate -export NCCL_DEBUG=INFO -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 2 - -# Look for NCCL initialization messages -# If stuck, check firewall, network, GPU interconnect -``` - -### 7.10 Best Practices for Multi-GPU Inference - -**1. Batch Size:** -- Use largest batch that fits in memory per GPU -- Larger batches = better scaling - -**2. Model Loading:** -- Load model once, copy to all GPUs -- Don't load from disk on each GPU (slow!) - -**3. Data Loading:** -- Use multiple workers for data loading -- Pre-fetch batches to avoid GPU idle time - -**4. Warmup:** -- Always warmup before timing -- First iteration compiles kernels - -**5. Synchronization:** -- Use `torch.cuda.synchronize()` when timing -- Otherwise you measure launch time, not execution time - -### 7.11 Checkpoint - -Before continuing: - -- [ ] You can use `torchrun` for multi-GPU execution -- [ ] You understand batch splitting in DDP -- [ ] You can calculate scaling efficiency -- [ ] You understand factors affecting scaling -- [ ] You know how to debug common multi-GPU issues - -**Record your multi-GPU results in `my_workshop_results.txt`!** - ---- - -## 8. Exercise 6: PyTorch 2.0 Compilation - -### 8.1 Objective - -Use PyTorch 2.0's `torch.compile` to automatically optimize your model. - -**What you'll learn:** -- What is torch.compile and how it works -- Different compilation modes -- Measuring speedup from compilation -- When compilation helps (and when it doesn't) - -### 8.2 What is torch.compile? 
- -**PyTorch 2.0 introduced `torch.compile`:** -- Analyzes your model's computation graph -- Applies graph-level optimizations -- Generates optimized GPU kernels -- No code changes required! - -**How it works:** -``` -1. Trace your model: Record operations -2. Optimize graph: Fuse operations, eliminate redundancy -3. Generate kernels: Compile optimized CUDA/ROCm code -4. Execute: Run optimized version -``` - -**Potential speedups:** -- Operator fusion (reduce kernel launches) -- Memory layout optimization -- Kernel specialization -- Dead code elimination - -### 8.3 Baseline (No Compilation) - -First, run without compilation: - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 64 \ - --iterations 20 -``` - -**Record baseline:** -``` -No compilation: ___________ samples/sec -``` - -### 8.4 Default Compilation Mode - -Now enable compilation with default settings: - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 64 \ - --iterations 20 \ - --compile -``` - -**Note:** First run will be SLOW (compilation time!) - -**Expected output:** -``` -Using network: resnet50 -Batch size: 64 -Iterations: 20 -PyTorch Compile: ENABLED (mode=default) - -Compiling model... -[Compiling...] This may take 1-2 minutes on first run... -[COMPILE] Tracing model... -[COMPILE] Optimizing graph... -[COMPILE] Generating kernels... -Compilation complete. - -Warming up... -Warmup complete. - -Running timed iterations... -Epoch 0: Loss = 6.9078, Time = 0.058 seconds -... - -======================================== -Performance Summary: -======================================== -Throughput: 1103.4 samples/sec -Compilation time: 87.3 seconds (first run only) -======================================== -``` - -### 8.5 Understanding Compilation Overhead - -**First run:** -- Slow! Compilation takes 1-3 minutes -- Not included in performance measurements - -**Subsequent runs:** -- Fast! 
Cached kernels are reused -- No recompilation needed - -**When is this worth it?** -- Production deployments (compile once, run millions of times) -- Long-running inference servers -- Batch processing large datasets - -**When is it NOT worth it?** -- Single inference runs -- Prototyping -- Frequently changing models - -### 8.6 Compilation Modes - -PyTorch 2.0 has different compilation modes: - -#### Mode 1: default (Conservative) - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 64 \ - --iterations 20 \ - --compile -``` - -**Characteristics:** -- Fast compilation -- Safe optimizations -- Moderate speedup - -#### Mode 2: reduce-overhead - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 64 \ - --iterations 20 \ - --compile \ - --compileContext "{'mode': 'reduce-overhead'}" -``` - -**Characteristics:** -- Focus on reducing Python overhead -- Faster for many small operations -- Good for models with lots of layers - -#### Mode 3: max-autotune (Aggressive) - -```bash -python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 64 \ - --iterations 20 \ - --compile \ - --compileContext "{'mode': 'max-autotune'}" -``` - -**Characteristics:** -- VERY slow compilation (5-10 minutes!) -- Tries many kernel variants -- Benchmarks each variant -- Selects fastest -- Best runtime performance - -**Expected output:** -``` -[COMPILE] Mode: max-autotune -[COMPILE] Testing kernel variant 1/53... -[COMPILE] Testing kernel variant 2/53... -[COMPILE] Testing kernel variant 3/53... -... -[COMPILE] Best kernel selected: variant 27 -Compilation complete (took 347.2 seconds). - -Throughput: 1287.5 samples/sec ← Even faster! 
-``` - -### 8.7 Comparing Compilation Modes - -Run all modes and compare: - -```bash -# No compilation -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 > results_no_compile.txt - -# Default mode -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --compile > results_default.txt - -# Reduce overhead -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --compile --compileContext "{'mode': 'reduce-overhead'}" > results_reduce_overhead.txt - -# Max autotune (WARNING: This takes 5-10 minutes!) -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --compile --compileContext "{'mode': 'max-autotune'}" > results_max_autotune.txt -``` - -**Extract throughput:** -```bash -grep "Throughput" results_no_compile.txt -grep "Throughput" results_default.txt -grep "Throughput" results_reduce_overhead.txt -grep "Throughput" results_max_autotune.txt -``` - -**Create comparison table:** - -| Mode | Compilation Time | Throughput | Speedup | -|------|------------------|------------|---------| -| No compile | 0 seconds | ___ samples/sec | 1.0x | -| default | ___ seconds | ___ samples/sec | ___x | -| reduce-overhead | ___ seconds | ___ samples/sec | ___x | -| max-autotune | ___ seconds | ___ samples/sec | ___x | - -**Typical results:** -- default: 1.1-1.2x speedup -- reduce-overhead: 1.1-1.3x speedup -- max-autotune: 1.2-1.4x speedup - -### 8.8 When Does Compilation Help Most? 
- -Let's test different models: - -```bash -# ResNet18 (small model) -python3 micro_benchmarking_pytorch.py --network resnet18 --batch-size 64 --iterations 20 -python3 micro_benchmarking_pytorch.py --network resnet18 --batch-size 64 --iterations 20 --compile - -# ResNet50 (medium model) -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --compile - -# ResNet152 (large model) -python3 micro_benchmarking_pytorch.py --network resnet152 --batch-size 64 --iterations 20 -python3 micro_benchmarking_pytorch.py --network resnet152 --batch-size 64 --iterations 20 --compile -``` - -**Pattern:** -- Deeper models (more layers) → More benefit from compilation -- Why? More opportunities for fusion and optimization - -### 8.9 Compilation + FP16 - -Let's combine compilation with FP16: - -```bash -# FP32 no compile -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --fp16 0 - -# FP32 with compile -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --fp16 0 --compile - -# FP16 no compile -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --fp16 1 - -# FP16 with compile -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --fp16 1 --compile -``` - -**Comparison table:** - -| Configuration | Throughput | Speedup vs FP32 No Compile | -|---------------|------------|---------------------------| -| FP32, No compile | ___ | 1.0x | -| FP32, Compiled | ___ | ___x | -| FP16, No compile | ___ | ___x | -| FP16, Compiled | ___ | ___x | - -**Best combination:** FP16 + max-autotune compilation! 
- -### 8.10 Common Compilation Issues - -#### Issue 1: "RuntimeError: Compiled function failed" - -**Cause:** Compilation doesn't support some operations - -**Solution:** -```bash -# Disable compilation for troubleshooting -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 -``` - -#### Issue 2: "Very slow compilation (>10 minutes)" - -**Cause:** max-autotune mode tests many variants - -**Solution:** -- Use `default` mode for faster compilation -- Only use `max-autotune` for production -- Be patient! It's worth it for long-running inference - -#### Issue 3: "No speedup from compilation" - -**Possible causes:** -- Model already well-optimized -- Bottleneck is memory, not compute -- Batch size too small - -**Debug:** -```bash -# Try larger batch -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 20 --compile - -# Try different model -python3 micro_benchmarking_pytorch.py --network efficientnet_b0 --batch-size 64 --iterations 20 --compile -``` - -### 8.11 Checkpoint - -Before continuing: - -- [ ] You understand what torch.compile does -- [ ] You can enable compilation with `--compile` -- [ ] You know the different compilation modes -- [ ] You understand compilation overhead (first run) -- [ ] You can combine compilation with FP16 -- [ ] You know when compilation helps most - -**Record your compilation results in `my_workshop_results.txt`!** - ---- - -## 9. Exercise 7: ROCm Profiler Integration - -### 9.1 Objective - -Use ROCm-specific profilers for deep kernel-level analysis. 
- -**What you'll learn:** -- Using `rocprof` for kernel statistics -- Using `rocprofv2` for timeline visualization -- Interpreting kernel-level metrics -- Identifying GPU inefficiencies - -### 9.2 ROCm Profiling Tools Overview - -| Tool | Purpose | Output | -|------|---------|--------| -| **rocprof** | Kernel statistics (CSV) | Execution times, call counts | -| **rocprofv2** | Timeline visualization | JSON for Perfetto UI | -| **rocprof-compute** | Hardware counters | Memory bandwidth, occupancy | - -**When to use each:** -1. Start with manual timing (Exercise 1) -2. Use PyTorch Profiler for operator-level (Exercise 3) -3. Use `rocprof` for kernel statistics (this exercise) -4. Use `rocprofv2` for timeline analysis (this exercise) -5. Use `rocprof-compute` for advanced optimization (advanced users) - -### 9.3 Using rocprof for Kernel Statistics - -#### Step 1: Run with rocprof - -```bash -rocprof --stats python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 10 -``` - -**Note:** Reduced iterations to keep profile size manageable - - - -**Expected output:** -``` -ROCProfiler: Profiling enabled -Profiling output will be in: results.csv - -Running benchmark... -[... normal benchmark output ...] - -Profiling complete. -Results saved to: results.csv -``` - -#### Step 2: Examine the results - -```bash -# View first 20 lines -head -20 results.csv - -# Or open in spreadsheet program -# LibreOffice, Excel, etc. 
-``` - -**Sample results.csv:** -``` -"Name","Calls","TotalDurationNs","AverageNs","Percentage" -"Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW8_GSU1_GSUASB_GLS0_ISA1100_IU1_K1_KLA_LBSPP0_LPA0_LPB8_LDL1_LRVW16_LWPMn1_LDW0_FMA_MIAV1_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PC0_PGR1_PLR1_RK0_SIA1_SS1_SU32_SUM0_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT8_64_TLDS1_USFGROn1_VAW2_VSn1_VW4_WSGRA1_WSGRB1_WS64_WG32_16_1_WGM8",42,"2476543000","58965310","45.67%" -"void at::native::(anonymous namespace)::batch_norm_collect_statistics_kernel(at::native::(anonymous namespace)::BatchNormCollectStatisticsKernelParams)",80,"523456000","6543200","9.65%" -"void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, at::detail::Array >(int, at::native::BinaryFunctor >, at::detail::Array)",320,"387234000","1210106","7.14%" -... -``` - -#### Step 3: Analyze kernel statistics - -```bash -# Sort by total duration (slowest kernels) -sort -t',' -k3 -nr results.csv | head -20 - -# Count total kernel launches -wc -l results.csv - -# Find memory copy operations -grep -i "memcpy" results.csv -``` - -### 9.4 Understanding Kernel Statistics - -Let's break down the CSV columns: - -#### 1. Name -- Kernel function name -- Long, mangled names (C++ name mangling) -- Look for keywords: `conv`, `gemm`, `batch_norm`, `relu` - -#### 2. Calls -- Number of times kernel was launched -- High call count might indicate opportunity for fusion - -#### 3. TotalDurationNs -- Total time spent in this kernel (nanoseconds) -- Sort by this to find bottlenecks! - -#### 4. AverageNs -- Average time per kernel launch -- `TotalDurationNs / Calls` - -#### 5. 
Percentage -- Percentage of total GPU time -- Sum of top 5-10 kernels often 80-90% of total time - -### 9.5 Hands-On Analysis - -Using your `results.csv`, answer: - -**Question 1:** What is the slowest kernel? -``` -Name: _______________________________________ -Total Duration: _____________ ms (divide ns by 1,000,000) -Percentage: _____________% -``` - -**Question 2:** How many total kernel launches? -``` -Total kernels: _____________ (use: wc -l results.csv) -``` - -**Question 3:** What percentage of time is spent in top 5 kernels? -``` -Kernel 1: ______________% -Kernel 2: ______________% -Kernel 3: ______________% -Kernel 4: ______________% -Kernel 5: ______________% - -Total: ______________% -``` - -**Question 4:** Are there memory copy operations? -``` -grep -i "memcpy" results.csv - -Found: _______ memcpy operations -Total time: _______ ms -Percentage: _______% -``` - -**Interpretation:** -- If memcpy > 10%: Memory transfer is a bottleneck -- If memcpy < 5%: Compute-bound, memory transfers are efficient - -### 9.6 Comparing FP32 vs FP16 Kernels - -Let's see how kernels differ: - -```bash -# FP32 -rocprof --stats -o profile_fp32.csv python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 10 --fp16 0 - -# FP16 -rocprof --stats -o profile_fp16.csv python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 10 --fp16 1 -``` - -#### Compare kernel counts: - -```bash -# FP32 kernel count -wc -l profile_fp32.csv - -# FP16 kernel count -wc -l profile_fp16.csv -``` - -#### Compare slowest kernel: - -```bash -# FP32 slowest -sort -t',' -k3 -nr profile_fp32.csv | head -2 - -# FP16 slowest -sort -t',' -k3 -nr profile_fp16.csv | head -2 -``` - -**Create comparison:** -``` -FP32: - Total kernels: _____________ - Slowest kernel: _____________ ms - -FP16: - Total kernels: _____________ - Slowest kernel: _____________ ms - -Speedup: _____________ / _____________ = _____________x -``` - -### 9.7 Using rocprofv2 for 
Timeline Visualization - -Now let's create a timeline visualization: - -```bash -rocprofv2 --kernel-trace -o timeline.json python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 5 -``` - -**Note:** Only 5 iterations to keep file size small - -**Expected output:** -``` -ROCProfiler v2: Timeline tracing enabled -Output file: timeline.json - -Running benchmark... -[... normal benchmark output ...] - -Timeline saved to: timeline.json -File size: 23.4 MB - -View at: https://ui.perfetto.dev -``` - -#### Step 2: Visualize the timeline - -1. **Open Chrome browser** -2. **Go to:** `https://ui.perfetto.dev` -3. **Click "Open trace file"** -4. **Select `timeline.json`** - -**You'll see a timeline view!** - -### 9.8 Interpreting the Timeline - -The timeline shows: - -**X-axis:** Time (microseconds) -**Y-axis:** Different "tracks": -- CPU threads -- GPU streams -- Kernel executions -- Memory copies - -**What to look for:** - -#### 1. GPU Idle Time (Gaps) -``` -Good: ████████████████████████████████ (No gaps, fully utilized) -Bad: ███ ██ ███ ██ ███ ██ ███ (Lots of gaps, idle time) -``` - -**If you see gaps:** -- CPU bottleneck (slow data loading, Python overhead) -- Synchronization issues -- Small batch size - -#### 2. Kernel Duration Variance -``` -Good: ████ ████ ████ ████ ████ (Consistent duration) -Bad: █ ████ ██ ████████ █ ████ (Highly variable) -``` - -**If highly variable:** -- Different batch sizes -- Conditional execution -- Autotuning happening - -#### 3. Memory Copies -``` -Look for: Memcpy HtoD (Host to Device) - Memcpy DtoH (Device to Host) -``` - -**If significant:** -- Consider pinned memory -- Use async copies -- Overlap compute and transfer - -#### 4. 
Kernel Launch Overhead -``` -Measure gap between kernel end and next kernel start -``` - -**If large gaps (>10μs):** -- Kernel fusion opportunity -- CPU-side overhead - -### 9.9 Advanced: rocprof-compute Metrics - -For advanced users, `rocprof-compute` provides hardware counters: - -```bash -rocprof-compute profile -w profile.csv python3 micro_benchmarking_pytorch.py \ - --network resnet50 \ - --batch-size 32 \ - --iterations 5 -``` - -**Metrics available:** -- Memory bandwidth utilization (%) -- GPU occupancy (%) -- Cache hit rates -- Arithmetic intensity -- Wave occupancy - -**Example metrics:** -``` -LDS Bank Conflicts: 234 -L2 Cache Hit Rate: 87.5% -Memory Bandwidth Util: 72.3% -Wave Occupancy: 45.2% -``` - -**Interpretation:** -- Memory bandwidth > 80%: Memory-bound -- Occupancy < 30%: Poor kernel utilization -- Cache hit < 70%: Poor memory access patterns - -### 9.10 Checkpoint - -Before continuing: - -- [ ] You can use `rocprof --stats` for kernel statistics -- [ ] You can identify slowest kernels -- [ ] You can count kernel launches -- [ ] You can use `rocprofv2` for timeline visualization -- [ ] You can interpret timeline traces -- [ ] You understand GPU idle time, gaps, and kernel duration - -**Record your profiling insights in `my_workshop_results.txt`!** - ---- - -## 10. Wrap-up & Best Practices - -### 10.1 Workshop Summary - -Congratulations! You've completed the ROCm PyTorch Inference Benchmark Workshop! - -**What you've learned:** - -1. **Environment Setup** - - Verify ROCm, PyTorch, GPUs - - Run standardized benchmarks - -2. **Benchmark Tool Mastery** - - Use `micro_benchmarking_pytorch.py` - - Understand command-line options - - Interpret output metrics - -3. **Precision Optimization** - - FP16 vs FP32 comparison - - 2x speedup, 40% memory reduction - - When to use FP16 - -4. **Framework Profiling** - - PyTorch Profiler for operator-level analysis - - DeepSpeed FLOPS profiler for efficiency - - Identifying bottleneck operations - -5. 
**Multi-GPU Scaling** - - Distributed data parallel with `torchrun` - - Scaling efficiency calculation - - Debugging multi-GPU issues - -6. **Compilation Optimization** - - torch.compile for automatic optimization - - Different compilation modes - - 1.2-1.4x additional speedup - -7. **Hardware Profiling** - - rocprof for kernel statistics - - rocprofv2 for timeline visualization - - Finding GPU inefficiencies - -### 10.2 Performance Optimization Checklist - -Use this checklist for optimizing YOUR models: - -#### Phase 1: Baseline & Measurement -- [ ] Establish baseline performance (no optimizations) -- [ ] Use manual timing with `torch.cuda.synchronize()` -- [ ] Record throughput, latency, memory usage -- [ ] Run multiple iterations for stable measurements - -#### Phase 2: Low-Hanging Fruit -- [ ] Use FP16 if model supports it (2x speedup typical) -- [ ] Increase batch size to maximum (better GPU utilization) -- [ ] Enable `torch.compile` with default mode (1.2x speedup typical) -- [ ] Use `model.eval()` and `torch.no_grad()` for inference - -#### Phase 3: Profiling -- [ ] PyTorch Profiler: Identify slow operators -- [ ] rocprof: Find bottleneck kernels -- [ ] rocprofv2: Visualize timeline, find idle time -- [ ] DeepSpeed FLOPS: Calculate efficiency - -#### Phase 4: Optimization -- [ ] If memory-bound (<20% efficiency): - - Increase batch size - - Use FP16 - - Fuse operations - - Optimize memory layout - -- [ ] If compute-bound (>40% efficiency): - - Use specialized kernels (cuDNN/MIOpen) - - Try custom Triton kernels - - Use torch.compile max-autotune - -- [ ] If CPU-bound (gaps in timeline): - - Use data loading workers - - Pre-allocate tensors - - Reduce Python overhead - - Use JIT compilation - -#### Phase 5: Validation -- [ ] Re-measure performance -- [ ] Verify numerical accuracy (compare outputs) -- [ ] Test with different batch sizes -- [ ] Ensure consistent results (low std dev) - -#### Phase 6: Scaling (If Multi-GPU) -- [ ] Test single GPU first -- [ ] 
Scale to 2, 4, 8 GPUs -- [ ] Calculate scaling efficiency -- [ ] Optimize batch size per GPU - -### 10.3 Common Pitfalls and How to Avoid Them - -#### Pitfall 1: Not Using torch.cuda.synchronize() - -**Problem:** -```python -start = time.time() -output = model(input) -end = time.time() # WRONG! GPU is still running -``` - -**Solution:** -```python -start = time.time() -output = model(input) -torch.cuda.synchronize() # Wait for GPU to finish -end = time.time() -``` - -#### Pitfall 2: Including Warmup in Measurements - -**Problem:** -```python -for i in range(20): - output = model(input) -# Average includes slow first iteration -``` - -**Solution:** -```python -# Warmup -for i in range(5): - output = model(input) -torch.cuda.synchronize() - -# Timed iterations -start = time.time() -for i in range(20): - output = model(input) -torch.cuda.synchronize() -end = time.time() # Excludes warmup -``` - -#### Pitfall 3: Batch Size Too Small - -**Problem:** -- Low GPU utilization -- High kernel launch overhead -- Poor performance - -**Solution:** -- Increase batch size -- Profile to find optimal batch size -- Trade-off: Larger batch = more memory, higher throughput - -#### Pitfall 4: Ignoring Numerical Accuracy - -**Problem:** -- FP16 causes NaN or Inf -- Model outputs are wrong -- Silent numerical errors - -**Solution:** -```python -# Always verify outputs -output_fp32 = model_fp32(input) -output_fp16 = model_fp16(input) - -diff = (output_fp32 - output_fp16).abs().max() -print(f"Max difference: {diff}") # Should be < 0.01 -``` - -#### Pitfall 5: Over-Optimizing Small Operations - -**Problem:** -- Spend hours optimizing 2% of runtime -- Ignore operations that take 80% of time - -**Solution:** -- Profile first! 
-- Focus on bottlenecks (top 80% of time) -- Use Pareto principle: 20% of operations take 80% of time - -### 10.4 When to Use Each Technique - -| Technique | Speedup | Effort | When to Use | -|-----------|---------|--------|-------------| -| FP16 | 2x | Low (1 line) | Almost always for inference | -| Larger batch | 1.5-3x | Low | When memory allows | -| torch.compile | 1.2-1.4x | Low (1 line) | Production deployments | -| Multi-GPU | Nx | Medium | Large throughput requirements | -| Custom kernels | 2-10x | High | Bottleneck operations | -| Model optimization | 2-5x | High | Production, critical latency | - -### 10.5 Real-World Deployment Recommendations - -#### For Production Inference: - -1. **Model Optimization:** - - Use FP16 or INT8 quantization - - Compile with max-autotune mode - - Prune unnecessary operations - -2. **Batch Processing:** - - Use largest batch size that meets latency requirements - - Implement dynamic batching (combine requests) - -3. **Hardware Selection:** - - Profile your specific model on different GPUs - - Consider memory requirements - - Calculate cost per inference - -4. **Monitoring:** - - Track throughput, latency, memory usage - - Set up alerts for performance degradation - - Log profiling data periodically - -5. 
**Optimization Cycle:** - - Measure → Analyze → Optimize → Validate - - Repeat as workload changes - - Keep profiling infrastructure in place - -### 10.6 Resources for Further Learning - -#### Official Documentation -- **ROCm Documentation:** https://rocm.docs.amd.com/ -- **PyTorch Profiler:** https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html -- **DeepSpeed:** https://www.deepspeed.ai/tutorials/flops-profiler/ -- **torch.compile:** https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html - -#### Profiling Tools -- **rocprof Guide:** https://rocm.docs.amd.com/projects/rocprofiler/en/latest/ -- **rocprofv2:** https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/ -- **Perfetto UI:** https://ui.perfetto.dev - -#### Community -- **PyTorch Forums:** https://discuss.pytorch.org/ -- **ROCm GitHub:** https://github.com/RadeonOpenCompute/ROCm -- **AMD Developer Community:** https://community.amd.com/ - -### 10.7 Next Steps - -**Immediate Actions:** -1. Apply techniques to YOUR models -2. Establish baselines for your workload -3. Create profiling scripts for regular testing -4. Document optimization wins - -**Short-term (1-2 weeks):** -1. Deep-dive into your bottleneck operations -2. Try custom optimizations (if needed) -3. Test multi-GPU scaling (if applicable) -4. Implement monitoring - -**Long-term (1-3 months):** -1. Build optimization into CI/CD -2. Create performance regression tests -3. Track performance over time -4. Share learnings with team - -### 10.8 Workshop Feedback - -**Please provide feedback on:** - -1. **What worked well?** - - Which exercises were most valuable? - - What concepts were clearest? - -2. **What could be improved?** - - Which parts were confusing? - - What needs more detail? - -3. **What's missing?** - - Topics you wanted to cover? - - Tools or techniques? - -4. **Overall experience:** - - Pacing (too fast/slow)? - - Difficulty level? - - Practical applicability? 
- -### 10.9 Final Checklist - -Before leaving the workshop: - -- [ ] All exercises completed -- [ ] Results recorded in `my_workshop_results.txt` -- [ ] Understood key concepts (FP16, profiling, multi-GPU, compilation) -- [ ] Know how to profile YOUR models -- [ ] Have resources for further learning -- [ ] Can apply techniques to production workloads - -### 10.10 Thank You! - -**Congratulations on completing the ROCm PyTorch Inference Benchmark Workshop!** - -You now have the skills to: -- Benchmark AI models systematically -- Use profiling tools to find bottlenecks -- Apply optimization techniques -- Scale workloads across GPUs -- Measure and validate improvements - -**Go forth and optimize!** - ---- - -## Appendix A: Quick Reference Commands - -### Basic Benchmarking -```bash -# Single GPU, FP32 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 - -# Single GPU, FP16 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --fp16 1 - -# Multi-GPU -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 --iterations 20 - -# With compilation -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 20 --compile -``` - -### Profiling -```bash -# PyTorch Profiler -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 10 --autograd-profiler - -# DeepSpeed FLOPS -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 20 --flops-prof-step 10 - -# rocprof statistics -rocprof --stats python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 10 - -# rocprofv2 timeline -rocprofv2 --kernel-trace -o timeline.json python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 5 -``` - -### System Checks -```bash -# Check ROCm -rocminfo | grep "Name:" - -# Check GPU -rocm-smi - -# Check PyTorch -python3 -c "import torch; 
print(torch.__version__); print(torch.cuda.is_available())" - -# Check GPU memory -rocm-smi --showmeminfo vram -``` - ---- - -## Appendix B: Troubleshooting Guide - -### GPU Not Detected -```bash -# Check GPU visibility -rocminfo | grep "Name:" - -# Check permissions -sudo usermod -aG video $USER -sudo usermod -aG render $USER -# Logout and login - -# Verify -groups | grep video -``` - -### Out of Memory (OOM) -```bash -# Reduce batch size -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 16 --iterations 20 - -# Use FP16 -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 20 --fp16 1 - -# Clear cache -python3 -c "import torch; torch.cuda.empty_cache()" -``` - -### Poor Performance -```bash -# Check GPU utilization during run -watch -n 0.5 rocm-smi - -# Should show ~100% utilization -# If low, check: -# 1. Batch size too small -# 2. CPU bottleneck -# 3. Thermal throttling -``` - -### Inconsistent Results -```bash -# Increase iterations for better averaging -python3 micro_benchmarking_pytorch.py --network resnet50 --batch-size 32 --iterations 50 - -# Check for system interference -top -# Look for other processes using CPU/GPU -``` - ---- - -**End of Workshop Guide** - - -**Exercises Completed:** 7 major exercises -**Skills Acquired:** GPU benchmarking, profiling, optimization - -**Now go optimize your models!** diff --git a/MLExamples/inference_benchmark/README.md b/MLExamples/inference_benchmark/README.md deleted file mode 100644 index fa52322e..00000000 --- a/MLExamples/inference_benchmark/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# pytorch-micro-benchmarking -We supply a small microbenchmarking script for PyTorch training on ROCm. 
- -To execute: -`python micro_benchmarking_pytorch.py --network [--batch-size ] [--iterations ] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids ] ` - -Possible network names are: `alexnet`, `densenet121`, `inception_v3`, `resnet50`, `resnet101`, `SqueezeNet`, `vgg16` etc. - -Default are 10 training iterations, `fp16` off (i.e., 0), and a batch size of 64. - -For mGPU runs, use one of the following methods. -- `torchrun`: It will spawn multiple sub-processes for each of the GPUs and adjust `world_size` and `rank` accordingly. `torchrun` also defaults to using distributed dataparallel. -- `--distributed_dataparallel`: Uses torch.nn.parallel.DistributedDataParallel to run multiple processes/node. However, the script only launches one process per GPU, multiple processes need to be launched manually. See example below. - -_NOTE_: `--distributed_dataparallel` option will be deprecated in the future as this path can be exercised now with `torchrun`. -_NOTE_: If comparing `--distributed_dataprallel` performance with `torchrun` one, you need to multiply the `--batch-size` with number of nodes in the `torchrun` command. `torchrun` will split the batch size into mini batches that run on each of the nodes. `--distributed_dataparallel` doesn't do that automatically, it run with whatever the user provides. 
- -Examples: -- for a 1-GPU resnet50 run: -``` -python3 micro_benchmarking_pytorch.py --network resnet50 -``` - -- for a 2-GPU run on a single node using `torchrun`: -``` -torchrun --nproc-per-node 2 micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 - -``` - -- for a 2-GPU run on a single node using `--distributed_dataparallel`: -``` -python3 micro_benchmarking_pytorch.py --device_ids=0 --network resnet50 --distributed_dataparallel --rank 0 --world-size 2 --dist-backend nccl --dist-url tcp://127.0.0.1:4332 --batch-size 64 & -python3 micro_benchmarking_pytorch.py --device_ids=1 --network resnet50 --distributed_dataparallel --rank 1 --world-size 2 --dist-backend nccl --dist-url tcp://127.0.0.1:4332 --batch-size 64 & -``` - - -To run FlopsProfiler (with deepspeed.profiling.flops_profiler imported): -`python micro_benchmarking_pytorch.py --network resnet50 --amp-opt-level=2 --batch-size=256 --iterations=20 --flops-prof-step 10` - -## Performance tuning -If performance on a specific card and/or model is found to be lacking, typically some gains can be made by tuning MIOpen. For this, `export MIOPEN_FIND_ENFORCE=3` prior to running the model. This will take some time if untuned configurations are encountered and write to a local performance database. More information on this can be found in the [MIOpen documentation](https://rocm.github.io/MIOpen/doc/html/perfdatabase.html). - -## PyTorch 2.0 -Added the `--compile` option opens up PyTorch 2.0 capabilities, which comes with several options. Here are some notes from upstream: -``` - Optimizes given model/function using TorchDynamo and specified backend. 
- - Args: - model (Callable): Module/function to optimize - fullgraph (bool): Whether it is ok to break model into several subgraphs - dynamic (bool): Use dynamic shape tracing - backend (str or Callable): backend to be used - mode (str): Can be either "default", "reduce-overhead" or "max-autotune" - options (dict): A dictionary of options to pass to the backend. - disable (bool): Turn torch.compile() into a no-op for testing - - Example:: - - @torch.compile(options={"matmul-padding": True}, fullgraph=True) - def foo(x): - return torch.sin(x) + torch.cos(x) -``` - -With the required `--compile` option, these additional options are now available from the command line with the `--compileContext` flag. Here are a few examples: - -```bash -python micro_benchmarking_pytorch.py --network resnet50 --compile # default run -``` - -```bash -python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'mode': 'max-autotune', 'fullgraph': 'True'}" -``` - -```bash -python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}" -``` -Note: you cannot pass the `mode` and `options` options together. diff --git a/MLExamples/inference_benchmark/ATTRIBUTION.md b/MLExamples/pytorch_microbench/ATTRIBUTION.md similarity index 100% rename from MLExamples/inference_benchmark/ATTRIBUTION.md rename to MLExamples/pytorch_microbench/ATTRIBUTION.md diff --git a/MLExamples/pytorch_microbench/INFERENCE_BENCHMARK_NOTES.md b/MLExamples/pytorch_microbench/INFERENCE_BENCHMARK_NOTES.md new file mode 100644 index 00000000..3676243f --- /dev/null +++ b/MLExamples/pytorch_microbench/INFERENCE_BENCHMARK_NOTES.md @@ -0,0 +1,93 @@ +# PyTorch Micro-Benchmark Notes + +This file collects a few technical notes that are useful when varying the default benchmark case described in `README.md`. 
+ +## Mixed precision and compilation + +Mixed precision can be enabled with: + +```bash +python micro_benchmarking_pytorch.py --network densenet121 --batch-size 64 --iterations 10 --fp16 1 +``` + +Compilation can be enabled with: + +```bash +python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 --compile +``` + +For short runs, the one-time compile cost may dominate the reported timing, so a compiled case may appear slower than the eager baseline even when the steady-state behavior is better. When the goal is steady-state comparison, use a larger iteration count. + +Additional compile options may be passed through `--compileContext`, for example: + +```bash +python micro_benchmarking_pytorch.py \ + --network resnet50 \ + --batch-size 64 \ + --iterations 20 \ + --compile \ + --compileContext "{'mode': 'max-autotune', 'fullgraph': 'True'}" +``` + +## MIOpen tuning + +On systems that use MIOpen, it can be useful to allow the library to tune and cache its convolution choices before comparing results: + +```bash +export MIOPEN_FIND_ENFORCE=3 +python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 +``` + +The first run may spend additional time building the performance database. Subsequent runs are then more meaningful for comparison. + +## PyTorch profiler options + +The script also supports framework-level profiling: + +```bash +python micro_benchmarking_pytorch.py \ + --network densenet121 \ + --batch-size 2048 \ + --compile \ + --fp16 1 \ + --kineto \ + --iterations 10 +``` + +This path is useful when the goal is to correlate Python-level and operator-level behavior before moving to ROCm tools. 
+
+For ROCTX correlation with ROCm profilers, use:
+
+```bash
+python micro_benchmarking_pytorch.py \
+    --network densenet121 \
+    --batch-size 2048 \
+    --compile \
+    --fp16 1 \
+    --autograd_profiler
+```
+
+## DeepSpeed FLOPS profiling
+
+If DeepSpeed is available, the benchmark can also be run with FLOPS profiling:
+
+```bash
+python micro_benchmarking_pytorch.py \
+    --network densenet121 \
+    --batch-size 2048 \
+    --fp16 1 \
+    --flops-prof-step 10 \
+    --iterations 20
+```
+
+This mode is useful when the question is about model-level efficiency rather than kernel-level execution.
+
+## Multi-GPU runs
+
+For distributed cases, `--batch-size` is the global batch size across all ranks. For example:
+
+```bash
+torchrun --nproc-per-node <num_gpus> micro_benchmarking_pytorch.py --network densenet121 --batch-size 2048 --compile --fp16 1
+```
+
+Each rank processes `batch-size / <num_gpus>` samples. When comparing distributed results, it is important to keep that interpretation in mind.
diff --git a/MLExamples/pytorch_microbench/INFERENCE_BENCHMARK_WORKSHOP_WALKTHROUGH.md b/MLExamples/pytorch_microbench/INFERENCE_BENCHMARK_WORKSHOP_WALKTHROUGH.md
new file mode 100644
index 00000000..e3aa94d8
--- /dev/null
+++ b/MLExamples/pytorch_microbench/INFERENCE_BENCHMARK_WORKSHOP_WALKTHROUGH.md
@@ -0,0 +1,154 @@
+# PyTorch Micro-Benchmark Workshop Guide
+
+The main walkthrough for this directory is the `README.md` file. This note keeps only a short set of exercises that can be completed in one sitting. The intent is to preserve a README-first workflow while still providing a compact lab sequence for training use.
+
+## Preparation
+
+Load the required modules:
+
+```bash
+module load pytorch rocm
+```
+
+Use the default case from the directory scripts unless there is a reason to change it:
+
+```bash
+python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10
+```
+
+Record the reported throughput before collecting any profiler output.
+
+## Exercise 1: Baseline run
+
+Run the benchmark once:
+
+```bash
+python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10
+```
+
+Write down the following quantities:
+
+- throughput in images per second
+- dtype
+- batch size
+- whether `--compile` or `--fp16 1` was used
+
+This baseline gives the reference point for the remaining exercises.
+
+The figure below was generated from fresh container runs with `generate_example_plots.py`. It shows the baseline case together with two follow-up variations used later in this workshop.
+
+![pytorch_microbench example measurements from validated container runs](images/pytorch_microbench_example_runs.png)
+
+## Exercise 2: Runtime trace
+
+Collect a full runtime trace:
+
+```bash
+./get_trace.sh
+```
+
+Open the generated `.pftrace` file in Perfetto:
+
+```text
+https://ui.perfetto.dev/
+```
+
+Inspect the trace with three questions in mind:
+
+- Are the GPU kernels separated by visible idle gaps?
+- Do memory operations appear in the critical path?
+- Is the host side primarily launching work, or is it waiting on synchronization?
+
+If time is limited, this is the first profiler we recommend running because it gives the clearest overall picture of the execution.
+
+## Exercise 3: GPU hotspots
+
+Collect a kernel trace:
+
+```bash
+./get_gpu_hotspots.sh
+```
+
+If the result is a ROCm 7.x database, extract a summary with:
+
+```bash
+rocpd2csv -i <database> -o kernel_stats.csv
+rocpd summary -i <database> --region-categories KERNEL
+```
+
+From this output, record:
+
+- total GPU time
+- number of kernel dispatches
+- number of unique kernels
+- the top three kernels by time
+
+For the CNN workloads in this directory, the dominant kernels are often convolution and batch normalization kernels from MIOpen. The exact names matter less than their share of the total time.
+
+The plot below comes from an actual `get_gpu_hotspots.sh` run in the container and gives one compact example of the hotspot distribution.
+
+![pytorch_microbench GPU hotspots from validated container run](images/pytorch_microbench_gpu_hotspots.png)
+
+## Exercise 4: Performance metrics
+
+Collect a `rocprof-compute` report:
+
+```bash
+./get_performance_metrics.sh
+```
+
+Then list the detected kernels and dispatches:
+
+```bash
+rocprof-compute analyze -p <workload_dir> --list-stats
+```
+
+After selecting a dispatch, generate a focused report:
+
+```bash
+rocprof-compute analyze -p <workload_dir> --dispatch <dispatch_id>
+rocprof-compute analyze -p <workload_dir> --dispatch <dispatch_id> --block 2.1.15 6.2.7
+rocprof-compute analyze -p <workload_dir> --dispatch <dispatch_id> --block 16.1 17.1
+```
+
+This exercise is most useful after Exercise 3 because it is easier to interpret the report when there is already a target kernel in mind. The occupancy-oriented block selection and memory-oriented block selection mirror the usage pattern in the `rocprof-compute` training examples elsewhere in this repository.
+
+On systems where `rocprof-compute` hardware-counter collection is unavailable, treat this exercise as optional and continue with the remaining steps.
+
+Questions to answer:
+
+- Does the kernel appear limited by memory traffic or by arithmetic throughput?
+- Is occupancy likely to be the issue?
+- Does the report reinforce what was seen in the runtime trace?
+
+## Exercise 5: System trace
+
+Collect a system trace:
+
+```bash
+./get_rocprof_sys.sh
+```
+
+Open the resulting `.proto` file in Perfetto and compare it with the runtime trace from Exercise 2. The goal is not to replace the runtime trace, but to see whether the broader system view changes the interpretation of the run.
+
+If the system-level view is not needed for the first pass, it is reasonable to stop after Exercise 4 and return to `rocprof-sys` later.
+ +## Follow-up variations + +After the default case has been studied, try one variable at a time: + +```bash +python micro_benchmarking_pytorch.py --network densenet121 --batch-size 64 --iterations 10 --fp16 1 +python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 --compile +``` + +For each variation, compare: + +- throughput +- dominant kernels +- trace shape +- whether the same profiler workflow still answers the main performance questions + +## Closing remark + +If only a short training exercise is desired, Exercises 1 through 3 are sufficient. They provide a complete path from benchmark run to trace to hotspot identification, which is usually enough to begin a more detailed performance study. diff --git a/MLExamples/pytorch_microbench/PROFILING_SCRIPTS.md b/MLExamples/pytorch_microbench/PROFILING_SCRIPTS.md new file mode 100644 index 00000000..d7c0d1fd --- /dev/null +++ b/MLExamples/pytorch_microbench/PROFILING_SCRIPTS.md @@ -0,0 +1,94 @@ +# PyTorch Micro-Benchmark Profiling Scripts + +The `README.md` file in this directory is the primary walkthrough and the only full tutorial. This note is only a short reference to the profiling scripts and their outputs. 
+ +## Default workload + +Unless modified, the scripts profile the following command: + +```bash +python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 +``` + +The scripts can be retargeted without editing them: + +- `PYTORCH_MICROBENCH_NETWORK`: override the model name +- `PYTORCH_MICROBENCH_BATCH_SIZE`: override the batch size +- `PYTORCH_MICROBENCH_ITERATIONS`: override the iteration count +- `PYTORCH_MICROBENCH_EXTRA_ARGS`: append simple benchmark flags such as `--fp16 1` or `--compile` +- `PYTORCH_MICROBENCH_OUTPUT_ROOT`: write results under a different root directory +- `PYTORCH_MICROBENCH_PYTHON`: select a specific Python executable + +Example: + +```bash +PYTORCH_MICROBENCH_NETWORK=densenet121 \ +PYTORCH_MICROBENCH_EXTRA_ARGS="--fp16 1" \ +./get_trace.sh +``` + +## Script summary + +| Script | Tool | Main output | Primary use | +|--------|------|-------------|-------------| +| `get_trace.sh` | `rocprofv3 --runtime-trace` | `profiling_results/trace_*` | Timeline view of host activity, kernel launches, and memory traffic | +| `get_gpu_hotspots.sh` | `rocprofv3 --kernel-trace` | `profiling_results/gpu_hotspots_*` | Kernel counts, total GPU time, and hotspot identification | +| `get_performance_metrics.sh` | `rocprof-compute profile` | `profiling_results/performance_metrics_*` | Hardware counter analysis for selected dispatches | +| `get_rocprof_sys.sh` | `rocprof-sys-run --profile --trace` | `profiling_results/rocprof_sys_*` | System-level view in Perfetto | + +In a typical first pass through the example, `get_trace.sh`, `get_gpu_hotspots.sh`, and `get_rocprof_sys.sh` should produce the expected trace or summary outputs. If hardware-counter collection is unsupported on the local system, `get_performance_metrics.sh` exits early with a short explanation. + +## ROCm 7.x note + +For ROCm 7.x, `get_gpu_hotspots.sh` commonly produces a SQLite database rather than a CSV file. 
Two useful follow-up commands are:
+
+```bash
+rocpd2csv -i <database> -o kernel_stats.csv
+rocpd summary -i <database> --region-categories KERNEL
+```
+
+For `get_trace.sh`, if a database is produced instead of a `.pftrace` file, convert it with:
+
+```bash
+rocpd2pftrace -i <database> -o trace.pftrace
+```
+
+## `rocprof-compute` note
+
+The performance-metrics script follows the same pattern used in the `rocprof-compute` training examples in this repository:
+
+1. collect a workload
+2. list kernels and dispatches
+3. analyze a selected dispatch with targeted metric blocks
+
+The first post-processing command should therefore be:
+
+```bash
+rocprof-compute analyze -p <workload_dir> --list-stats
+```
+
+After selecting a dispatch, two useful analysis commands are:
+
+```bash
+rocprof-compute analyze -p <workload_dir> --dispatch <dispatch_id> --block 2.1.15 6.2.7
+rocprof-compute analyze -p <workload_dir> --dispatch <dispatch_id> --block 16.1 17.1
+```
+
+Counter availability is best on supported Instinct class GPUs. Other systems may expose only a subset of the metrics, or no supported counter collection path at all.
+
+When counter collection is unsupported on the local system, the script reports that condition explicitly and exits without attempting collection.
+
+The script accepts an optional mode argument:
+
+- `no-roof`: default tutorial mode; collect counters only and skip the roofline stage
+- `full`: collect the default counters and roofline data
+- `roof-only`: collect roofline data only
+
+## Recommended order
+
+For a first pass through the example, we suggest:
+
+1. `get_trace.sh`
+2. `get_gpu_hotspots.sh`
+3. `get_performance_metrics.sh`
+4. 
`get_rocprof_sys.sh` diff --git a/MLExamples/pytorch_microbench/README.md b/MLExamples/pytorch_microbench/README.md new file mode 100644 index 00000000..9509cdc4 --- /dev/null +++ b/MLExamples/pytorch_microbench/README.md @@ -0,0 +1,272 @@ +# ML Example: PyTorch Micro-Benchmarking with ROCm Profiling + +In this example we consider a compact PyTorch workload that is useful for learning the ROCm profiling tools on a model that is small enough to run quickly, but large enough to produce non-trivial GPU activity. The driver runs forward and backward passes for common CNN architectures and reports throughput in images per second. The scripts in this directory use `resnet50`, batch size `64`, and `10` iterations so that the outputs from the different profilers can be compared on the same workload. + +The purpose of the directory is straightforward. We begin with one reproducible benchmark run, then examine the same execution with a timeline trace, a kernel summary, a hardware counter report, and a system trace. In that sense, the example is meant to be read and run in the same spirit as the GhostExchange materials: one workload, a small number of commands, and a clear progression from run to analysis. + +This `README.md` file is the primary walkthrough for the directory. The other markdown files are short reference notes and optional training checklists, not separate full tutorials. 
+ +## Overview of the benchmark + +The benchmark is controlled with the following arguments: + +- `--network <network>`: network to benchmark, for example `resnet50`, `resnet101`, `densenet121`, `vgg16`, or `alexnet` +- `--batch-size <batch_size>`: global mini-batch size +- `--iterations <iterations>`: number of timed iterations +- `--fp16 <0|1>`: enable mixed precision when supported +- `--compile`: enable `torch.compile` +- `--compileContext <dict_string>`: pass compile options as a Python dictionary string +- `--distributed_dataparallel`: run with distributed data parallel +- `--device_ids <ids>`: comma-separated GPU ids for distributed runs + +## Profiling scripts in this directory + +The directory contains four short profiling scripts: + +- `get_trace.sh`: collect a runtime trace with `rocprofv3` +- `get_gpu_hotspots.sh`: collect a kernel trace and hotspot summary with `rocprofv3` +- `get_performance_metrics.sh`: collect hardware counter reports with `rocprof-compute` +- `get_rocprof_sys.sh`: collect a system trace with `rocprof-sys` + +We recommend using them in the order listed above. The runtime trace shows the overall execution flow. The kernel trace identifies the dominant GPU kernels. The compute report is most useful once there is a narrower question about occupancy, memory traffic, or arithmetic intensity. + +All four scripts use the same default workload, but they can be retargeted without editing the files. The common overrides are `PYTORCH_MICROBENCH_NETWORK`, `PYTORCH_MICROBENCH_BATCH_SIZE`, `PYTORCH_MICROBENCH_ITERATIONS`, `PYTORCH_MICROBENCH_EXTRA_ARGS`, and `PYTORCH_MICROBENCH_OUTPUT_ROOT`. For example, `PYTORCH_MICROBENCH_EXTRA_ARGS="--fp16 1" ./get_trace.sh` profiles the default trace workflow with mixed precision enabled. 
+ +## Running the benchmark + +Load the required modules: + +```bash +module load pytorch rocm +``` + +Run a baseline case: + +```bash +python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 +``` + +An example output from one run is shown below. The exact timing values depend on the model, GPU, ROCm version, and whether relevant caches are already warm: + +```text +INFO: running forward and backward for warmup. +INFO: running the benchmark.. +OK: finished running benchmark.. +--------------------SUMMARY-------------------------- +Microbenchmark for network : resnet50 +Num devices: 1 +Dtype: FP32 +Mini batch size [img] : 64 +Time per mini-batch : 0.1770334005355835 +Throughput [img/sec] : 361.51370197024534 +``` + +The main quantity to record from this run is the throughput. For profiling, it is also useful to note the problem size and whether `torch.compile` or `--fp16 1` was enabled. The values should be treated as measurements for the current system, not as targets that should match across devices. + +The plot below was generated from fresh container runs with `generate_example_plots.py`, using the same commands shown in this README. + +![pytorch_microbench example measurements from validated container runs](images/pytorch_microbench_example_runs.png) + +## Runtime trace with `get_trace.sh` + +Run the script: + +```bash +./get_trace.sh +``` + +The script writes a timestamped directory under `profiling_results/trace_*`. On ROCm 6.x and 7.x it requests Perfetto output directly, so the main file to look for is a `.pftrace` file. 
Open it in Perfetto: + +```text +https://ui.perfetto.dev/ +``` + +A successful run prints the generated trace path explicitly, for example: + +```text +Perfetto trace file: profiling_results/trace_20260321_231808//19455_results.pftrace +Open it in Perfetto UI: https://ui.perfetto.dev/ +``` + +When reading the trace, the first questions to ask are: + +- where the host spends time between launches +- whether GPU kernels run back-to-back or with visible gaps +- how much explicit memory traffic appears relative to compute work +- whether synchronization points serialize the execution + +On systems that expose more than one agent, `rocprofv3` may print warnings about one of the agents before the trace starts. In many cases the trace is still produced successfully, so the first check should be whether the expected output file was generated. + +If a ROCm 7.x database is generated instead of a Perfetto trace, convert it with: + +```bash +rocpd2pftrace -i -o trace.pftrace +``` + +## GPU hotspots with `get_gpu_hotspots.sh` + +Run the script: + +```bash +./get_gpu_hotspots.sh +``` + +The script writes to `profiling_results/gpu_hotspots_*`. On ROCm 6.x the main output is usually a CSV file. On ROCm 7.x the output is typically a SQLite database. For ROCm 7.x, the two most useful follow-up commands are: + +```bash +rocpd2csv -i -o kernel_stats.csv +rocpd summary -i --region-categories KERNEL +``` + +For this benchmark, the quantities that usually matter first are: + +- total GPU time +- number of kernel dispatches +- number of unique kernels +- the few kernels that dominate the total time + +For `resnet50`, the dominant entries are often convolution, batch normalization, and elementwise kernels from MIOpen and PyTorch. The exact names vary across hardware and ROCm versions, but the methodology does not. 
+ +One example kernel summary from this workflow produced the following dominant entries: + +- `miopenSp3AsmConv_v30_3_1_gfx11_fp32_f2x3_stride1`: `763.126806 ms` +- `MIOpenBatchNormBwdSpatial`: `167.792579 ms` +- `ATen vectorized elementwise kernel`: `120.853175 ms` + +The next plot was generated from the `get_gpu_hotspots.sh` run that produced the example summary above. + +![pytorch_microbench GPU hotspots from validated container run](images/pytorch_microbench_gpu_hotspots.png) + +## Performance metrics with `get_performance_metrics.sh` + +Run the script in its default mode: + +```bash +./get_performance_metrics.sh +``` + +The script writes a timestamped workload directory under `profiling_results/performance_metrics_*`. The default tutorial mode uses `--no-roof`, which keeps the run focused on detailed counter collection. Use the follow-up analysis commands to inspect specific dispatches and metric blocks. + +Treat this as a short workshop sequence: + +```bash +rocprof-compute analyze -p <workload_path> --list-stats +rocprof-compute analyze -p <workload_path> --dispatch <dispatch_id> +rocprof-compute analyze -p <workload_path> --dispatch <dispatch_id> --block 2.1.15 6.2.7 +rocprof-compute analyze -p <workload_path> --dispatch <dispatch_id> --block 16.1 17.1 +``` + +Use `--list-stats` to find the kernel and dispatch to study, then inspect that dispatch in the default report. The `2.1.15 6.2.7` blocks are useful for occupancy and LDS-related limits. The `16.1 17.1` blocks are useful for L1 and L2 speed-of-light metrics. + +This step is most useful after `get_trace.sh` and `get_gpu_hotspots.sh` have identified a kernel worth studying. + +The script also supports explicit modes: + +```bash +./get_performance_metrics.sh +./get_performance_metrics.sh full +./get_performance_metrics.sh roof-only +``` + +The default mode is `no-roof`. Use `full` when the roofline stage is needed, and use `roof-only` when the immediate question is where the kernel falls on the roofline. + +`rocprof-compute` has the best counter coverage on supported Instinct class GPUs. 
On other systems, some counters may be unavailable, or the collection path may be unsupported. In that case the script exits early with a short explanation, so this step should be treated as optional unless the tutorial is running on a system with supported hardware-counter collection. + +One example of that skip path is: + +```text +Skipping rocprof-compute profiling for pytorch_microbench... +Detected GPU architecture: gfx1100 +rocprof-compute hardware-counter collection currently requires a supported Instinct GPU +Use get_trace.sh and get_gpu_hotspots.sh on this system instead. +``` + +## System trace with `get_rocprof_sys.sh` + +Run the script: + +```bash +./get_rocprof_sys.sh +``` + +The script writes to `profiling_results/rocprof_sys_*`. Open the resulting `.proto` file in Perfetto: + +```text +https://ui.perfetto.dev/ +``` + +A successful run prints the trace file directly, for example: + +```text +Perfetto trace file: profiling_results/rocprof_sys_20260321_231923/rocprofsys-python-output//perfetto-trace-19832.proto +Open it in Perfetto UI: https://ui.perfetto.dev/ +``` + +This tool is useful when the question is broader than kernel timing alone, for example when the interaction between the Python runtime, libraries, and the GPU execution needs to be examined. + +On some systems, `rocprof-sys` may print warnings related to host performance-counter permissions or device telemetry before continuing. The important check is whether the run completes and produces the expected `.proto` file. The script prints that path explicitly so that the file can be opened directly. 
+ +## Variations to try + +Once the baseline case has been examined, the following variations are reasonable next steps: + +- change the network, for example `--network densenet121` or `--network vgg16` +- enable mixed precision with `--fp16 1` +- enable compilation with `--compile` +- run a distributed case with `torchrun` + +For example: + +```bash +python micro_benchmarking_pytorch.py --network densenet121 --batch-size 64 --iterations 10 --fp16 1 +python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 --compile +torchrun --nproc-per-node <num_gpus> micro_benchmarking_pytorch.py --network resnet50 --batch-size 128 +``` + +For distributed runs, set `<num_gpus>` to the number of visible GPUs on the system. If only one GPU is available, omit the distributed example and stay with the single-device path. + +`--fp16 1` and `--compile` are useful follow-up comparisons, but the direction and magnitude of the change will depend on the system and workload. For `--compile`, use a larger iteration count if the goal is steady-state performance rather than functionality; with only `10` iterations, startup effects may still influence the result. + +Example outputs from two such follow-up runs are shown below: + +```text +$ python micro_benchmarking_pytorch.py --network densenet121 --batch-size 64 --iterations 10 --fp16 1 +INFO: running forward and backward for warmup. +INFO: running the benchmark.. +OK: finished running benchmark.. +--------------------SUMMARY-------------------------- +Microbenchmark for network : densenet121 +Num devices: 1 +Dtype: FP16 +Mini batch size [img] : 64 +Time per mini-batch : 0.1000108003616333 +Throughput [img/sec] : 639.9308851502005 + +$ python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 --compile +INFO: running forward and backward for warmup. +INFO: running the benchmark.. +OK: finished running benchmark.. 
+--------------------SUMMARY-------------------------- +Microbenchmark for network : resnet50 +Num devices: 1 +Dtype: FP32 +Mini batch size [img] : 64 +Time per mini-batch : 0.1676210880279541 +Throughput [img/sec] : 381.8135340424872 +``` + +## Performance note + +On systems that use MIOpen, it can be useful to allow the library to tune and cache convolution choices before comparing results: + +```bash +export MIOPEN_FIND_ENFORCE=3 +python micro_benchmarking_pytorch.py --network resnet50 --batch-size 64 --iterations 10 +``` + +## Additional resources + +- [`generate_example_plots.py`](generate_example_plots.py): regenerates the example plots from container logs +- rocprofv3: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/develop/how-to/using-rocprofv3.html +- rocpd tools: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/develop/how-to/using-rocpd-output-format.html +- Perfetto UI: https://ui.perfetto.dev/ diff --git a/MLExamples/inference_benchmark/TorchTensorOpsBench/README.md b/MLExamples/pytorch_microbench/TorchTensorOpsBench/README.md similarity index 100% rename from MLExamples/inference_benchmark/TorchTensorOpsBench/README.md rename to MLExamples/pytorch_microbench/TorchTensorOpsBench/README.md diff --git a/MLExamples/inference_benchmark/TorchTensorOpsBench/run.sh b/MLExamples/pytorch_microbench/TorchTensorOpsBench/run.sh similarity index 100% rename from MLExamples/inference_benchmark/TorchTensorOpsBench/run.sh rename to MLExamples/pytorch_microbench/TorchTensorOpsBench/run.sh diff --git a/MLExamples/inference_benchmark/TorchTensorOpsBench/torch_tensor_ops_bench.py b/MLExamples/pytorch_microbench/TorchTensorOpsBench/torch_tensor_ops_bench.py similarity index 100% rename from MLExamples/inference_benchmark/TorchTensorOpsBench/torch_tensor_ops_bench.py rename to MLExamples/pytorch_microbench/TorchTensorOpsBench/torch_tensor_ops_bench.py diff --git a/MLExamples/inference_benchmark/fp16util.py b/MLExamples/pytorch_microbench/fp16util.py similarity 
index 100% rename from MLExamples/inference_benchmark/fp16util.py rename to MLExamples/pytorch_microbench/fp16util.py diff --git a/MLExamples/pytorch_microbench/generate_example_plots.py b/MLExamples/pytorch_microbench/generate_example_plots.py new file mode 100644 index 00000000..84d9a8b6 --- /dev/null +++ b/MLExamples/pytorch_microbench/generate_example_plots.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Generate example tutorial plots from validated pytorch_microbench runs.""" + +from __future__ import annotations + +import argparse +import csv +import os +import re +from collections import defaultdict +from pathlib import Path + +os.environ.setdefault("MPLCONFIGDIR", "/tmp/matplotlib") + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt +import pandas as pd + + +REPO_ROOT = Path(__file__).resolve().parents[2] +RUNS = [ + ("baseline_resnet50_fp32.log", "ResNet50\nFP32"), + ("densenet121_fp16.log", "DenseNet121\nFP16"), + ("resnet50_compile.log", "ResNet50\ncompile"), +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate example plots from pytorch_microbench validation logs." 
+ ) + parser.add_argument( + "--log-dir", + type=Path, + default=Path("/tmp/pytorch_microbench_plot_runs_20260321"), + help="Directory containing benchmark and profiler logs", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("MLExamples/pytorch_microbench/images"), + help="Directory where plot images will be written", + ) + return parser.parse_args() + + +def require_match(pattern: str, text: str, context: str) -> str: + match = re.search(pattern, text) + if not match: + raise ValueError(f"Could not find pattern {pattern!r} in {context}") + return match.group(1) + + +def resolve_artifact_path(path_text: str) -> Path: + path = Path(path_text) + if path.exists(): + return path + if path_text.startswith("/workspace/"): + translated = REPO_ROOT / path.relative_to("/workspace") + if translated.exists(): + return translated + raise FileNotFoundError(f"Could not resolve artifact path: {path_text}") + + +def parse_benchmark_logs(log_dir: Path) -> pd.DataFrame: + rows = [] + for filename, label in RUNS: + log_path = log_dir / filename + text = log_path.read_text() + rows.append( + { + "label": label, + "network": require_match( + r"Microbenchmark for network : ([^\n]+)", text, str(log_path) + ), + "dtype": require_match(r"Dtype: ([^\n]+)", text, str(log_path)), + "time_per_batch": float( + require_match(r"Time per mini-batch : ([0-9.]+)", text, str(log_path)) + ), + "throughput": float( + require_match(r"Throughput \[img/sec\] : ([0-9.]+)", text, str(log_path)) + ), + } + ) + return pd.DataFrame(rows) + + +def shorten_kernel_name(name: str) -> str: + if name.startswith("void at::native::vectorized_elementwise_kernel"): + short = "ATen vectorized elementwise kernel" + elif name.startswith("Cijk_"): + short = "Tensile GEMM kernel" + else: + short = name + + if len(short) > 52: + short = short[:49] + "..." 
+ return short + + +def parse_hotspots(log_dir: Path, top_n: int = 8) -> pd.DataFrame: + log_path = log_dir / "get_gpu_hotspots.log" + text = log_path.read_text() + csv_path = resolve_artifact_path( + require_match(r"Kernel trace CSV: (.+_kernel_trace\.csv)", text, str(log_path)) + ) + + totals: defaultdict[str, float] = defaultdict(float) + with csv_path.open(newline="") as handle: + reader = csv.DictReader(handle) + for row in reader: + duration_ms = ( + int(row["End_Timestamp"]) - int(row["Start_Timestamp"]) + ) / 1_000_000.0 + totals[row["Kernel_Name"]] += duration_ms + + top = sorted(totals.items(), key=lambda item: item[1], reverse=True)[:top_n] + return pd.DataFrame( + { + "kernel_name": [name for name, _ in top], + "total_duration_ms": [duration for _, duration in top], + "short_name": [shorten_kernel_name(name) for name, _ in top], + } + ) + + +def add_bar_labels(ax: plt.Axes, values: pd.Series, fmt: str) -> None: + for idx, value in enumerate(values): + ax.text(idx, value, fmt.format(value), ha="center", va="bottom", fontsize=9) + + +def plot_benchmark_examples(df: pd.DataFrame, output_path: Path) -> None: + colors = ["#1f3c88", "#4f772d", "#c97b24"] + fig, axes = plt.subplots(1, 2, figsize=(11.5, 4.8), constrained_layout=True) + + axes[0].bar(df["label"], df["throughput"], color=colors) + axes[0].set_title("Throughput") + axes[0].set_ylabel("img/sec") + axes[0].grid(axis="y", alpha=0.2) + add_bar_labels(axes[0], df["throughput"], "{:.1f}") + + axes[1].bar(df["label"], df["time_per_batch"], color=colors) + axes[1].set_title("Time per mini-batch") + axes[1].set_ylabel("seconds") + axes[1].grid(axis="y", alpha=0.2) + add_bar_labels(axes[1], df["time_per_batch"], "{:.3f}") + + fig.suptitle( + "pytorch_microbench example measurements from validated container runs", + fontsize=14, + fontweight="bold", + ) + fig.savefig(output_path, dpi=180, bbox_inches="tight") + plt.close(fig) + + +def plot_hotspots(df: pd.DataFrame, output_path: Path) -> None: + plot_df = 
df.sort_values("total_duration_ms", ascending=True) + fig, ax = plt.subplots(figsize=(10.5, 5.5), constrained_layout=True) + ax.barh(plot_df["short_name"], plot_df["total_duration_ms"], color="#1f3c88") + ax.set_xlabel("Total duration (ms)") + ax.set_title("pytorch_microbench GPU hotspots from validated container run") + ax.grid(axis="x", alpha=0.2) + + for y, value in enumerate(plot_df["total_duration_ms"]): + ax.text(value, y, f" {value:.2f}", va="center", ha="left", fontsize=9) + + fig.savefig(output_path, dpi=180, bbox_inches="tight") + plt.close(fig) + + +def main() -> None: + args = parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) + + benchmark_df = parse_benchmark_logs(args.log_dir) + plot_benchmark_examples( + benchmark_df, + args.output_dir / "pytorch_microbench_example_runs.png", + ) + + hotspots_df = parse_hotspots(args.log_dir) + plot_hotspots( + hotspots_df, + args.output_dir / "pytorch_microbench_gpu_hotspots.png", + ) + + print(f"Wrote plots to {args.output_dir}") + + +if __name__ == "__main__": + main() diff --git a/MLExamples/pytorch_microbench/get_gpu_hotspots.sh b/MLExamples/pytorch_microbench/get_gpu_hotspots.sh new file mode 100755 index 00000000..d90dc2b5 --- /dev/null +++ b/MLExamples/pytorch_microbench/get_gpu_hotspots.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Script to identify pytorch_microbench GPU hotspots with rocprofv3. + +set -euo pipefail + +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +OUTPUT_DIR="$(make_output_dir gpu_hotspots)" + +echo "Starting rocprofv3 GPU hotspot collection for pytorch_microbench..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +else + echo "Warning: Could not detect ROCm version. Proceeding with default rocprofv3 behavior." 
+fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +rocprofv3 \ + --kernel-trace \ + --output-directory "$OUTPUT_DIR" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +echo "To analyze results:" +DB_FILE="$(find "$OUTPUT_DIR" -name "*.db" 2>/dev/null | head -1)" +CSV_FILE="$(find "$OUTPUT_DIR" -name "*_kernel_trace.csv" 2>/dev/null | head -1)" +AGENT_INFO_FILE="$(find "$OUTPUT_DIR" -name "*_agent_info.csv" 2>/dev/null | head -1)" + +if [ -n "$CSV_FILE" ]; then + echo " Kernel trace CSV: $CSV_FILE" +fi +if [ -n "$AGENT_INFO_FILE" ]; then + echo " Agent info CSV: $AGENT_INFO_FILE" +fi +if [ -n "$DB_FILE" ]; then + echo " SQLite database: $DB_FILE" + echo "" + echo " Export to CSV:" + echo " rocpd2csv -i \"$DB_FILE\" -o kernel_stats.csv" + echo "" + echo " Get kernel summary:" + echo " rocpd summary -i \"$DB_FILE\" --region-categories KERNEL" +fi +if [ -z "$CSV_FILE" ] && [ -z "$DB_FILE" ]; then + echo " WARNING: No ROCm profiler output file was detected under $OUTPUT_DIR" +fi diff --git a/MLExamples/pytorch_microbench/get_performance_metrics.sh b/MLExamples/pytorch_microbench/get_performance_metrics.sh new file mode 100755 index 00000000..450ee51c --- /dev/null +++ b/MLExamples/pytorch_microbench/get_performance_metrics.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Script to collect pytorch_microbench performance metrics with rocprof-compute. + +set -euo pipefail + +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/profile_common.sh" + +require_cmd rocprof-compute +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +MODE="${1:-no-roof}" +GPU_ARCH="" +SUPPORTED_ARCH_REGEX='^(gfx908|gfx90a|gfx940|gfx941|gfx942)$' + +if command -v rocminfo >/dev/null 2>&1; then + GPU_ARCH="$(rocminfo 2>/dev/null | awk '/^[[:space:]]+Name:[[:space:]]+gfx/ {print $2; exit}')" +fi + +if [ -n "$GPU_ARCH" ] && ! 
echo "$GPU_ARCH" | grep -Eq "$SUPPORTED_ARCH_REGEX"; then + echo "Skipping rocprof-compute profiling for pytorch_microbench..." + echo "Detected GPU architecture: $GPU_ARCH" + echo "rocprof-compute hardware-counter collection currently requires a supported Instinct GPU" + echo "(for example gfx908, gfx90a, gfx940, gfx941, or gfx942)." + echo "Use get_trace.sh and get_gpu_hotspots.sh on this system instead." + exit 0 +fi + +OUTPUT_DIR="$(make_output_dir performance_metrics)" +WORKLOAD_NAME="microbench" +PROFILE_ROOT="$OUTPUT_DIR/$WORKLOAD_NAME" + +case "$MODE" in + full) + PROFILE_ARGS=(--kernel-names) + MODE_DESCRIPTION="full profile (counters plus roofline stage)" + ;; + roof-only) + PROFILE_ARGS=(--roof-only --kernel-names) + MODE_DESCRIPTION="roofline-only profile" + ;; + no-roof) + PROFILE_ARGS=(--no-roof --kernel-names) + MODE_DESCRIPTION="counter-only profile without roofline collection" + ;; + *) + echo "Usage: $0 [no-roof|full|roof-only]" >&2 + echo " no-roof collect counters only and skip the roofline stage" >&2 + echo " full collect the default counter set and roofline data" >&2 + echo " roof-only collect roofline data only and label roofline kernels" >&2 + exit 1 + ;; +esac + +echo "Starting rocprof-compute performance-metric collection for pytorch_microbench..." +if [ -n "$GPU_ARCH" ]; then + echo "Detected GPU architecture: $GPU_ARCH" +fi +echo "Mode: $MODE_DESCRIPTION" +echo "Workload name: $WORKLOAD_NAME" +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" +echo "Note: rocprof-compute may replay kernels multiple times to collect all requested counters." +echo "" + +rocprof-compute profile \ + --name "$WORKLOAD_NAME" \ + --path "$PROFILE_ROOT" \ + "${PROFILE_ARGS[@]}" \ + -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! 
Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "To analyze results:" +ANALYZE_PATH="" +for marker in pmc_perf.csv roofline.csv sysinfo.csv; do + MARKER_FILE="$(find "$PROFILE_ROOT" -name "$marker" 2>/dev/null | head -1)" + if [ -n "$MARKER_FILE" ]; then + ANALYZE_PATH="$(dirname "$MARKER_FILE")" + break + fi +done + +if [ -n "$ANALYZE_PATH" ]; then + echo " Raw data directory: $ANALYZE_PATH" + echo "" + echo " 1. List detected kernels and dispatches:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --list-stats" + if [ "$MODE" != "roof-only" ]; then + echo "" + echo " 2. Inspect one dispatch in the default report:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch " + echo "" + echo " 3. Check occupancy and LDS-related limits:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 2.1.15 6.2.7" + echo "" + echo " 4. Check L1/L2 memory speed-of-light metrics:" + echo " rocprof-compute analyze -p \"$ANALYZE_PATH\" --dispatch --block 16.1 17.1" + else + echo "" + echo " Roofline-only mode does not collect the full counter set." + echo " Re-run with '$0 full' or '$0 no-roof' for detailed block analysis." + fi +else + echo " WARNING: Could not detect the rocprof-compute raw data directory under $PROFILE_ROOT" + echo " Inspect the generated workload tree and use that path with 'rocprof-compute analyze -p'." +fi +echo "" +echo "For help on analysis options:" +echo " rocprof-compute analyze --help" diff --git a/MLExamples/pytorch_microbench/get_rocprof_sys.sh b/MLExamples/pytorch_microbench/get_rocprof_sys.sh new file mode 100755 index 00000000..4b5475a7 --- /dev/null +++ b/MLExamples/pytorch_microbench/get_rocprof_sys.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Script to profile pytorch_microbench with rocprof-sys. 
+ +set -euo pipefail + +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/profile_common.sh" + +require_cmd rocprof-sys-run +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +OUTPUT_DIR="$(make_output_dir rocprof_sys)" +mkdir -p "$OUTPUT_DIR" + +echo "Starting rocprof-sys profiling for pytorch_microbench..." +echo "Output directory: $OUTPUT_DIR" +print_workload_summary +echo "" + +pushd "$OUTPUT_DIR" >/dev/null +rocprof-sys-run \ + --profile \ + --trace \ + -- "${BENCHMARK_CMD[@]}" +popd >/dev/null + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 4 +echo "" +echo "Open the trace in Perfetto:" +PROTO_FILE="$(find "$OUTPUT_DIR" -name "*.proto" 2>/dev/null | head -1)" +if [ -n "$PROTO_FILE" ]; then + echo " Perfetto trace file: $PROTO_FILE" + echo " Open it in Perfetto UI: https://ui.perfetto.dev/" +else + echo " WARNING: No .proto file was found under $OUTPUT_DIR" + echo " Inspect the output tree and open the generated trace in Perfetto UI if present." +fi diff --git a/MLExamples/pytorch_microbench/get_trace.sh b/MLExamples/pytorch_microbench/get_trace.sh new file mode 100755 index 00000000..0e822758 --- /dev/null +++ b/MLExamples/pytorch_microbench/get_trace.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# Script to profile pytorch_microbench with rocprofv3 runtime trace. + +set -euo pipefail + +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/profile_common.sh" + +require_cmd rocprofv3 +require_cmd "$PYTHON_BIN" +ensure_benchmark_script +build_benchmark_cmd + +ROCM_VERSION="$(detect_rocm_version)" +ROCM_MAJOR="$(rocm_major_from_version "$ROCM_VERSION")" +OUTPUT_DIR="$(make_output_dir trace)" + +echo "Starting rocprofv3 runtime trace profiling for pytorch_microbench..." +if [ -n "$ROCM_VERSION" ]; then + echo "Detected ROCm version: $ROCM_VERSION" +else + echo "Warning: Could not detect ROCm version. Proceeding without version-specific assumptions." 
+fi +echo "Output directory: $OUTPUT_DIR" +print_workload_summary + +TRACE_CMD=(rocprofv3 --runtime-trace --output-directory "$OUTPUT_DIR") +if [ "$ROCM_MAJOR" = "6" ] || [ "$ROCM_MAJOR" = "7" ]; then + echo "Using explicit Perfetto output for ROCm $ROCM_MAJOR.x." + TRACE_CMD+=(--output-format pftrace) +fi + +echo "" +echo "Collecting full runtime trace (API calls, kernels, memory operations, and synchronization events)..." +echo "" + +"${TRACE_CMD[@]}" -- "${BENCHMARK_CMD[@]}" + +echo "" +echo "Profiling complete! Results saved to: $OUTPUT_DIR" +echo "" +echo "Generated files:" +print_generated_files "$OUTPUT_DIR" 3 +echo "" + +PFTRACE_FILE="$(find "$OUTPUT_DIR" -name "*.pftrace" | head -1)" +DB_FILE="$(find "$OUTPUT_DIR" -name "*.db" | head -1)" + +if [ -n "$PFTRACE_FILE" ]; then + echo "Perfetto trace file: $PFTRACE_FILE" + echo "Size: $(du -h "$PFTRACE_FILE" | cut -f1)" + echo "Open it in Perfetto UI: https://ui.perfetto.dev/" +elif [ -n "$DB_FILE" ]; then + echo "SQLite database found: $DB_FILE" + echo "Convert it to Perfetto format with:" + echo " rocpd2pftrace -i \"$DB_FILE\" -o trace.pftrace" +else + echo "WARNING: No .pftrace or .db file was found under $OUTPUT_DIR" +fi +echo "" diff --git a/MLExamples/pytorch_microbench/images/pytorch_microbench_example_runs.png b/MLExamples/pytorch_microbench/images/pytorch_microbench_example_runs.png new file mode 100644 index 00000000..97ecdf1f Binary files /dev/null and b/MLExamples/pytorch_microbench/images/pytorch_microbench_example_runs.png differ diff --git a/MLExamples/pytorch_microbench/images/pytorch_microbench_gpu_hotspots.png b/MLExamples/pytorch_microbench/images/pytorch_microbench_gpu_hotspots.png new file mode 100644 index 00000000..2a3a9bf9 Binary files /dev/null and b/MLExamples/pytorch_microbench/images/pytorch_microbench_gpu_hotspots.png differ diff --git a/MLExamples/inference_benchmark/micro_benchmarking_pytorch.py b/MLExamples/pytorch_microbench/micro_benchmarking_pytorch.py similarity index 100% 
rename from MLExamples/inference_benchmark/micro_benchmarking_pytorch.py rename to MLExamples/pytorch_microbench/micro_benchmarking_pytorch.py diff --git a/MLExamples/pytorch_microbench/profile_common.sh b/MLExamples/pytorch_microbench/profile_common.sh new file mode 100644 index 00000000..79807c67 --- /dev/null +++ b/MLExamples/pytorch_microbench/profile_common.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Shared helpers for the pytorch_microbench profiling scripts. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_SCRIPT="$SCRIPT_DIR/micro_benchmarking_pytorch.py" +OUTPUT_ROOT="${PYTORCH_MICROBENCH_OUTPUT_ROOT:-$SCRIPT_DIR/profiling_results}" +NETWORK="${PYTORCH_MICROBENCH_NETWORK:-resnet50}" +BATCH_SIZE="${PYTORCH_MICROBENCH_BATCH_SIZE:-64}" +ITERATIONS="${PYTORCH_MICROBENCH_ITERATIONS:-10}" +EXTRA_BENCHMARK_ARGS_RAW="${PYTORCH_MICROBENCH_EXTRA_ARGS:-}" +EXTRA_BENCHMARK_ARGS=() + +if [ -n "$EXTRA_BENCHMARK_ARGS_RAW" ]; then + read -r -a EXTRA_BENCHMARK_ARGS <<< "$EXTRA_BENCHMARK_ARGS_RAW" +fi + +if [ -n "${PYTORCH_MICROBENCH_PYTHON:-}" ]; then + PYTHON_BIN="$PYTORCH_MICROBENCH_PYTHON" +elif command -v python >/dev/null 2>&1; then + PYTHON_BIN="python" +else + PYTHON_BIN="python3" +fi + +require_cmd() { + local cmd="$1" + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "Error: required command '$cmd' was not found in PATH." >&2 + exit 1 + fi +} + +ensure_benchmark_script() { + if [ ! -f "$BENCHMARK_SCRIPT" ]; then + echo "Error: benchmark script not found at '$BENCHMARK_SCRIPT'." 
>&2 + exit 1 + fi +} + +detect_rocm_version() { + local version="" + local hip_version="" + + if command -v rocminfo >/dev/null 2>&1; then + version=$(rocminfo 2>/dev/null | awk '/ROCm Version/ {print $3; exit}') + fi + + if [ -z "$version" ] && [ -n "${ROCM_PATH:-}" ] && [ -f "$ROCM_PATH/.info/version" ]; then + version="$(cat "$ROCM_PATH/.info/version")" + fi + + if [ -z "$version" ] && command -v hipcc >/dev/null 2>&1; then + hip_version=$(hipcc --version 2>/dev/null | awk '/HIP version/ {print $3; exit}') + if [ -n "$hip_version" ]; then + version="$hip_version" + fi + fi + + printf '%s\n' "$version" +} + +rocm_major_from_version() { + local version="$1" + if [ -n "$version" ]; then + printf '%s\n' "${version%%.*}" + else + printf '%s\n' "" + fi +} + +make_output_dir() { + local prefix="$1" + local timestamp + local output_dir + timestamp="$(date +%Y%m%d_%H%M%S)" + mkdir -p "$OUTPUT_ROOT" + output_dir="$OUTPUT_ROOT/${prefix}_${timestamp}" + mkdir -p "$output_dir" + printf '%s\n' "$output_dir" +} + +build_benchmark_cmd() { + BENCHMARK_CMD=( + "$PYTHON_BIN" + "$BENCHMARK_SCRIPT" + --network "$NETWORK" + --batch-size "$BATCH_SIZE" + --iterations "$ITERATIONS" + "${EXTRA_BENCHMARK_ARGS[@]}" + ) +} + +print_workload_summary() { + echo "Workload:" + echo " network: $NETWORK" + echo " batch size: $BATCH_SIZE" + echo " iterations: $ITERATIONS" + echo " python: $PYTHON_BIN" + if [ "${#EXTRA_BENCHMARK_ARGS[@]}" -gt 0 ]; then + echo " extra args: ${EXTRA_BENCHMARK_ARGS[*]}" + fi +} + +print_generated_files() { + local output_dir="$1" + local maxdepth="${2:-3}" + + if ! 
find "$output_dir" -maxdepth "$maxdepth" -type f | grep -q .; then + echo " No files found under $output_dir" + return + fi + + while IFS= read -r file; do + ls -lh "$file" + done < <(find "$output_dir" -maxdepth "$maxdepth" -type f | sort) +} diff --git a/MLExamples/inference_benchmark/shufflenet.py b/MLExamples/pytorch_microbench/shufflenet.py similarity index 100% rename from MLExamples/inference_benchmark/shufflenet.py rename to MLExamples/pytorch_microbench/shufflenet.py diff --git a/MLExamples/inference_benchmark/shufflenet_v2.py b/MLExamples/pytorch_microbench/shufflenet_v2.py similarity index 100% rename from MLExamples/inference_benchmark/shufflenet_v2.py rename to MLExamples/pytorch_microbench/shufflenet_v2.py diff --git a/MLExamples/inference_benchmark/xception.py b/MLExamples/pytorch_microbench/xception.py similarity index 100% rename from MLExamples/inference_benchmark/xception.py rename to MLExamples/pytorch_microbench/xception.py