diff --git a/kernel/src/drivers/mod.rs b/kernel/src/drivers/mod.rs
index 4539b2a1..abb62270 100644
--- a/kernel/src/drivers/mod.rs
+++ b/kernel/src/drivers/mod.rs
@@ -122,6 +122,10 @@ pub fn init() -> usize {
                         serial_println!("[drivers] VirGL 3D acceleration active");
                         crate::graphics::set_compositor_backend(crate::graphics::CompositorBackend::VirGL);
                         serial_println!("[drivers] Compositor backend: VirGL");
+                        // Enable yielding during GPU command waits now that init is complete.
+                        // During runtime, GPU poll loops yield to the scheduler instead of
+                        // spinning, letting other tasks run during ~3.4ms GPU processing.
+                        virtio::gpu_pci::enable_gpu_yield();
                     }
                     Err(e) => serial_println!("[drivers] VirGL init skipped: {}", e),
                 }
diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs
index 4e764435..81f816cc 100644
--- a/kernel/src/drivers/virtio/gpu_pci.rs
+++ b/kernel/src/drivers/virtio/gpu_pci.rs
@@ -924,6 +924,69 @@ static GPU_IRQ: AtomicU32 = AtomicU32::new(0);
 /// Set by the interrupt handler to wake the WFI loop in send_command().
 static GPU_CMD_COMPLETE: AtomicBool = AtomicBool::new(false);
 
+/// Whether GPU command waits should yield to the scheduler instead of spinning.
+/// False during init (single-threaded, scheduler may not be ready).
+/// True after init completes (compositor runtime — yield lets other tasks run
+/// during the ~3.4ms GPU processing time for SUBMIT_3D).
+static GPU_YIELD_ENABLED: AtomicBool = AtomicBool::new(false);
+
+/// Accumulated CNTVCT_EL0 ticks spent sleeping (blocked) during GPU command waits.
+/// Used by ksyscall-perf (which resets it to 0) to separate sleep time from CPU time.
+static GPU_SLEEP_TICKS: AtomicU64 = AtomicU64::new(0);
+
+/// Separate sleep tick counter for gpu-phases reporting (same ticks, same increments).
+/// ksyscall-perf swaps GPU_SLEEP_TICKS to 0 on its own schedule;
+/// gpu-phases needs its own counter to avoid interference.
+static GPU_SLEEP_TICKS_PHASES: AtomicU64 = AtomicU64::new(0);
+
+/// Thread ID of the thread currently blocked waiting for GPU command completion
+/// (0 = no waiter). Set before blocking in send_command_3desc, cleared after waking.
+/// The GPU interrupt handler uses this to wake the thread immediately.
+static GPU_WAITING_THREAD: AtomicU64 = AtomicU64::new(0);
+
+/// Enable yielding during GPU command waits and, when GPU_IRQ is non-zero, MSI-X wake.
+/// Called after GPU init completes, when the scheduler is fully running.
+pub fn enable_gpu_yield() {
+    GPU_YIELD_ENABLED.store(true, Ordering::Release);
+
+    // Enable GPU MSI-X SPI now that VirGL init is complete (aarch64 builds only).
+    // During init, the SPI is configured but not enabled to avoid interrupt storms.
+    let irq = GPU_IRQ.load(Ordering::Relaxed);
+    if irq != 0 {
+        #[cfg(target_arch = "aarch64")]
+        {
+            use crate::arch_impl::aarch64::gic;
+
+            // Assign VirtIO-level MSI-X vectors via modern transport common config.
+            // Config change → no vector (0xFFFF). Controlq (queue 0) → vector 0; 0xFFFF readback = rejected.
+            unsafe {
+                let ptr = &raw const GPU_PCI_STATE;
+                if let Some(ref state) = *ptr {
+                    state.device.set_config_msix_vector(0xFFFF);
+                    state.device.select_queue(0);
+                    let rb = state.device.set_queue_msix_vector(0);
+                    if rb == 0xFFFF {
+                        crate::serial_println!("[virtio-gpu-pci] MSI-X: device rejected controlq vector — disabling");
+                        GPU_IRQ.store(0, Ordering::Relaxed);
+                    } else {
+                        // Clear any pending SPI left over from init/VirGL commands, then enable it
+                        gic::clear_spi_pending(irq);
+                        gic::enable_spi(irq);
+                        crate::serial_println!("[virtio-gpu-pci] MSI-X SPI {} enabled — interrupt-driven GPU wake active", irq);
+                    }
+                }
+            }
+        }
+    } else {
+        crate::serial_println!("[virtio-gpu-pci] GPU yield enabled (polling mode — no MSI-X)");
+    }
+}
+
+/// Take (atomically read and reset to 0) the ticks accumulated in GPU_SLEEP_TICKS by GPU command waits.
+pub fn take_gpu_sleep_ticks() -> u64 {
+    GPU_SLEEP_TICKS.swap(0, Ordering::Relaxed)
+}
+
 // =============================================================================
 // Helpers
 // =============================================================================
@@ -1065,24 +1128,30 @@ fn setup_gpu_msi(pci_dev: &crate::drivers::pci::Device) -> u32 {
 
     let msi_address = base + 0x40;
 
-    // Step 3: Try MSI-X first (what VirtIO modern devices use)
+    // Step 3: Try MSI-X (what VirtIO modern devices use)
     if let Some(msix_cap) = pci_dev.find_msix_capability() {
         let table_size = pci_dev.msix_table_size(msix_cap);
-        let (table_bar, table_offset) = pci_dev.msix_table_location(msix_cap);
-        crate::serial_println!("[virtio-gpu-pci] MSI-X cap found: offset={:#x} table_size={} table_bar={} table_offset={:#x}",
-            msix_cap, table_size, table_bar, table_offset);
-
-        // Check BAR validity before accessing MSI-X table
-        let bar_info = &pci_dev.bars[table_bar as usize];
-        crate::serial_println!("[virtio-gpu-pci] MSI-X BAR {}: addr={:#x} size={:#x} valid={}",
-            table_bar, bar_info.address, bar_info.size, bar_info.is_valid());
-
-        // DIAGNOSTIC: Skip MSI-X PCI enable to avoid interrupt interference during init.
-        // We still write VirtIO MSI-X vectors to NO_VECTOR to test if vector
-        // configuration (without actual MSI-X) affects VirGL activation.
-        // Once VirGL works, we can re-enable MSI-X for runtime performance.
-        crate::serial_println!("[virtio-gpu-pci] MSI-X cap present but skipping PCI enable (using polling)");
-        // Return 0 = polling mode, VirtIO vectors will be set to NO_VECTOR below
+        crate::serial_println!("[virtio-gpu-pci] MSI-X cap at {:#x}: {} vectors", msix_cap, table_size);
+
+        // Program all MSI-X table entries with same SPI (single-vector mode)
+        for v in 0..table_size {
+            pci_dev.configure_msix_entry(msix_cap, v, msi_address, spi);
+        }
+
+        gic::configure_spi_edge_triggered(spi);
+        // Do NOT store GPU_IRQ or enable SPI here. GPU_IRQ=0 during init means
+        // send_command uses spin-polling and the interrupt handler ignores GPU SPIs.
+        // GPU_IRQ is stored at the end of init(); the SPI is enabled later in enable_gpu_yield().
+
+        // Enable MSI-X at PCI level and disable legacy INTx
+        pci_dev.enable_msix(msix_cap);
+        pci_dev.disable_intx();
+
+        crate::serial_println!(
+            "[virtio-gpu-pci] MSI-X enabled: SPI {} doorbell={:#x} vectors={}",
+            spi, msi_address, table_size
+        );
+        return spi;
     }
 
     // Step 4: Fall back to plain MSI
@@ -1126,6 +1195,15 @@ pub fn handle_interrupt() {
 
     GPU_CMD_COMPLETE.store(true, Ordering::Release);
 
+    // Wake the compositor thread blocked in send_command_3desc.
+    let waiting_tid = GPU_WAITING_THREAD.load(Ordering::Acquire);
+    if waiting_tid != 0 {
+        crate::task::scheduler::with_scheduler(|sched| {
+            sched.unblock(waiting_tid);
+        });
+        crate::task::scheduler::set_need_resched();
+    }
+
     gic::clear_spi_pending(irq);
     gic::enable_spi(irq);
 }
@@ -1293,15 +1371,8 @@ pub fn init() -> Result<(), &'static str> {
     // Mark device ready — MUST happen before sending any commands (Linux: virtio_device_ready())
     virtio.driver_ok();
 
-    // NOTE: We do NOT store msi_spi in GPU_IRQ yet! GPU_IRQ=0 means send_command
-    // uses spin-polling instead of WFI. At this early boot stage there's no timer
-    // interrupt, so if an MSI-X interrupt fails to deliver, WFI would block forever.
-    // We enable MSI-X interrupt delivery after all init commands succeed.
-    #[cfg(target_arch = "aarch64")]
-    if msi_spi != 0 {
-        crate::arch_impl::aarch64::gic::enable_spi(msi_spi);
-        crate::serial_println!("[virtio-gpu-pci] MSI-X SPI {} GIC-enabled (polling during init, WFI after)", msi_spi);
-    }
+    // GPU_IRQ is still 0 here: the SPI configured by setup_gpu_msi is NOT yet stored in GPU_IRQ.
+    // All init commands below use spin-polling. The SPI is enabled later by enable_gpu_yield().
 
     // Read device-specific config (Linux reads num_scanouts + num_capsets here)
     let num_scanouts = virtio.read_config_u32(GPU_CFG_NUM_SCANOUTS);
@@ -1409,13 +1480,14 @@ pub fn init() -> Result<(), &'static str> {
     // All GPU setup commands succeeded — now mark as initialized.
     GPU_PCI_INITIALIZED.store(true, Ordering::Release);
 
-    // NOW enable MSI-X interrupt-driven command completion. All init commands
-    // above used spin-polling (GPU_IRQ=0). From here on, send_command will use
-    // WFI to wait for MSI-X interrupts, which is more efficient for runtime.
+    // Store GPU_IRQ now so enable_gpu_yield() (called after virgl_init) can
+    // activate interrupt-driven wake. We do NOT enable the SPI here — all
+    // VirGL init commands also use spin-polling. enable_gpu_yield() handles
+    // assigning the VirtIO MSI-X vectors, clearing pending, and enabling the SPI.
     #[cfg(target_arch = "aarch64")]
     if msi_spi != 0 {
         GPU_IRQ.store(msi_spi, Ordering::Release);
-        crate::serial_println!("[virtio-gpu-pci] MSI-X WFI mode activated (SPI={})", msi_spi);
+        crate::serial_println!("[virtio-gpu-pci] MSI-X configured (SPI={}, deferred enable after VirGL init)", msi_spi);
     }
 
     crate::serial_println!("[virtio-gpu-pci] Initialized: {}x{}", use_width, use_height);
@@ -1510,12 +1582,22 @@ fn send_command(
         fence(Ordering::SeqCst);
     }
 
+    // Suppress device interrupts for fast 2-desc commands (spin-only).
+    unsafe {
+        let q = &raw mut PCI_CTRL_QUEUE;
+        (*q).avail.flags = 1; // VRING_AVAIL_F_NO_INTERRUPT
+        #[cfg(target_arch = "aarch64")]
+        dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64);
+    }
+
     // Signal that we're waiting for a completion, then notify device
     GPU_CMD_COMPLETE.store(false, Ordering::Release);
     state.device.notify_queue_fast(0);
 
-    // Wait for used ring update — WFI if MSI is available, spin_loop otherwise.
-    let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0;
+    // Wait for used ring update — tight spin for 2-desc commands.
+    // These complete in microseconds (SET_SCANOUT, RESOURCE_FLUSH, TRANSFER_TO_HOST_3D).
+    // Yielding here adds ~1-2ms of context switch overhead per command, which is
+    // catastrophic with 5+ commands per frame.
     let mut timeout = 10_000_000u32;
     loop {
         // Invalidate used ring cache line so we see the device's DMA write
@@ -1540,17 +1622,7 @@ fn send_command(
         if timeout == 0 {
             return Err("GPU PCI command timeout");
         }
-        if use_msi {
-            // WFI halts the vCPU until an interrupt arrives. The hypervisor
-            // processes the VirtIO command while the guest is halted, then
-            // delivers the MSI interrupt to wake us.
-            #[cfg(target_arch = "aarch64")]
-            unsafe { core::arch::asm!("wfi", options(nomem, nostack)); }
-            #[cfg(not(target_arch = "aarch64"))]
-            core::hint::spin_loop();
-        } else {
-            core::hint::spin_loop();
-        }
+        core::hint::spin_loop();
     }
 
     Ok(())
@@ -1630,48 +1702,180 @@ fn send_command_3desc(
         fence(Ordering::SeqCst);
     }
 
+    let can_yield = GPU_YIELD_ENABLED.load(Ordering::Relaxed);
+    let has_msi = GPU_IRQ.load(Ordering::Relaxed) != 0;
+
+    // When MSI-X is active: zero-spin, pure interrupt-driven wake.
+    // Enable device interrupts, register our thread, notify device, block immediately.
+    // The GPU interrupt handler fires on completion and wakes us — no CPU cycles
+    // wasted spinning. This matches Linux's virtio-gpu driver (wait_event_timeout).
+    //
+    // When MSI-X is not available: spin briefly then block with timer fallback.
+    if !has_msi {
+        // No MSI-X — suppress interrupts, we'll poll.
+        unsafe {
+            let q = &raw mut PCI_CTRL_QUEUE;
+            (*q).avail.flags = 1; // VRING_AVAIL_F_NO_INTERRUPT
+            #[cfg(target_arch = "aarch64")]
+            dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64);
+        }
+    }
+
     // Notify device
     state.device.notify_queue_fast(0);
 
-    // Poll for completion — WFI if MSI available, spin_loop otherwise.
-    let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0;
-    let mut timeout = 10_000_000u32;
     let used_len;
-    loop {
-        #[cfg(target_arch = "aarch64")]
+
+    #[cfg(target_arch = "aarch64")]
+    if has_msi && can_yield {
+        // MSI-X path: block immediately, zero spin.
+        let sleep_start: u64;
+        unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_start, options(nomem, nostack)); }
+
+        // Register thread for interrupt-driven wake
+        if let Some(tid) = crate::task::scheduler::current_thread_id() {
+            GPU_WAITING_THREAD.store(tid, Ordering::Release);
+        }
+
+        // Enable device interrupts — GPU will fire MSI-X on completion
         unsafe {
+            let q = &raw mut PCI_CTRL_QUEUE;
+            (*q).avail.flags = 0; // Enable notifications
+            dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64);
+        }
+
+        // Check if already complete before blocking (race: device may have
+        // finished between notify and here)
+        let already_done = unsafe {
             let q = &raw const PCI_CTRL_QUEUE;
             let used_addr = &(*q).used as *const _ as usize;
             core::arch::asm!("dc civac, {}", in(reg) used_addr, options(nostack));
             core::arch::asm!("dsb sy", options(nostack, preserves_flags));
+            fence(Ordering::Acquire);
+            read_volatile(&(*q).used.idx) != state.last_used_idx
+        };
+
+        if !already_done {
+            let (s, n) = crate::time::get_monotonic_time_ns();
+            let now_ns = (s as u64) * 1_000_000_000 + (n as u64);
+            let wake_ns = now_ns + 10_000_000; // 10ms safety timeout
+
+            crate::task::scheduler::with_scheduler(|sched| {
+                sched.block_current_for_compositor(wake_ns);
+            });
+            crate::per_cpu_aarch64::preempt_enable();
+            crate::task::scheduler::yield_current();
+            crate::arch_halt_with_interrupts();
+
+            crate::task::scheduler::with_scheduler(|sched| {
+                if let Some(thread) = sched.current_thread_mut() {
+                    thread.blocked_in_syscall = false;
+                }
+            });
+            crate::per_cpu_aarch64::preempt_disable();
+        }
+
+        GPU_WAITING_THREAD.store(0, Ordering::Release);
+
+        // Suppress interrupts now that we're awake
+        unsafe {
+            let q = &raw mut PCI_CTRL_QUEUE;
+            (*q).avail.flags = 1; // NO_INTERRUPT
+            dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64);
         }
 
+        let sleep_end: u64;
+        unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_end, options(nomem, nostack)); }
+        let slept = sleep_end.saturating_sub(sleep_start);
+        GPU_SLEEP_TICKS.fetch_add(slept, Ordering::Relaxed);
+        GPU_SLEEP_TICKS_PHASES.fetch_add(slept, Ordering::Relaxed);
+
+        // Read used ring — GPU must be done by now
+        unsafe {
+            let q = &raw const PCI_CTRL_QUEUE;
+            let used_addr = &(*q).used as *const _ as usize;
+            core::arch::asm!("dc civac, {}", in(reg) used_addr, options(nostack));
+            core::arch::asm!("dsb sy", options(nostack, preserves_flags));
+        }
         fence(Ordering::Acquire);
         let used_idx = unsafe {
             let q = &raw const PCI_CTRL_QUEUE;
             read_volatile(&(*q).used.idx)
         };
-        if used_idx != state.last_used_idx {
-            // Read used ring entry
-            used_len = unsafe {
-                let q = &raw const PCI_CTRL_QUEUE;
-                let elem_idx = (state.last_used_idx % 16) as usize;
-                read_volatile(&(*q).used.ring[elem_idx].len)
-            };
-            state.last_used_idx = used_idx;
-            break;
-        }
-        timeout -= 1;
-        if timeout == 0 {
-            return Err("GPU PCI 3-desc command timeout");
+        if used_idx == state.last_used_idx {
+            return Err("GPU PCI 3-desc command timeout (MSI-X wake but no completion)");
         }
-        if use_msi {
+        used_len = unsafe {
+            let q = &raw const PCI_CTRL_QUEUE;
+            let elem_idx = (state.last_used_idx % 16) as usize;
+            read_volatile(&(*q).used.ring[elem_idx].len)
+        };
+        state.last_used_idx = used_idx;
+    } else {
+        // Polling fallback: spin then block with timer wake.
+        let mut timeout = 10_000_000u32;
+        let mut spin_count = 0u32;
+        loop {
             #[cfg(target_arch = "aarch64")]
-            unsafe { core::arch::asm!("wfi", options(nomem, nostack)); }
-            #[cfg(not(target_arch = "aarch64"))]
-            core::hint::spin_loop();
-        } else {
-            core::hint::spin_loop();
+            unsafe {
+                let q = &raw const PCI_CTRL_QUEUE;
+                let used_addr = &(*q).used as *const _ as usize;
+                core::arch::asm!("dc civac, {}", in(reg) used_addr, options(nostack));
+                core::arch::asm!("dsb sy", options(nostack, preserves_flags));
+            }
+
+            fence(Ordering::Acquire);
+            let used_idx = unsafe {
+                let q = &raw const PCI_CTRL_QUEUE;
+                read_volatile(&(*q).used.idx)
+            };
+            if used_idx != state.last_used_idx {
+                used_len = unsafe {
+                    let q = &raw const PCI_CTRL_QUEUE;
+                    let elem_idx = (state.last_used_idx % 16) as usize;
+                    read_volatile(&(*q).used.ring[elem_idx].len)
+                };
+                state.last_used_idx = used_idx;
+                break;
+            }
+            timeout -= 1;
+            if timeout == 0 {
+                return Err("GPU PCI 3-desc command timeout");
+            }
+            spin_count += 1;
+            if can_yield && spin_count == 5_000 {
+                #[cfg(target_arch = "aarch64")]
+                {
+                    let sleep_start: u64;
+                    unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_start, options(nomem, nostack)); }
+
+                    let (s, n) = crate::time::get_monotonic_time_ns();
+                    let now_ns = (s as u64) * 1_000_000_000 + (n as u64);
+                    let wake_ns = now_ns + 4_000_000; // 4ms timeout
+
+                    crate::task::scheduler::with_scheduler(|sched| {
+                        sched.block_current_for_compositor(wake_ns);
+                    });
+                    crate::per_cpu_aarch64::preempt_enable();
+                    crate::task::scheduler::yield_current();
+                    crate::arch_halt_with_interrupts();
+                    crate::task::scheduler::with_scheduler(|sched| {
+                        if let Some(thread) = sched.current_thread_mut() {
+                            thread.blocked_in_syscall = false;
+                        }
+                    });
+                    crate::per_cpu_aarch64::preempt_disable();
+
+                    let sleep_end: u64;
+                    unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_end, options(nomem, nostack)); }
+                    let slept = sleep_end.saturating_sub(sleep_start);
+                    GPU_SLEEP_TICKS.fetch_add(slept, Ordering::Relaxed);
+                    GPU_SLEEP_TICKS_PHASES.fetch_add(slept, Ordering::Relaxed);
+                }
+                spin_count = 0;
+            } else {
+                core::hint::spin_loop();
+            }
         }
     }
 
@@ -3986,6 +4190,14 @@ pub fn virgl_composite_windows(
     // Note: cursor-only moves (no bg_dirty, no window_dirty) still trigger SUBMIT_3D
     // below to redraw the GPU cursor quad at the new position. No COMPOSITE_TEX upload needed.
 
+    // Perf: timestamp after COMPOSITE_TEX upload, before per-window uploads
+    #[cfg(target_arch = "aarch64")]
+    let t_after_bg = {
+        let v: u64;
+        unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) v, options(nomem, nostack)); }
+        v
+    };
+
     // =========================================================================
     // Phase A2: Upload per-window GPU textures
     // Per-window textures pre-allocated at init, TRANSFER_TO_HOST_3D proven working.
@@ -4005,7 +4217,7 @@ pub fn virgl_composite_windows(
     // Background + decorations from COMPOSITE_TEX, per-window content from
     // individual GPU textures, all in one SUBMIT_3D batch.
 
-    // Perf: timestamp before display phase
+    // Perf: timestamp before display phase (after all uploads)
     #[cfg(target_arch = "aarch64")]
     let t_display = {
         let v: u64;
@@ -4027,33 +4239,46 @@ pub fn virgl_composite_windows(
         crate::serial_println!("[composite-win] frame={} complete", frame);
     }
 
-    // Performance summary every 500 frames
+    // Performance summary every 500 WORK frames (frames that actually composited)
+    // Uses a separate counter from the main frame counter since many calls hit early return.
     #[cfg(target_arch = "aarch64")]
     {
         use core::sync::atomic::AtomicU64;
-        static PERF_COMPOSE_TICKS: AtomicU64 = AtomicU64::new(0);
-        static PERF_DISPLAY_TICKS: AtomicU64 = AtomicU64::new(0);
+        static WORK_FRAME: AtomicU32 = AtomicU32::new(0);
+        static PERF_BG_UPLOAD_TICKS: AtomicU64 = AtomicU64::new(0);
+        static PERF_WIN_UPLOAD_TICKS: AtomicU64 = AtomicU64::new(0);
+        static PERF_SUBMIT_TICKS: AtomicU64 = AtomicU64::new(0);
         static PERF_TOTAL_TICKS: AtomicU64 = AtomicU64::new(0);
 
-        let compose = t_display.saturating_sub(t_start);
-        let display = t_end.saturating_sub(t_display);
+        let wf = WORK_FRAME.fetch_add(1, Ordering::Relaxed);
+
+        let bg_upload = t_after_bg.saturating_sub(t_start);
+        let win_upload = t_display.saturating_sub(t_after_bg);
+        let submit = t_end.saturating_sub(t_display);
         let total = t_end.saturating_sub(t_start);
 
-        PERF_COMPOSE_TICKS.fetch_add(compose, Ordering::Relaxed);
-        PERF_DISPLAY_TICKS.fetch_add(display, Ordering::Relaxed);
+        PERF_BG_UPLOAD_TICKS.fetch_add(bg_upload, Ordering::Relaxed);
+        PERF_WIN_UPLOAD_TICKS.fetch_add(win_upload, Ordering::Relaxed);
+        PERF_SUBMIT_TICKS.fetch_add(submit, Ordering::Relaxed);
         PERF_TOTAL_TICKS.fetch_add(total, Ordering::Relaxed);
 
-        if frame > 0 && frame % 500 == 0 {
+        if wf > 0 && wf % 500 == 0 {
             let freq: u64;
             unsafe { core::arch::asm!("mrs {}, cntfrq_el0", out(reg) freq, options(nomem, nostack)); }
             let to_us = |ticks: u64| -> u64 { ticks * 1_000_000 / freq / 500 };
 
-            let avg_compose = to_us(PERF_COMPOSE_TICKS.swap(0, Ordering::Relaxed));
-            let avg_display = to_us(PERF_DISPLAY_TICKS.swap(0, Ordering::Relaxed));
-            let avg_total = to_us(PERF_TOTAL_TICKS.swap(0, Ordering::Relaxed));
+            let bg_us = to_us(PERF_BG_UPLOAD_TICKS.swap(0, Ordering::Relaxed));
+            let win_us = to_us(PERF_WIN_UPLOAD_TICKS.swap(0, Ordering::Relaxed));
+            let submit_wall_us = to_us(PERF_SUBMIT_TICKS.swap(0, Ordering::Relaxed));
+            let sleep_us = to_us(GPU_SLEEP_TICKS_PHASES.swap(0, Ordering::Relaxed));
+            let submit_cpu_us = submit_wall_us.saturating_sub(sleep_us);
+            let total_us = to_us(PERF_TOTAL_TICKS.swap(0, Ordering::Relaxed));
+            let cpu_us = total_us.saturating_sub(sleep_us);
 
-            // GPU perf counters available via GDB: PERF_COMPOSE_TICKS, PERF_DISPLAY_TICKS, PERF_TOTAL_TICKS
-            let _ = (avg_compose, avg_display, avg_total);
+            crate::serial_println!(
+                "[gpu-phases] 500f: bg={}us win={}us submit_wall={}us submit_cpu={}us total_cpu={}us sleep={}us",
+                bg_us, win_us, submit_wall_us, submit_cpu_us, cpu_us, sleep_us,
+            );
         }
     }
 
diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs
index 54bda828..9c1311a6 100644
--- a/kernel/src/syscall/graphics.rs
+++ b/kernel/src/syscall/graphics.rs
@@ -57,6 +57,18 @@ static COMPOSITOR_LAST_MOUSE: core::sync::atomic::AtomicU64 = core::sync::atomic
 #[cfg(target_arch = "aarch64")]
 static COMPOSITOR_DIRTY_WAKE: core::sync::atomic::AtomicBool = core::sync::atomic::AtomicBool::new(false);
 
+/// Timestamp (ns) of the last compositor_wait return.
+/// Used to enforce a minimum inter-frame interval so the compositor doesn't
+/// saturate the CPU when GPU wake is fast (e.g., MSI-X interrupt-driven).
+#[cfg(target_arch = "aarch64")]
+static COMPOSITOR_LAST_WAKE_NS: core::sync::atomic::AtomicU64 = core::sync::atomic::AtomicU64::new(0);
+
+/// Minimum nanoseconds between compositor_wait returns.
+/// 5ms = 200 FPS cap — smooth enough for all use cases while preventing
+/// the compositor from running flat-out when events arrive continuously.
+#[cfg(target_arch = "aarch64")]
+const MIN_FRAME_INTERVAL_NS: u64 = 5_000_000;
+
 /// Wake the compositor thread if it's blocked in compositor_wait (op=23).
 /// Called from input interrupt handlers (mouse, keyboard) to provide low-latency
 /// input response without polling.
@@ -176,6 +188,8 @@ struct WindowBuffer {
     /// Window position (set by compositor)
     x: i32,
     y: i32,
+    /// Z-order (0 = bottom, higher = closer to viewer). Set by compositor.
+    z_order: u32,
     /// VirGL TEXTURE_2D resource ID (0 = not initialized)
     virgl_resource_id: u32,
     /// Whether VirGL texture has been created + backed + primed
@@ -267,6 +281,7 @@ impl WindowRegistry {
             title_len: 0,
             x: 0,
             y: 0,
+            z_order: 0,
             virgl_resource_id: 0,
             virgl_initialized: false,
             generation: 0,
@@ -856,19 +871,68 @@ fn handle_virgl_op(cmd: &FbDrawCmd) -> SyscallResult {
             if desc_ptr == 0 || desc_ptr >= USER_SPACE_MAX {
                 return SyscallResult::Err(super::ErrorCode::Fault as u64);
             }
+
+            // ksyscall-perf: measure composite time and report CPU% every 500 frames
+            #[cfg(target_arch = "aarch64")]
+            {
+                let t0: u64;
+                unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) t0, options(nomem, nostack)); }
+
+                let result = handle_composite_windows(desc_ptr);
+
+                use core::sync::atomic::{AtomicU64, AtomicU32};
+                static PERF_FRAME: AtomicU32 = AtomicU32::new(0);
+                static PERF_GPU_TICKS: AtomicU64 = AtomicU64::new(0);
+                static PERF_EPOCH: AtomicU64 = AtomicU64::new(0);
+
+                let t1: u64;
+                unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) t1, options(nomem, nostack)); }
+
+                let frame = PERF_FRAME.fetch_add(1, core::sync::atomic::Ordering::Relaxed);
+                PERF_GPU_TICKS.fetch_add(t1.saturating_sub(t0), core::sync::atomic::Ordering::Relaxed);
+
+                if frame == 0 {
+                    PERF_EPOCH.store(t0, core::sync::atomic::Ordering::Relaxed);
+                }
+
+                if frame > 0 && (frame + 1) % 500 == 0 {
+                    let freq: u64;
+                    unsafe { core::arch::asm!("mrs {}, cntfrq_el0", out(reg) freq, options(nomem, nostack)); }
+                    let gpu_ticks = PERF_GPU_TICKS.swap(0, core::sync::atomic::Ordering::Relaxed);
+                    let sleep_ticks = crate::drivers::virtio::gpu_pci::take_gpu_sleep_ticks();
+                    let cpu_ticks = gpu_ticks.saturating_sub(sleep_ticks);
+                    let epoch = PERF_EPOCH.swap(t1, core::sync::atomic::Ordering::Relaxed);
+                    let wall_ticks = t1.saturating_sub(epoch);
+
+                    let wall_us = gpu_ticks * 1_000_000 / freq / 500;
+                    let sleep_us = sleep_ticks * 1_000_000 / freq / 500;
+                    let cpu_us = cpu_ticks * 1_000_000 / freq / 500;
+                    let busy_pct = if wall_ticks > 0 { cpu_ticks * 100 / wall_ticks } else { 0 };
+
+                    crate::serial_println!(
+                        "[ksyscall-perf] 500f: wall={}us sleep={}us cpu={}us busy={}%",
+                        wall_us, sleep_us, cpu_us, busy_pct,
+                    );
+                }
+
+                result
+            }
+            #[cfg(not(target_arch = "aarch64"))]
             handle_composite_windows(desc_ptr)
         }
         17 => {
-            // SetWindowPosition: set window position for compositor
-            // p1=buffer_id, p2=x (i16 low) | y (i16 high)
+            // SetWindowPosition: set window position + z-order for compositor
+            // p1=buffer_id, p2=x (i16 low) | y (i16 high), p3=z_order
             let buffer_id = cmd.p1 as u32;
             let x = (cmd.p2 & 0xFFFF) as i16 as i32;
             let y = ((cmd.p2 >> 16) & 0xFFFF) as i16 as i32;
+            let z_order = cmd.p3 as u32;
             let mut reg = WINDOW_REGISTRY.lock();
             match reg.find_mut(buffer_id) {
                 Some(buf) => {
                     buf.x = x;
                     buf.y = y;
+                    buf.z_order = z_order;
                     SyscallResult::Ok(0)
                 }
                 None => SyscallResult::Err(super::ErrorCode::InvalidArgument as u64),
@@ -1107,6 +1171,54 @@ fn handle_compositor_wait(cmd: &FbDrawCmd) -> SyscallResult {
     let timeout_ms = cmd.p1 as u32;
     let last_registry_gen = cmd.p2 as u32 as u64;
 
+    // Frame pacing: enforce minimum inter-frame interval.
+    // Without this, MSI-X interrupt-driven GPU wake causes the compositor to
+    // run flat-out (~200+ FPS), saturating the CPU. By sleeping until the
+    // minimum interval has elapsed, we cap effective FPS while keeping
+    // latency low for input events (mouse/keyboard still wake immediately).
+    //
+    // IMPORTANT: This uses a plain timer block, NOT block_current_for_compositor.
+    // We must NOT set COMPOSITOR_WAITING_THREAD here because mark_window_dirty
+    // would wake us early and consume the dirty signal, causing the main
+    // blocking section to re-block and wait for the full 16ms timeout.
+    let (s, n) = crate::time::get_monotonic_time_ns();
+    let now_ns = (s as u64) * 1_000_000_000 + (n as u64);
+    let last_wake = COMPOSITOR_LAST_WAKE_NS.load(Ordering::Relaxed);
+    if last_wake != 0 {
+        let earliest_return = last_wake + MIN_FRAME_INTERVAL_NS;
+        if now_ns < earliest_return {
+            crate::task::scheduler::with_scheduler(|sched| {
+                sched.block_current_for_timer(earliest_return);
+            });
+
+            #[cfg(target_arch = "aarch64")]
+            crate::per_cpu_aarch64::preempt_enable();
+
+            loop {
+                let still_blocked = crate::task::scheduler::with_scheduler(|sched| {
+                    sched.wake_expired_timers();
+                    sched.current_thread_mut()
+                        .map(|t| t.state == crate::task::thread::ThreadState::BlockedOnTimer)
+                        .unwrap_or(false)
+                });
+                if !still_blocked.unwrap_or(false) { break; }
+                crate::task::scheduler::yield_current();
+                crate::arch_halt_with_interrupts();
+            }
+
+            crate::task::scheduler::with_scheduler(|sched| {
+                if let Some(thread) = sched.current_thread_mut() {
+                    thread.blocked_in_syscall = false;
+                }
+            });
+
+            #[cfg(target_arch = "aarch64")]
+            crate::per_cpu_aarch64::preempt_disable();
+            #[cfg(target_arch = "aarch64")]
+            ensure_current_address_space();
+        }
+    }
+
     // Pack current mouse state for comparison
     let (mx, my, mb) = crate::drivers::usb::hid::mouse_state();
     let mouse_packed = ((mx as u64) << 32) | ((my as u64) << 16) | (mb as u64);
@@ -1126,10 +1238,16 @@ fn handle_compositor_wait(cmd: &FbDrawCmd) -> SyscallResult {
         ready |= 4;
     }
 
-    // If mouse or registry changed, return immediately (don't check dirty — BWM
-    // will do its own per-window dirty check via check_window_dirty).
+    // Bit 0: dirty window signal pending (may have arrived during frame pacing sleep)
+    if COMPOSITOR_DIRTY_WAKE.swap(false, Ordering::Relaxed) {
+        ready |= 1;
+    }
+
+    // If anything is ready, return immediately.
     if ready != 0 {
         COMPOSITOR_LAST_MOUSE.store(mouse_packed, Ordering::Relaxed);
+        let (ws, wn) = crate::time::get_monotonic_time_ns();
+        COMPOSITOR_LAST_WAKE_NS.store((ws as u64) * 1_000_000_000 + (wn as u64), Ordering::Relaxed);
         return SyscallResult::Ok(ready | ((cur_reg_gen & 0x00FF_FFFF) << 8));
     }
 
@@ -1199,6 +1317,9 @@ fn handle_compositor_wait(cmd: &FbDrawCmd) -> SyscallResult {
 
     COMPOSITOR_LAST_MOUSE.store(mouse_packed2, Ordering::Relaxed);
 
+    let (ws2, wn2) = crate::time::get_monotonic_time_ns();
+    COMPOSITOR_LAST_WAKE_NS.store((ws2 as u64) * 1_000_000_000 + (wn2 as u64), Ordering::Relaxed);
+
     SyscallResult::Ok(ready_after | ((cur_reg_gen2 & 0x00FF_FFFF) << 8))
 }
 
@@ -1340,6 +1461,7 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult {
                     height: buf.height,
                     x: buf.x,
                     y: buf.y,
+                    z_order: buf.z_order,
                     dirty,
                     page_phys_addrs: buf.page_phys_addrs.clone(),
                     size: buf.size,
@@ -1356,6 +1478,8 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult {
                 win_idx += 1;
             }
         }
+        // Sort by z_order so GPU draws back-to-front (lower z = drawn first).
+        result.sort_unstable_by_key(|w| w.z_order);
         result
     };
 
@@ -1420,6 +1544,7 @@ pub struct WindowCompositeInfo {
     pub height: u32,
     pub x: i32,
     pub y: i32,
+    pub z_order: u32,
     pub dirty: bool,
     pub page_phys_addrs: alloc::vec::Vec<u64>,
     pub size: usize,
diff --git a/kernel/src/task/scheduler.rs b/kernel/src/task/scheduler.rs
index ad3dfe4b..6883b63a 100644
--- a/kernel/src/task/scheduler.rs
+++ b/kernel/src/task/scheduler.rs
@@ -514,23 +514,28 @@ impl Scheduler {
                 // Check the state and determine what to do
                 let (is_terminated, is_blocked) =
                     if let Some(current) = self.get_thread_mut(current_id) {
-                        // Charge elapsed CPU ticks to the outgoing thread
-                        let now = crate::time::get_ticks();
-                        current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks);
-                        // Reset run_start_ticks so that if no context switch happens
-                        // (function returns None), the next call won't double-count.
-                        current.run_start_ticks = now;
-
                         let was_terminated = current.state == ThreadState::Terminated;
                         // Check for any blocked state
                         let was_blocked = current.state == ThreadState::Blocked
                             || current.state == ThreadState::BlockedOnSignal
                             || current.state == ThreadState::BlockedOnChildExit
                             || current.state == ThreadState::BlockedOnTimer;
-                        // Only set to Ready if not terminated AND not blocked
-                        if !was_terminated && !was_blocked {
+
+                        // Charge elapsed CPU ticks to the outgoing thread, but ONLY
+                        // if it was actually running. Blocked threads already had
+                        // their ticks charged at block time — charging again here
+                        // would count blocked/sleeping time as CPU usage.
+                        if !was_blocked && !was_terminated {
+                            let now = crate::time::get_ticks();
+                            current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks);
+                            current.run_start_ticks = now;
                             current.set_ready();
+                        } else {
+                            // Reset run_start_ticks so the next dispatch doesn't
+                            // charge stale time from the blocked period.
+                            current.run_start_ticks = crate::time::get_ticks();
                         }
+
                         (was_terminated, was_blocked)
                     } else {
                         (true, false)
@@ -712,21 +717,22 @@ impl Scheduler {
             if current_id != self.cpu_state[Self::current_cpu_id()].idle_thread {
                 let (is_terminated, is_blocked) =
                     if let Some(current) = self.get_thread_mut(current_id) {
-                        // Charge elapsed CPU ticks to the outgoing thread
-                        let now = crate::time::get_ticks();
-                        current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks);
-                        // Reset run_start_ticks so that if no context switch happens
-                        // (function returns None), the next call won't double-count.
-                        current.run_start_ticks = now;
-
                         let was_terminated = current.state == ThreadState::Terminated;
                         let was_blocked = current.state == ThreadState::Blocked
                             || current.state == ThreadState::BlockedOnSignal
                             || current.state == ThreadState::BlockedOnChildExit
                             || current.state == ThreadState::BlockedOnTimer;
-                        if !was_terminated && !was_blocked {
+
+                        // Only charge CPU ticks if thread was actually running
+                        if !was_blocked && !was_terminated {
+                            let now = crate::time::get_ticks();
+                            current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks);
+                            current.run_start_ticks = now;
                             current.set_ready();
+                        } else {
+                            current.run_start_ticks = crate::time::get_ticks();
                         }
+
                         (was_terminated, was_blocked)
                     } else {
                         (true, false)
@@ -887,6 +893,11 @@ impl Scheduler {
     #[allow(dead_code)]
     pub fn block_current(&mut self) {
         if let Some(current) = self.current_thread_mut() {
+            // Charge elapsed CPU ticks before blocking
+            let now = crate::time::get_ticks();
+            current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks);
+            current.run_start_ticks = now;
+
             current.set_blocked();
         }
     }
@@ -1010,6 +1021,11 @@ impl Scheduler {
     ) {
         if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread {
             if let Some(thread) = self.get_thread_mut(current_id) {
+                // Charge elapsed CPU ticks before blocking
+                let now = crate::time::get_ticks();
+                thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks);
+                thread.run_start_ticks = now;
+
                 // CRITICAL: Save userspace context FIRST, THEN set state.
                 // This ensures that when unblock_for_signal() is called,
                 // the context is already saved and ready for signal delivery.
@@ -1133,6 +1149,11 @@ impl Scheduler {
     pub fn block_current_for_child_exit(&mut self) {
         if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread {
             if let Some(thread) = self.get_thread_mut(current_id) {
+                // Charge elapsed CPU ticks before blocking
+                let now = crate::time::get_ticks();
+                thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks);
+                thread.run_start_ticks = now;
+
                 thread.state = ThreadState::BlockedOnChildExit;
                 // CRITICAL: Mark that this thread is blocked inside a syscall.
                 // When the thread is resumed, we must NOT restore userspace context
@@ -1199,6 +1220,11 @@ impl Scheduler {
     pub fn block_current_for_timer(&mut self, wake_time_ns: u64) {
         if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread {
             if let Some(thread) = self.get_thread_mut(current_id) {
+                // Charge elapsed CPU ticks before blocking
+                let now = crate::time::get_ticks();
+                thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks);
+                thread.run_start_ticks = now;
+
                 thread.state = ThreadState::BlockedOnTimer;
                 thread.wake_time_ns = Some(wake_time_ns);
                 thread.blocked_in_syscall = true;
@@ -1216,6 +1242,13 @@ impl Scheduler {
     pub fn block_current_for_compositor(&mut self, timeout_ns: u64) {
         if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread {
             if let Some(thread) = self.get_thread_mut(current_id) {
+                // Charge elapsed CPU ticks NOW, before blocking. Otherwise the
+                // next schedule() call charges all time since last dispatch —
+                // including blocked/sleeping time — as CPU usage.
+                let now = crate::time::get_ticks();
+                thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks);
+                thread.run_start_ticks = now;
+
                 thread.state = ThreadState::BlockedOnTimer;
                 thread.wake_time_ns = Some(timeout_ns);
                 thread.blocked_in_syscall = true;
diff --git a/libs/libbreenix/src/graphics.rs b/libs/libbreenix/src/graphics.rs
index c350f0d8..8478540c 100644
--- a/libs/libbreenix/src/graphics.rs
+++ b/libs/libbreenix/src/graphics.rs
@@ -567,13 +567,13 @@ pub fn mark_window_dirty(buffer_id: u32) -> Result<(), Error> {
 ///
 /// Tells the kernel where to place this window during compositing.
 /// If position is never set, windows are auto-positioned.
-pub fn set_window_position(buffer_id: u32, x: i32, y: i32) -> Result<(), Error> {
+pub fn set_window_position(buffer_id: u32, x: i32, y: i32, z_order: u32) -> Result<(), Error> {
     let packed_xy = ((x as u16 as u32) | ((y as u16 as u32) << 16)) as i32;
     let cmd = FbDrawCmd {
         op: draw_op::SET_WINDOW_POSITION,
         p1: buffer_id as i32,
         p2: packed_xy,
-        p3: 0,
+        p3: z_order as i32,
         p4: 0,
         color: 0,
     };
diff --git a/userspace/programs/src/bwm.rs b/userspace/programs/src/bwm.rs
index 37d793a9..15efca25 100644
--- a/userspace/programs/src/bwm.rs
+++ b/userspace/programs/src/bwm.rs
@@ -52,7 +52,6 @@ const TITLE_TEXT: Color = Color::rgb(160, 165, 175);
 const TITLE_FOCUSED_TEXT: Color = Color::rgb(255, 255, 255);
 const WIN_BORDER_COLOR: Color = Color::rgb(50, 55, 70);
 const WIN_BORDER_FOCUSED: Color = Color::rgb(60, 130, 255);
-const CONTENT_BG: Color = Color::rgb(20, 25, 40);
 
 // Taskbar/Appbar colors
 const TASKBAR_BG: Color = Color::rgb(20, 22, 30);
@@ -244,15 +243,7 @@ impl Window {
 // ─── Drawing Helpers ─────────────────────────────────────────────────────────
 
 fn fill_rect(fb: &mut FrameBuf, x: i32, y: i32, w: usize, h: usize, color: Color) {
-    for dy in 0..h as i32 {
-        let py = y + dy;
-        if py < 0 || py >= fb.height as i32 { continue; }
-        for dx in 0..w as i32 {
-            let px = x + dx;
-            if px < 0 || px >= fb.width as i32 { continue; }
-            fb.put_pixel(px as usize, py as usize, color);
-        }
-    }
+    libgfx::shapes::fill_rect(fb, x, y, w as i32, h as i32, color);
 }
 
 fn draw_text_at(fb: &mut FrameBuf, text: &[u8], x: i32, y: i32, color: Color) {
@@ -298,8 +289,7 @@ fn draw_window_frame(fb: &mut FrameBuf, win: &Window, focused: bool) {
     let my = mby + (mbh as i32 - CELL_H as i32) / 2;
     draw_text_at(fb, b"-", mx, my, MINIMIZE_BTN_TEXT);
 
-    fill_rect(fb, win.content_x(), win.content_y(),
-              win.content_width(), win.content_height(), CONTENT_BG);
+    // Content area NOT filled here — GPU composites per-window textures over it.
 }
 
 /// Paint the decorative desktop background — gradient with grid
@@ -604,10 +594,13 @@ fn discover_windows(windows: &mut Vec<Window>, screen_w: usize, screen_h: usize,
             core::str::from_utf8(&title[..title_len]).unwrap_or("?"),
             info.buffer_id, info.width, info.height, cascade_x, cascade_y);
 
-        // Tell kernel where the client content goes on screen (for GPU compositing)
+        // Tell kernel where the client content goes on screen (for GPU compositing).
+        // z_order = index in windows vec (0 = bottom). New windows are pushed to
+        // the end, so they get the highest z_order.
         let content_x = cascade_x + BORDER_WIDTH as i32;
         let content_y = cascade_y + TITLE_BAR_HEIGHT as i32 + BORDER_WIDTH as i32;
-        let _ = graphics::set_window_position(info.buffer_id, content_x, content_y);
+        let z_order = windows.len() as u32; // will be at this index after push
+        let _ = graphics::set_window_position(info.buffer_id, content_x, content_y, z_order);
 
         let order = *next_order;
         *next_order += 1;
@@ -624,6 +617,17 @@ fn discover_windows(windows: &mut Vec<Window>, screen_w: usize, screen_h: usize,
     removed || added
 }
 
+/// Push the current stacking order to the kernel. Each window's index in
+/// `windows` (0 = bottom) is sent as its z-order together with its content
+/// origin, so the GPU compositor draws quads in back-to-front order.
+fn update_kernel_z_order(windows: &[Window]) {
+    let with_ids = windows.iter().enumerate().filter(|(_, w)| w.window_id != 0);
+    for (z, w) in with_ids {
+        // The result is deliberately discarded, matching the other call sites.
+        let _ = graphics::set_window_position(w.window_id, w.content_x(), w.content_y(), z as u32);
+    }
+}
+
 /// Redraw all windows in z-order (index 0 = bottom), plus taskbar and app bar.
 /// Window frames and decorations go into the compositor buffer; GPU compositing
 /// handles client content via per-window textured quads.
@@ -845,6 +849,7 @@ fn main() {
     let mut last_clock_sec: i64 = -1;
     let mut clock_text = [0u8; 11];
     format_clock(0, &mut clock_text);
+    let mut frame_counter: u32 = 0;
     let mut next_creation_order: u32 = 0;
 
     // Initial composite
@@ -882,6 +887,7 @@ fn main() {
                 // New windows are pushed to end of Vec (top of z-order).
                 // Always focus the topmost visible window so appbar selection
                 // matches the visually foregrounded window.
+                update_kernel_z_order(&windows);
                 focused_win = next_visible_window(&windows, 0);
                 compose_full_redraw(composite_buf, &mut fb, &mut shadow_fb, &bg_cache, &windows, focused_win, &clock_text);
                 full_redraw = true;
@@ -961,7 +967,7 @@ fn main() {
                             if windows[win_idx].window_id != 0 {
                                 let cx = windows[win_idx].content_x();
                                 let cy = windows[win_idx].content_y();
-                                let _ = graphics::set_window_position(windows[win_idx].window_id, cx, cy);
+                                let _ = graphics::set_window_position(windows[win_idx].window_id, cx, cy, win_idx as u32);
                             }
                             // Dirty region = union of old and new bounds
                             let (nx0, ny0, nx1, ny1) = windows[win_idx].bounds();
@@ -1028,6 +1034,7 @@ fn main() {
                                 if idx < windows.len() - 1 {
                                     let win = windows.remove(idx);
                                     windows.push(win);
+                                    update_kernel_z_order(&windows);
                                 }
                                 let top = windows.len() - 1;
                                 if top != focused_win {
@@ -1057,13 +1064,16 @@ fn main() {
                             }
                         }
                         if let Some(ci) = clicked_idx {
-                            if ci < windows.len() - 1 {
+                            let z_changed = ci < windows.len() - 1;
+                            if z_changed {
                                 let win = windows.remove(ci);
                                 windows.push(win);
+                                update_kernel_z_order(&windows);
                             }
                             let top = windows.len() - 1;
+                            let focus_changed = top != focused_win;
 
-                            if top != focused_win {
+                            if focus_changed {
                                 send_focus_event(&windows, focused_win, input_event_type::FOCUS_LOST);
                                 focused_win = top;
                                 send_focus_event(&windows, focused_win, input_event_type::FOCUS_GAINED);
@@ -1102,8 +1112,9 @@ fn main() {
                                 route_mouse_button_to_focused(&windows, focused_win, 1, true, local_x, local_y);
                             }
 
-                            // Full redraw for z-order change (unless minimize already did it)
-                            if !full_redraw {
+                            // Full redraw for z-order or focus change (unless minimize
+                            // already did it, or nothing visual changed)
+                            if !full_redraw && (z_changed || focus_changed) {
                                 compose_full_redraw(composite_buf, &mut fb, &mut shadow_fb, &bg_cache, &windows, focused_win, &clock_text);
                                 full_redraw = true;
                             }
@@ -1124,17 +1135,20 @@ fn main() {
         }
 
         // ── 5b. Update clock (once per second) ──
-        if let Ok(ts) = libbreenix::time::now_realtime() {
-            if ts.tv_sec != last_clock_sec {
-                last_clock_sec = ts.tv_sec;
-                format_clock(ts.tv_sec, &mut clock_text);
-                draw_taskbar(&mut fb, &clock_text);
-                // Expand dirty rect to cover taskbar
-                dirty_x0 = 0;
-                dirty_y0 = 0;
-                dirty_x1 = dirty_x1.max(screen_w as i32);
-                dirty_y1 = dirty_y1.max(TASKBAR_HEIGHT as i32);
-                content_dirty = true;
+        // Only check realtime every 30 frames (~6-7 checks/sec at 200 FPS)
+        frame_counter = frame_counter.wrapping_add(1);
+        if frame_counter % 30 == 0 {
+            if let Ok(ts) = libbreenix::time::now_realtime() {
+                if ts.tv_sec != last_clock_sec {
+                    last_clock_sec = ts.tv_sec;
+                    format_clock(ts.tv_sec, &mut clock_text);
+                    draw_taskbar(&mut fb, &clock_text);
+                    dirty_x0 = 0;
+                    dirty_y0 = 0;
+                    dirty_x1 = dirty_x1.max(screen_w as i32);
+                    dirty_y1 = dirty_y1.max(TASKBAR_HEIGHT as i32);
+                    content_dirty = true;
+                }
             }
         }
 
@@ -1176,6 +1190,5 @@ fn main() {
             windows_dirty = false;
         }
         // No sleep — compositor_wait handles blocking
-
     }
 }