diff --git a/kernel/src/drivers/mod.rs b/kernel/src/drivers/mod.rs index 4539b2a1..abb62270 100644 --- a/kernel/src/drivers/mod.rs +++ b/kernel/src/drivers/mod.rs @@ -122,6 +122,10 @@ pub fn init() -> usize { serial_println!("[drivers] VirGL 3D acceleration active"); crate::graphics::set_compositor_backend(crate::graphics::CompositorBackend::VirGL); serial_println!("[drivers] Compositor backend: VirGL"); + // Enable yielding during GPU command waits now that init is complete. + // During runtime, GPU poll loops yield to the scheduler instead of + // spinning, letting other tasks run during ~3.4ms GPU processing. + virtio::gpu_pci::enable_gpu_yield(); } Err(e) => serial_println!("[drivers] VirGL init skipped: {}", e), } diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs index 4e764435..81f816cc 100644 --- a/kernel/src/drivers/virtio/gpu_pci.rs +++ b/kernel/src/drivers/virtio/gpu_pci.rs @@ -924,6 +924,69 @@ static GPU_IRQ: AtomicU32 = AtomicU32::new(0); /// Set by the interrupt handler to wake the WFI loop in send_command(). static GPU_CMD_COMPLETE: AtomicBool = AtomicBool::new(false); +/// Whether GPU command waits should yield to the scheduler instead of spinning. +/// False during init (single-threaded, scheduler may not be ready). +/// True after init completes (compositor runtime — yield lets other tasks run +/// during the ~3.4ms GPU processing time for SUBMIT_3D). +static GPU_YIELD_ENABLED: AtomicBool = AtomicBool::new(false); + +/// Accumulated ticks spent sleeping (blocked) during GPU command waits. +/// Used by ksyscall-perf to distinguish sleep time from CPU time. +static GPU_SLEEP_TICKS: AtomicU64 = AtomicU64::new(0); + +/// Separate sleep tick counter for gpu-phases reporting. +/// ksyscall-perf swaps GPU_SLEEP_TICKS to 0 on its own schedule; +/// gpu-phases needs its own counter to avoid interference. 
+static GPU_SLEEP_TICKS_PHASES: AtomicU64 = AtomicU64::new(0); + +/// Thread ID of the thread currently blocked waiting for GPU command completion. +/// Set before blocking in send_command_3desc, cleared after waking. +/// The GPU interrupt handler uses this to wake the thread immediately. +static GPU_WAITING_THREAD: AtomicU64 = AtomicU64::new(0); + +/// Enable yielding during GPU command waits. +/// Called after GPU init completes, when the scheduler is fully running. +pub fn enable_gpu_yield() { + GPU_YIELD_ENABLED.store(true, Ordering::Release); + + // Enable GPU MSI-X SPI now that VirGL init is complete. + // During init, the SPI is configured but not enabled to avoid interrupt storms. + let irq = GPU_IRQ.load(Ordering::Relaxed); + if irq != 0 { + #[cfg(target_arch = "aarch64")] + { + use crate::arch_impl::aarch64::gic; + + // Assign VirtIO-level MSI-X vectors via modern transport common config. + // Config change → no interrupt (0xFFFF). Controlq (0) → vector 0. + unsafe { + let ptr = &raw const GPU_PCI_STATE; + if let Some(ref state) = *ptr { + state.device.set_config_msix_vector(0xFFFF); + state.device.select_queue(0); + let rb = state.device.set_queue_msix_vector(0); + if rb == 0xFFFF { + crate::serial_println!("[virtio-gpu-pci] MSI-X: device rejected controlq vector — disabling"); + GPU_IRQ.store(0, Ordering::Relaxed); + } else { + // Clear any pending SPI from init/VirGL commands before enabling + gic::clear_spi_pending(irq); + gic::enable_spi(irq); + crate::serial_println!("[virtio-gpu-pci] MSI-X SPI {} enabled — interrupt-driven GPU wake active", irq); + } + } + } + } + } else { + crate::serial_println!("[virtio-gpu-pci] GPU yield enabled (polling mode — no MSI-X)"); + } +} + +/// Take (read and reset) accumulated sleep ticks from GPU command waits. 
+pub fn take_gpu_sleep_ticks() -> u64 { + GPU_SLEEP_TICKS.swap(0, Ordering::Relaxed) +} + // ============================================================================= // Helpers // ============================================================================= @@ -1065,24 +1128,30 @@ fn setup_gpu_msi(pci_dev: &crate::drivers::pci::Device) -> u32 { let msi_address = base + 0x40; - // Step 3: Try MSI-X first (what VirtIO modern devices use) + // Step 3: Try MSI-X (what VirtIO modern devices use) if let Some(msix_cap) = pci_dev.find_msix_capability() { let table_size = pci_dev.msix_table_size(msix_cap); - let (table_bar, table_offset) = pci_dev.msix_table_location(msix_cap); - crate::serial_println!("[virtio-gpu-pci] MSI-X cap found: offset={:#x} table_size={} table_bar={} table_offset={:#x}", - msix_cap, table_size, table_bar, table_offset); - - // Check BAR validity before accessing MSI-X table - let bar_info = &pci_dev.bars[table_bar as usize]; - crate::serial_println!("[virtio-gpu-pci] MSI-X BAR {}: addr={:#x} size={:#x} valid={}", - table_bar, bar_info.address, bar_info.size, bar_info.is_valid()); - - // DIAGNOSTIC: Skip MSI-X PCI enable to avoid interrupt interference during init. - // We still write VirtIO MSI-X vectors to NO_VECTOR to test if vector - // configuration (without actual MSI-X) affects VirGL activation. - // Once VirGL works, we can re-enable MSI-X for runtime performance. - crate::serial_println!("[virtio-gpu-pci] MSI-X cap present but skipping PCI enable (using polling)"); - // Return 0 = polling mode, VirtIO vectors will be set to NO_VECTOR below + crate::serial_println!("[virtio-gpu-pci] MSI-X cap at {:#x}: {} vectors", msix_cap, table_size); + + // Program all MSI-X table entries with same SPI (single-vector mode) + for v in 0..table_size { + pci_dev.configure_msix_entry(msix_cap, v, msi_address, spi); + } + + gic::configure_spi_edge_triggered(spi); + // Do NOT store GPU_IRQ or enable SPI here. 
GPU_IRQ=0 during init means + // send_command uses spin-polling and the interrupt handler ignores GPU SPIs. + // Both are activated after all init commands succeed (see end of init()). + + // Enable MSI-X at PCI level and disable legacy INTx + pci_dev.enable_msix(msix_cap); + pci_dev.disable_intx(); + + crate::serial_println!( + "[virtio-gpu-pci] MSI-X enabled: SPI {} doorbell={:#x} vectors={}", + spi, msi_address, table_size + ); + return spi; } // Step 4: Fall back to plain MSI @@ -1126,6 +1195,15 @@ pub fn handle_interrupt() { GPU_CMD_COMPLETE.store(true, Ordering::Release); + // Wake the compositor thread blocked in send_command_3desc. + let waiting_tid = GPU_WAITING_THREAD.load(Ordering::Acquire); + if waiting_tid != 0 { + crate::task::scheduler::with_scheduler(|sched| { + sched.unblock(waiting_tid); + }); + crate::task::scheduler::set_need_resched(); + } + gic::clear_spi_pending(irq); gic::enable_spi(irq); } @@ -1293,15 +1371,8 @@ pub fn init() -> Result<(), &'static str> { // Mark device ready — MUST happen before sending any commands (Linux: virtio_device_ready()) virtio.driver_ok(); - // NOTE: We do NOT store msi_spi in GPU_IRQ yet! GPU_IRQ=0 means send_command - // uses spin-polling instead of WFI. At this early boot stage there's no timer - // interrupt, so if an MSI-X interrupt fails to deliver, WFI would block forever. - // We enable MSI-X interrupt delivery after all init commands succeed. - #[cfg(target_arch = "aarch64")] - if msi_spi != 0 { - crate::arch_impl::aarch64::gic::enable_spi(msi_spi); - crate::serial_println!("[virtio-gpu-pci] MSI-X SPI {} GIC-enabled (polling during init, WFI after)", msi_spi); - } + // GPU_IRQ=0 at this point (set in setup_gpu_msi but NOT stored in GPU_IRQ). + // All init commands below use spin-polling. SPI is enabled after init succeeds. 
// Read device-specific config (Linux reads num_scanouts + num_capsets here) let num_scanouts = virtio.read_config_u32(GPU_CFG_NUM_SCANOUTS); @@ -1409,13 +1480,14 @@ pub fn init() -> Result<(), &'static str> { // All GPU setup commands succeeded — now mark as initialized. GPU_PCI_INITIALIZED.store(true, Ordering::Release); - // NOW enable MSI-X interrupt-driven command completion. All init commands - // above used spin-polling (GPU_IRQ=0). From here on, send_command will use - // WFI to wait for MSI-X interrupts, which is more efficient for runtime. + // Store GPU_IRQ now so enable_gpu_yield() (called after virgl_init) can + // activate interrupt-driven wake. We do NOT enable the SPI here — all + // VirGL init commands also use spin-polling. enable_gpu_yield() handles + // clearing pending, storing GPU_IRQ, and enabling the SPI. #[cfg(target_arch = "aarch64")] if msi_spi != 0 { GPU_IRQ.store(msi_spi, Ordering::Release); - crate::serial_println!("[virtio-gpu-pci] MSI-X WFI mode activated (SPI={})", msi_spi); + crate::serial_println!("[virtio-gpu-pci] MSI-X configured (SPI={}, deferred enable after VirGL init)", msi_spi); } crate::serial_println!("[virtio-gpu-pci] Initialized: {}x{}", use_width, use_height); @@ -1510,12 +1582,22 @@ fn send_command( fence(Ordering::SeqCst); } + // Suppress device interrupts for fast 2-desc commands (spin-only). + unsafe { + let q = &raw mut PCI_CTRL_QUEUE; + (*q).avail.flags = 1; // VRING_AVAIL_F_NO_INTERRUPT + #[cfg(target_arch = "aarch64")] + dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64); + } + // Signal that we're waiting for a completion, then notify device GPU_CMD_COMPLETE.store(false, Ordering::Release); state.device.notify_queue_fast(0); - // Wait for used ring update — WFI if MSI is available, spin_loop otherwise. - let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0; + // Wait for used ring update — tight spin for 2-desc commands. 
+ // These complete in microseconds (SET_SCANOUT, RESOURCE_FLUSH, TRANSFER_TO_HOST_3D). + // Yielding here adds ~1-2ms of context switch overhead per command, which is + // catastrophic with 5+ commands per frame. let mut timeout = 10_000_000u32; loop { // Invalidate used ring cache line so we see the device's DMA write @@ -1540,17 +1622,7 @@ fn send_command( if timeout == 0 { return Err("GPU PCI command timeout"); } - if use_msi { - // WFI halts the vCPU until an interrupt arrives. The hypervisor - // processes the VirtIO command while the guest is halted, then - // delivers the MSI interrupt to wake us. - #[cfg(target_arch = "aarch64")] - unsafe { core::arch::asm!("wfi", options(nomem, nostack)); } - #[cfg(not(target_arch = "aarch64"))] - core::hint::spin_loop(); - } else { - core::hint::spin_loop(); - } + core::hint::spin_loop(); } Ok(()) @@ -1630,48 +1702,180 @@ fn send_command_3desc( fence(Ordering::SeqCst); } + let can_yield = GPU_YIELD_ENABLED.load(Ordering::Relaxed); + let has_msi = GPU_IRQ.load(Ordering::Relaxed) != 0; + + // When MSI-X is active: zero-spin, pure interrupt-driven wake. + // Enable device interrupts, register our thread, notify device, block immediately. + // The GPU interrupt handler fires on completion and wakes us — no CPU cycles + // wasted spinning. This matches Linux's virtio-gpu driver (wait_event_timeout). + // + // When MSI-X is not available: spin briefly then block with timer fallback. + if !has_msi { + // No MSI-X — suppress interrupts, we'll poll. + unsafe { + let q = &raw mut PCI_CTRL_QUEUE; + (*q).avail.flags = 1; // VRING_AVAIL_F_NO_INTERRUPT + #[cfg(target_arch = "aarch64")] + dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64); + } + } + // Notify device state.device.notify_queue_fast(0); - // Poll for completion — WFI if MSI available, spin_loop otherwise. 
- let use_msi = GPU_IRQ.load(Ordering::Relaxed) != 0; - let mut timeout = 10_000_000u32; let used_len; - loop { - #[cfg(target_arch = "aarch64")] + + #[cfg(target_arch = "aarch64")] + if has_msi && can_yield { + // MSI-X path: block immediately, zero spin. + let sleep_start: u64; + unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_start, options(nomem, nostack)); } + + // Register thread for interrupt-driven wake + if let Some(tid) = crate::task::scheduler::current_thread_id() { + GPU_WAITING_THREAD.store(tid, Ordering::Release); + } + + // Enable device interrupts — GPU will fire MSI-X on completion unsafe { + let q = &raw mut PCI_CTRL_QUEUE; + (*q).avail.flags = 0; // Enable notifications + dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64); + } + + // Check if already complete before blocking (race: device may have + // finished between notify and here) + let already_done = unsafe { let q = &raw const PCI_CTRL_QUEUE; let used_addr = &(*q).used as *const _ as usize; core::arch::asm!("dc civac, {}", in(reg) used_addr, options(nostack)); core::arch::asm!("dsb sy", options(nostack, preserves_flags)); + fence(Ordering::Acquire); + read_volatile(&(*q).used.idx) != state.last_used_idx + }; + + if !already_done { + let (s, n) = crate::time::get_monotonic_time_ns(); + let now_ns = (s as u64) * 1_000_000_000 + (n as u64); + let wake_ns = now_ns + 10_000_000; // 10ms safety timeout + + crate::task::scheduler::with_scheduler(|sched| { + sched.block_current_for_compositor(wake_ns); + }); + crate::per_cpu_aarch64::preempt_enable(); + crate::task::scheduler::yield_current(); + crate::arch_halt_with_interrupts(); + + crate::task::scheduler::with_scheduler(|sched| { + if let Some(thread) = sched.current_thread_mut() { + thread.blocked_in_syscall = false; + } + }); + crate::per_cpu_aarch64::preempt_disable(); + } + + GPU_WAITING_THREAD.store(0, Ordering::Release); + + // Suppress interrupts now that we're awake + unsafe { + let q = &raw mut 
PCI_CTRL_QUEUE; + (*q).avail.flags = 1; // NO_INTERRUPT + dma_cache_clean(&(*q).avail.flags as *const u16 as *const u8, 64); } + let sleep_end: u64; + unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_end, options(nomem, nostack)); } + let slept = sleep_end.saturating_sub(sleep_start); + GPU_SLEEP_TICKS.fetch_add(slept, Ordering::Relaxed); + GPU_SLEEP_TICKS_PHASES.fetch_add(slept, Ordering::Relaxed); + + // Read used ring — GPU must be done by now + unsafe { + let q = &raw const PCI_CTRL_QUEUE; + let used_addr = &(*q).used as *const _ as usize; + core::arch::asm!("dc civac, {}", in(reg) used_addr, options(nostack)); + core::arch::asm!("dsb sy", options(nostack, preserves_flags)); + } fence(Ordering::Acquire); let used_idx = unsafe { let q = &raw const PCI_CTRL_QUEUE; read_volatile(&(*q).used.idx) }; - if used_idx != state.last_used_idx { - // Read used ring entry - used_len = unsafe { - let q = &raw const PCI_CTRL_QUEUE; - let elem_idx = (state.last_used_idx % 16) as usize; - read_volatile(&(*q).used.ring[elem_idx].len) - }; - state.last_used_idx = used_idx; - break; - } - timeout -= 1; - if timeout == 0 { - return Err("GPU PCI 3-desc command timeout"); + if used_idx == state.last_used_idx { + return Err("GPU PCI 3-desc command timeout (MSI-X wake but no completion)"); } - if use_msi { + used_len = unsafe { + let q = &raw const PCI_CTRL_QUEUE; + let elem_idx = (state.last_used_idx % 16) as usize; + read_volatile(&(*q).used.ring[elem_idx].len) + }; + state.last_used_idx = used_idx; + } else { + // Polling fallback: spin then block with timer wake. 
+ let mut timeout = 10_000_000u32; + let mut spin_count = 0u32; + loop { #[cfg(target_arch = "aarch64")] - unsafe { core::arch::asm!("wfi", options(nomem, nostack)); } - #[cfg(not(target_arch = "aarch64"))] - core::hint::spin_loop(); - } else { - core::hint::spin_loop(); + unsafe { + let q = &raw const PCI_CTRL_QUEUE; + let used_addr = &(*q).used as *const _ as usize; + core::arch::asm!("dc civac, {}", in(reg) used_addr, options(nostack)); + core::arch::asm!("dsb sy", options(nostack, preserves_flags)); + } + + fence(Ordering::Acquire); + let used_idx = unsafe { + let q = &raw const PCI_CTRL_QUEUE; + read_volatile(&(*q).used.idx) + }; + if used_idx != state.last_used_idx { + used_len = unsafe { + let q = &raw const PCI_CTRL_QUEUE; + let elem_idx = (state.last_used_idx % 16) as usize; + read_volatile(&(*q).used.ring[elem_idx].len) + }; + state.last_used_idx = used_idx; + break; + } + timeout -= 1; + if timeout == 0 { + return Err("GPU PCI 3-desc command timeout"); + } + spin_count += 1; + if can_yield && spin_count == 5_000 { + #[cfg(target_arch = "aarch64")] + { + let sleep_start: u64; + unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_start, options(nomem, nostack)); } + + let (s, n) = crate::time::get_monotonic_time_ns(); + let now_ns = (s as u64) * 1_000_000_000 + (n as u64); + let wake_ns = now_ns + 4_000_000; // 4ms timeout + + crate::task::scheduler::with_scheduler(|sched| { + sched.block_current_for_compositor(wake_ns); + }); + crate::per_cpu_aarch64::preempt_enable(); + crate::task::scheduler::yield_current(); + crate::arch_halt_with_interrupts(); + crate::task::scheduler::with_scheduler(|sched| { + if let Some(thread) = sched.current_thread_mut() { + thread.blocked_in_syscall = false; + } + }); + crate::per_cpu_aarch64::preempt_disable(); + + let sleep_end: u64; + unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) sleep_end, options(nomem, nostack)); } + let slept = sleep_end.saturating_sub(sleep_start); + 
GPU_SLEEP_TICKS.fetch_add(slept, Ordering::Relaxed); + GPU_SLEEP_TICKS_PHASES.fetch_add(slept, Ordering::Relaxed); + } + spin_count = 0; + } else { + core::hint::spin_loop(); + } } } @@ -3986,6 +4190,14 @@ pub fn virgl_composite_windows( // Note: cursor-only moves (no bg_dirty, no window_dirty) still trigger SUBMIT_3D // below to redraw the GPU cursor quad at the new position. No COMPOSITE_TEX upload needed. + // Perf: timestamp after COMPOSITE_TEX upload, before per-window uploads + #[cfg(target_arch = "aarch64")] + let t_after_bg = { + let v: u64; + unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) v, options(nomem, nostack)); } + v + }; + // ========================================================================= // Phase A2: Upload per-window GPU textures // Per-window textures pre-allocated at init, TRANSFER_TO_HOST_3D proven working. @@ -4005,7 +4217,7 @@ pub fn virgl_composite_windows( // Background + decorations from COMPOSITE_TEX, per-window content from // individual GPU textures, all in one SUBMIT_3D batch. - // Perf: timestamp before display phase + // Perf: timestamp before display phase (after all uploads) #[cfg(target_arch = "aarch64")] let t_display = { let v: u64; @@ -4027,33 +4239,46 @@ pub fn virgl_composite_windows( crate::serial_println!("[composite-win] frame={} complete", frame); } - // Performance summary every 500 frames + // Performance summary every 500 WORK frames (frames that actually composited) + // Uses a separate counter from the main frame counter since many calls hit early return. 
#[cfg(target_arch = "aarch64")] { use core::sync::atomic::AtomicU64; - static PERF_COMPOSE_TICKS: AtomicU64 = AtomicU64::new(0); - static PERF_DISPLAY_TICKS: AtomicU64 = AtomicU64::new(0); + static WORK_FRAME: AtomicU32 = AtomicU32::new(0); + static PERF_BG_UPLOAD_TICKS: AtomicU64 = AtomicU64::new(0); + static PERF_WIN_UPLOAD_TICKS: AtomicU64 = AtomicU64::new(0); + static PERF_SUBMIT_TICKS: AtomicU64 = AtomicU64::new(0); static PERF_TOTAL_TICKS: AtomicU64 = AtomicU64::new(0); - let compose = t_display.saturating_sub(t_start); - let display = t_end.saturating_sub(t_display); + let wf = WORK_FRAME.fetch_add(1, Ordering::Relaxed); + + let bg_upload = t_after_bg.saturating_sub(t_start); + let win_upload = t_display.saturating_sub(t_after_bg); + let submit = t_end.saturating_sub(t_display); let total = t_end.saturating_sub(t_start); - PERF_COMPOSE_TICKS.fetch_add(compose, Ordering::Relaxed); - PERF_DISPLAY_TICKS.fetch_add(display, Ordering::Relaxed); + PERF_BG_UPLOAD_TICKS.fetch_add(bg_upload, Ordering::Relaxed); + PERF_WIN_UPLOAD_TICKS.fetch_add(win_upload, Ordering::Relaxed); + PERF_SUBMIT_TICKS.fetch_add(submit, Ordering::Relaxed); PERF_TOTAL_TICKS.fetch_add(total, Ordering::Relaxed); - if frame > 0 && frame % 500 == 0 { + if wf > 0 && wf % 500 == 0 { let freq: u64; unsafe { core::arch::asm!("mrs {}, cntfrq_el0", out(reg) freq, options(nomem, nostack)); } let to_us = |ticks: u64| -> u64 { ticks * 1_000_000 / freq / 500 }; - let avg_compose = to_us(PERF_COMPOSE_TICKS.swap(0, Ordering::Relaxed)); - let avg_display = to_us(PERF_DISPLAY_TICKS.swap(0, Ordering::Relaxed)); - let avg_total = to_us(PERF_TOTAL_TICKS.swap(0, Ordering::Relaxed)); + let bg_us = to_us(PERF_BG_UPLOAD_TICKS.swap(0, Ordering::Relaxed)); + let win_us = to_us(PERF_WIN_UPLOAD_TICKS.swap(0, Ordering::Relaxed)); + let submit_wall_us = to_us(PERF_SUBMIT_TICKS.swap(0, Ordering::Relaxed)); + let sleep_us = to_us(GPU_SLEEP_TICKS_PHASES.swap(0, Ordering::Relaxed)); + let submit_cpu_us = 
submit_wall_us.saturating_sub(sleep_us); + let total_us = to_us(PERF_TOTAL_TICKS.swap(0, Ordering::Relaxed)); + let cpu_us = total_us.saturating_sub(sleep_us); - // GPU perf counters available via GDB: PERF_COMPOSE_TICKS, PERF_DISPLAY_TICKS, PERF_TOTAL_TICKS - let _ = (avg_compose, avg_display, avg_total); + crate::serial_println!( + "[gpu-phases] 500f: bg={}us win={}us submit_wall={}us submit_cpu={}us total_cpu={}us sleep={}us", + bg_us, win_us, submit_wall_us, submit_cpu_us, cpu_us, sleep_us, + ); } } diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs index 54bda828..9c1311a6 100644 --- a/kernel/src/syscall/graphics.rs +++ b/kernel/src/syscall/graphics.rs @@ -57,6 +57,18 @@ static COMPOSITOR_LAST_MOUSE: core::sync::atomic::AtomicU64 = core::sync::atomic #[cfg(target_arch = "aarch64")] static COMPOSITOR_DIRTY_WAKE: core::sync::atomic::AtomicBool = core::sync::atomic::AtomicBool::new(false); +/// Timestamp (ns) of the last compositor_wait return. +/// Used to enforce a minimum inter-frame interval so the compositor doesn't +/// saturate the CPU when GPU wake is fast (e.g., MSI-X interrupt-driven). +#[cfg(target_arch = "aarch64")] +static COMPOSITOR_LAST_WAKE_NS: core::sync::atomic::AtomicU64 = core::sync::atomic::AtomicU64::new(0); + +/// Minimum nanoseconds between compositor_wait returns. +/// 5ms = 200 FPS cap — smooth enough for all use cases while preventing +/// the compositor from running flat-out when events arrive continuously. +#[cfg(target_arch = "aarch64")] +const MIN_FRAME_INTERVAL_NS: u64 = 5_000_000; + /// Wake the compositor thread if it's blocked in compositor_wait (op=23). /// Called from input interrupt handlers (mouse, keyboard) to provide low-latency /// input response without polling. @@ -176,6 +188,8 @@ struct WindowBuffer { /// Window position (set by compositor) x: i32, y: i32, + /// Z-order (0 = bottom, higher = closer to viewer). Set by compositor. 
+ z_order: u32, /// VirGL TEXTURE_2D resource ID (0 = not initialized) virgl_resource_id: u32, /// Whether VirGL texture has been created + backed + primed @@ -267,6 +281,7 @@ impl WindowRegistry { title_len: 0, x: 0, y: 0, + z_order: 0, virgl_resource_id: 0, virgl_initialized: false, generation: 0, @@ -856,19 +871,68 @@ fn handle_virgl_op(cmd: &FbDrawCmd) -> SyscallResult { if desc_ptr == 0 || desc_ptr >= USER_SPACE_MAX { return SyscallResult::Err(super::ErrorCode::Fault as u64); } + + // ksyscall-perf: measure composite time and report CPU% every 500 frames + #[cfg(target_arch = "aarch64")] + { + let t0: u64; + unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) t0, options(nomem, nostack)); } + + let result = handle_composite_windows(desc_ptr); + + use core::sync::atomic::{AtomicU64, AtomicU32}; + static PERF_FRAME: AtomicU32 = AtomicU32::new(0); + static PERF_GPU_TICKS: AtomicU64 = AtomicU64::new(0); + static PERF_EPOCH: AtomicU64 = AtomicU64::new(0); + + let t1: u64; + unsafe { core::arch::asm!("mrs {}, cntvct_el0", out(reg) t1, options(nomem, nostack)); } + + let frame = PERF_FRAME.fetch_add(1, core::sync::atomic::Ordering::Relaxed); + PERF_GPU_TICKS.fetch_add(t1.saturating_sub(t0), core::sync::atomic::Ordering::Relaxed); + + if frame == 0 { + PERF_EPOCH.store(t0, core::sync::atomic::Ordering::Relaxed); + } + + if frame > 0 && (frame + 1) % 500 == 0 { + let freq: u64; + unsafe { core::arch::asm!("mrs {}, cntfrq_el0", out(reg) freq, options(nomem, nostack)); } + let gpu_ticks = PERF_GPU_TICKS.swap(0, core::sync::atomic::Ordering::Relaxed); + let sleep_ticks = crate::drivers::virtio::gpu_pci::take_gpu_sleep_ticks(); + let cpu_ticks = gpu_ticks.saturating_sub(sleep_ticks); + let epoch = PERF_EPOCH.swap(t1, core::sync::atomic::Ordering::Relaxed); + let wall_ticks = t1.saturating_sub(epoch); + + let wall_us = gpu_ticks * 1_000_000 / freq / 500; + let sleep_us = sleep_ticks * 1_000_000 / freq / 500; + let cpu_us = cpu_ticks * 1_000_000 / freq / 500; + let 
busy_pct = if wall_ticks > 0 { cpu_ticks * 100 / wall_ticks } else { 0 }; + + crate::serial_println!( + "[ksyscall-perf] 500f: wall={}us sleep={}us cpu={}us busy={}%", + wall_us, sleep_us, cpu_us, busy_pct, + ); + } + + result + } + #[cfg(not(target_arch = "aarch64"))] handle_composite_windows(desc_ptr) } 17 => { - // SetWindowPosition: set window position for compositor - // p1=buffer_id, p2=x (i16 low) | y (i16 high) + // SetWindowPosition: set window position + z-order for compositor + // p1=buffer_id, p2=x (i16 low) | y (i16 high), p3=z_order let buffer_id = cmd.p1 as u32; let x = (cmd.p2 & 0xFFFF) as i16 as i32; let y = ((cmd.p2 >> 16) & 0xFFFF) as i16 as i32; + let z_order = cmd.p3 as u32; let mut reg = WINDOW_REGISTRY.lock(); match reg.find_mut(buffer_id) { Some(buf) => { buf.x = x; buf.y = y; + buf.z_order = z_order; SyscallResult::Ok(0) } None => SyscallResult::Err(super::ErrorCode::InvalidArgument as u64), @@ -1107,6 +1171,54 @@ fn handle_compositor_wait(cmd: &FbDrawCmd) -> SyscallResult { let timeout_ms = cmd.p1 as u32; let last_registry_gen = cmd.p2 as u32 as u64; + // Frame pacing: enforce minimum inter-frame interval. + // Without this, MSI-X interrupt-driven GPU wake causes the compositor to + // run flat-out (~200+ FPS), saturating the CPU. By sleeping until the + // minimum interval has elapsed, we cap effective FPS while keeping + // latency low for input events (mouse/keyboard still wake immediately). + // + // IMPORTANT: This uses a plain timer block, NOT block_current_for_compositor. + // We must NOT set COMPOSITOR_WAITING_THREAD here because mark_window_dirty + // would wake us early and consume the dirty signal, causing the main + // blocking section to re-block and wait for the full 16ms timeout. 
+ let (s, n) = crate::time::get_monotonic_time_ns(); + let now_ns = (s as u64) * 1_000_000_000 + (n as u64); + let last_wake = COMPOSITOR_LAST_WAKE_NS.load(Ordering::Relaxed); + if last_wake != 0 { + let earliest_return = last_wake + MIN_FRAME_INTERVAL_NS; + if now_ns < earliest_return { + crate::task::scheduler::with_scheduler(|sched| { + sched.block_current_for_timer(earliest_return); + }); + + #[cfg(target_arch = "aarch64")] + crate::per_cpu_aarch64::preempt_enable(); + + loop { + let still_blocked = crate::task::scheduler::with_scheduler(|sched| { + sched.wake_expired_timers(); + sched.current_thread_mut() + .map(|t| t.state == crate::task::thread::ThreadState::BlockedOnTimer) + .unwrap_or(false) + }); + if !still_blocked.unwrap_or(false) { break; } + crate::task::scheduler::yield_current(); + crate::arch_halt_with_interrupts(); + } + + crate::task::scheduler::with_scheduler(|sched| { + if let Some(thread) = sched.current_thread_mut() { + thread.blocked_in_syscall = false; + } + }); + + #[cfg(target_arch = "aarch64")] + crate::per_cpu_aarch64::preempt_disable(); + #[cfg(target_arch = "aarch64")] + ensure_current_address_space(); + } + } + // Pack current mouse state for comparison let (mx, my, mb) = crate::drivers::usb::hid::mouse_state(); let mouse_packed = ((mx as u64) << 32) | ((my as u64) << 16) | (mb as u64); @@ -1126,10 +1238,16 @@ fn handle_compositor_wait(cmd: &FbDrawCmd) -> SyscallResult { ready |= 4; } - // If mouse or registry changed, return immediately (don't check dirty — BWM - // will do its own per-window dirty check via check_window_dirty). + // Bit 0: dirty window signal pending (may have arrived during frame pacing sleep) + if COMPOSITOR_DIRTY_WAKE.swap(false, Ordering::Relaxed) { + ready |= 1; + } + + // If anything is ready, return immediately. 
if ready != 0 { COMPOSITOR_LAST_MOUSE.store(mouse_packed, Ordering::Relaxed); + let (ws, wn) = crate::time::get_monotonic_time_ns(); + COMPOSITOR_LAST_WAKE_NS.store((ws as u64) * 1_000_000_000 + (wn as u64), Ordering::Relaxed); return SyscallResult::Ok(ready | ((cur_reg_gen & 0x00FF_FFFF) << 8)); } @@ -1199,6 +1317,9 @@ fn handle_compositor_wait(cmd: &FbDrawCmd) -> SyscallResult { COMPOSITOR_LAST_MOUSE.store(mouse_packed2, Ordering::Relaxed); + let (ws2, wn2) = crate::time::get_monotonic_time_ns(); + COMPOSITOR_LAST_WAKE_NS.store((ws2 as u64) * 1_000_000_000 + (wn2 as u64), Ordering::Relaxed); + SyscallResult::Ok(ready_after | ((cur_reg_gen2 & 0x00FF_FFFF) << 8)) } @@ -1340,6 +1461,7 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult { height: buf.height, x: buf.x, y: buf.y, + z_order: buf.z_order, dirty, page_phys_addrs: buf.page_phys_addrs.clone(), size: buf.size, @@ -1356,6 +1478,8 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult { win_idx += 1; } } + // Sort by z_order so GPU draws back-to-front (lower z = drawn first). + result.sort_unstable_by_key(|w| w.z_order); result }; @@ -1420,6 +1544,7 @@ pub struct WindowCompositeInfo { pub height: u32, pub x: i32, pub y: i32, + pub z_order: u32, pub dirty: bool, pub page_phys_addrs: alloc::vec::Vec, pub size: usize, diff --git a/kernel/src/task/scheduler.rs b/kernel/src/task/scheduler.rs index ad3dfe4b..6883b63a 100644 --- a/kernel/src/task/scheduler.rs +++ b/kernel/src/task/scheduler.rs @@ -514,23 +514,28 @@ impl Scheduler { // Check the state and determine what to do let (is_terminated, is_blocked) = if let Some(current) = self.get_thread_mut(current_id) { - // Charge elapsed CPU ticks to the outgoing thread - let now = crate::time::get_ticks(); - current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks); - // Reset run_start_ticks so that if no context switch happens - // (function returns None), the next call won't double-count. 
- current.run_start_ticks = now; - let was_terminated = current.state == ThreadState::Terminated; // Check for any blocked state let was_blocked = current.state == ThreadState::Blocked || current.state == ThreadState::BlockedOnSignal || current.state == ThreadState::BlockedOnChildExit || current.state == ThreadState::BlockedOnTimer; - // Only set to Ready if not terminated AND not blocked - if !was_terminated && !was_blocked { + + // Charge elapsed CPU ticks to the outgoing thread, but ONLY + // if it was actually running. Blocked threads already had + // their ticks charged at block time — charging again here + // would count blocked/sleeping time as CPU usage. + if !was_blocked && !was_terminated { + let now = crate::time::get_ticks(); + current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks); + current.run_start_ticks = now; current.set_ready(); + } else { + // Reset run_start_ticks so the next dispatch doesn't + // charge stale time from the blocked period. + current.run_start_ticks = crate::time::get_ticks(); } + (was_terminated, was_blocked) } else { (true, false) @@ -712,21 +717,22 @@ impl Scheduler { if current_id != self.cpu_state[Self::current_cpu_id()].idle_thread { let (is_terminated, is_blocked) = if let Some(current) = self.get_thread_mut(current_id) { - // Charge elapsed CPU ticks to the outgoing thread - let now = crate::time::get_ticks(); - current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks); - // Reset run_start_ticks so that if no context switch happens - // (function returns None), the next call won't double-count. 
- current.run_start_ticks = now; - let was_terminated = current.state == ThreadState::Terminated; let was_blocked = current.state == ThreadState::Blocked || current.state == ThreadState::BlockedOnSignal || current.state == ThreadState::BlockedOnChildExit || current.state == ThreadState::BlockedOnTimer; - if !was_terminated && !was_blocked { + + // Only charge CPU ticks if thread was actually running + if !was_blocked && !was_terminated { + let now = crate::time::get_ticks(); + current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks); + current.run_start_ticks = now; current.set_ready(); + } else { + current.run_start_ticks = crate::time::get_ticks(); } + (was_terminated, was_blocked) } else { (true, false) @@ -887,6 +893,11 @@ impl Scheduler { #[allow(dead_code)] pub fn block_current(&mut self) { if let Some(current) = self.current_thread_mut() { + // Charge elapsed CPU ticks before blocking + let now = crate::time::get_ticks(); + current.cpu_ticks_total += now.wrapping_sub(current.run_start_ticks); + current.run_start_ticks = now; + current.set_blocked(); } } @@ -1010,6 +1021,11 @@ impl Scheduler { ) { if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread { if let Some(thread) = self.get_thread_mut(current_id) { + // Charge elapsed CPU ticks before blocking + let now = crate::time::get_ticks(); + thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks); + thread.run_start_ticks = now; + // CRITICAL: Save userspace context FIRST, THEN set state. // This ensures that when unblock_for_signal() is called, // the context is already saved and ready for signal delivery. 
@@ -1133,6 +1149,11 @@ impl Scheduler { pub fn block_current_for_child_exit(&mut self) { if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread { if let Some(thread) = self.get_thread_mut(current_id) { + // Charge elapsed CPU ticks before blocking + let now = crate::time::get_ticks(); + thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks); + thread.run_start_ticks = now; + thread.state = ThreadState::BlockedOnChildExit; // CRITICAL: Mark that this thread is blocked inside a syscall. // When the thread is resumed, we must NOT restore userspace context @@ -1199,6 +1220,11 @@ impl Scheduler { pub fn block_current_for_timer(&mut self, wake_time_ns: u64) { if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread { if let Some(thread) = self.get_thread_mut(current_id) { + // Charge elapsed CPU ticks before blocking + let now = crate::time::get_ticks(); + thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks); + thread.run_start_ticks = now; + thread.state = ThreadState::BlockedOnTimer; thread.wake_time_ns = Some(wake_time_ns); thread.blocked_in_syscall = true; @@ -1216,6 +1242,13 @@ impl Scheduler { pub fn block_current_for_compositor(&mut self, timeout_ns: u64) { if let Some(current_id) = self.cpu_state[Self::current_cpu_id()].current_thread { if let Some(thread) = self.get_thread_mut(current_id) { + // Charge elapsed CPU ticks NOW, before blocking. Otherwise the + // next schedule() call charges all time since last dispatch — + // including blocked/sleeping time — as CPU usage. 
+ let now = crate::time::get_ticks(); + thread.cpu_ticks_total += now.wrapping_sub(thread.run_start_ticks); + thread.run_start_ticks = now; + thread.state = ThreadState::BlockedOnTimer; thread.wake_time_ns = Some(timeout_ns); thread.blocked_in_syscall = true; diff --git a/libs/libbreenix/src/graphics.rs b/libs/libbreenix/src/graphics.rs index c350f0d8..8478540c 100644 --- a/libs/libbreenix/src/graphics.rs +++ b/libs/libbreenix/src/graphics.rs @@ -567,13 +567,13 @@ pub fn mark_window_dirty(buffer_id: u32) -> Result<(), Error> { /// /// Tells the kernel where to place this window during compositing. /// If position is never set, windows are auto-positioned. -pub fn set_window_position(buffer_id: u32, x: i32, y: i32) -> Result<(), Error> { +pub fn set_window_position(buffer_id: u32, x: i32, y: i32, z_order: u32) -> Result<(), Error> { let packed_xy = ((x as u16 as u32) | ((y as u16 as u32) << 16)) as i32; let cmd = FbDrawCmd { op: draw_op::SET_WINDOW_POSITION, p1: buffer_id as i32, p2: packed_xy, - p3: 0, + p3: z_order as i32, p4: 0, color: 0, }; diff --git a/userspace/programs/src/bwm.rs b/userspace/programs/src/bwm.rs index 37d793a9..15efca25 100644 --- a/userspace/programs/src/bwm.rs +++ b/userspace/programs/src/bwm.rs @@ -52,7 +52,6 @@ const TITLE_TEXT: Color = Color::rgb(160, 165, 175); const TITLE_FOCUSED_TEXT: Color = Color::rgb(255, 255, 255); const WIN_BORDER_COLOR: Color = Color::rgb(50, 55, 70); const WIN_BORDER_FOCUSED: Color = Color::rgb(60, 130, 255); -const CONTENT_BG: Color = Color::rgb(20, 25, 40); // Taskbar/Appbar colors const TASKBAR_BG: Color = Color::rgb(20, 22, 30); @@ -244,15 +243,7 @@ impl Window { // ─── Drawing Helpers ───────────────────────────────────────────────────────── fn fill_rect(fb: &mut FrameBuf, x: i32, y: i32, w: usize, h: usize, color: Color) { - for dy in 0..h as i32 { - let py = y + dy; - if py < 0 || py >= fb.height as i32 { continue; } - for dx in 0..w as i32 { - let px = x + dx; - if px < 0 || px >= fb.width as i32 { 
continue; } - fb.put_pixel(px as usize, py as usize, color); - } - } + libgfx::shapes::fill_rect(fb, x, y, w as i32, h as i32, color); } fn draw_text_at(fb: &mut FrameBuf, text: &[u8], x: i32, y: i32, color: Color) { @@ -298,8 +289,7 @@ fn draw_window_frame(fb: &mut FrameBuf, win: &Window, focused: bool) { let my = mby + (mbh as i32 - CELL_H as i32) / 2; draw_text_at(fb, b"-", mx, my, MINIMIZE_BTN_TEXT); - fill_rect(fb, win.content_x(), win.content_y(), - win.content_width(), win.content_height(), CONTENT_BG); + // Content area NOT filled here — GPU composites per-window textures over it. } /// Paint the decorative desktop background — gradient with grid @@ -604,10 +594,13 @@ fn discover_windows(windows: &mut Vec, screen_w: usize, screen_h: usize, core::str::from_utf8(&title[..title_len]).unwrap_or("?"), info.buffer_id, info.width, info.height, cascade_x, cascade_y); - // Tell kernel where the client content goes on screen (for GPU compositing) + // Tell kernel where the client content goes on screen (for GPU compositing). + // z_order = index in windows vec (0 = bottom). New windows are pushed to + // the end, so they get the highest z_order. let content_x = cascade_x + BORDER_WIDTH as i32; let content_y = cascade_y + TITLE_BAR_HEIGHT as i32 + BORDER_WIDTH as i32; - let _ = graphics::set_window_position(info.buffer_id, content_x, content_y); + let z_order = windows.len() as u32; // will be at this index after push + let _ = graphics::set_window_position(info.buffer_id, content_x, content_y, z_order); let order = *next_order; *next_order += 1; @@ -624,6 +617,17 @@ fn discover_windows(windows: &mut Vec, screen_w: usize, screen_h: usize, removed || added } +/// Update kernel z-order for all windows. Called after any z-order change +/// (raise-to-front, new window, etc.) so the GPU compositor draws quads +/// in correct back-to-front order. 
+fn update_kernel_z_order(windows: &[Window]) { + for (i, win) in windows.iter().enumerate() { + if win.window_id != 0 { + let _ = graphics::set_window_position(win.window_id, win.content_x(), win.content_y(), i as u32); + } + } +} + /// Redraw all windows in z-order (index 0 = bottom), plus taskbar and app bar. /// Window frames and decorations go into the compositor buffer; GPU compositing /// handles client content via per-window textured quads. @@ -845,6 +849,7 @@ fn main() { let mut last_clock_sec: i64 = -1; let mut clock_text = [0u8; 11]; format_clock(0, &mut clock_text); + let mut frame_counter: u32 = 0; let mut next_creation_order: u32 = 0; // Initial composite @@ -882,6 +887,7 @@ fn main() { // New windows are pushed to end of Vec (top of z-order). // Always focus the topmost visible window so appbar selection // matches the visually foregrounded window. + update_kernel_z_order(&windows); focused_win = next_visible_window(&windows, 0); compose_full_redraw(composite_buf, &mut fb, &mut shadow_fb, &bg_cache, &windows, focused_win, &clock_text); full_redraw = true; @@ -961,7 +967,7 @@ fn main() { if windows[win_idx].window_id != 0 { let cx = windows[win_idx].content_x(); let cy = windows[win_idx].content_y(); - let _ = graphics::set_window_position(windows[win_idx].window_id, cx, cy); + let _ = graphics::set_window_position(windows[win_idx].window_id, cx, cy, win_idx as u32); } // Dirty region = union of old and new bounds let (nx0, ny0, nx1, ny1) = windows[win_idx].bounds(); @@ -1028,6 +1034,7 @@ fn main() { if idx < windows.len() - 1 { let win = windows.remove(idx); windows.push(win); + update_kernel_z_order(&windows); } let top = windows.len() - 1; if top != focused_win { @@ -1057,13 +1064,16 @@ fn main() { } } if let Some(ci) = clicked_idx { - if ci < windows.len() - 1 { + let z_changed = ci < windows.len() - 1; + if z_changed { let win = windows.remove(ci); windows.push(win); + update_kernel_z_order(&windows); } let top = windows.len() - 1; + let 
focus_changed = top != focused_win; - if top != focused_win { + if focus_changed { send_focus_event(&windows, focused_win, input_event_type::FOCUS_LOST); focused_win = top; send_focus_event(&windows, focused_win, input_event_type::FOCUS_GAINED); @@ -1102,8 +1112,9 @@ fn main() { route_mouse_button_to_focused(&windows, focused_win, 1, true, local_x, local_y); } - // Full redraw for z-order change (unless minimize already did it) - if !full_redraw { + // Full redraw for z-order or focus change (unless minimize + // already did it, or nothing visual changed) + if !full_redraw && (z_changed || focus_changed) { compose_full_redraw(composite_buf, &mut fb, &mut shadow_fb, &bg_cache, &windows, focused_win, &clock_text); full_redraw = true; } @@ -1124,17 +1135,20 @@ fn main() { } // ── 5b. Update clock (once per second) ── - if let Ok(ts) = libbreenix::time::now_realtime() { - if ts.tv_sec != last_clock_sec { - last_clock_sec = ts.tv_sec; - format_clock(ts.tv_sec, &mut clock_text); - draw_taskbar(&mut fb, &clock_text); - // Expand dirty rect to cover taskbar - dirty_x0 = 0; - dirty_y0 = 0; - dirty_x1 = dirty_x1.max(screen_w as i32); - dirty_y1 = dirty_y1.max(TASKBAR_HEIGHT as i32); - content_dirty = true; + // Only check realtime every 30 frames (~6-7 checks/sec at 200 FPS) + frame_counter = frame_counter.wrapping_add(1); + if frame_counter % 30 == 0 { + if let Ok(ts) = libbreenix::time::now_realtime() { + if ts.tv_sec != last_clock_sec { + last_clock_sec = ts.tv_sec; + format_clock(ts.tv_sec, &mut clock_text); + draw_taskbar(&mut fb, &clock_text); + dirty_x0 = 0; + dirty_y0 = 0; + dirty_x1 = dirty_x1.max(screen_w as i32); + dirty_y1 = dirty_y1.max(TASKBAR_HEIGHT as i32); + content_dirty = true; + } } } @@ -1176,6 +1190,5 @@ fn main() { windows_dirty = false; } // No sleep — compositor_wait handles blocking - } }