diff --git a/build.rs b/build.rs index f35372b0..edc9f637 100644 --- a/build.rs +++ b/build.rs @@ -26,36 +26,26 @@ fn main() { boot_config.frame_buffer.minimum_framebuffer_height = Some(fb_height); disk_builder.set_boot_config(&boot_config); - println!("cargo:warning=Configured framebuffer: {}x{}", fb_width, fb_height); - // specify output paths let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); let uefi_path = out_dir.join("breenix-uefi.img"); let bios_path = out_dir.join("breenix-bios.img"); // Only create the UEFI image by default. BIOS image can be enabled via env var. - println!("cargo:warning=Creating UEFI disk image at {}", uefi_path.display()); disk_builder .create_uefi_image(&uefi_path) .expect("failed to create UEFI disk image"); let build_bios = env::var("BREENIX_BUILD_BIOS").is_ok(); if build_bios { - println!( - "cargo:warning=BREENIX_BUILD_BIOS set; creating BIOS disk image at {}", - bios_path.display() - ); // New bootloader API removed BIOS builder; use UEFI image as placeholder to keep API surface stable. // If BIOS support is needed, switch to a branch that still exposes create_bios_image or vendor our own. 
- println!("cargo:warning=bootloader no longer provides create_bios_image; duplicating UEFI image for BIOS placeholder"); disk_builder .create_uefi_image(&bios_path) .expect("failed to create BIOS placeholder image"); - } else { - println!("cargo:warning=Skipping BIOS image creation (BREENIX_BUILD_BIOS not set)"); } // pass the disk image paths via environment variables println!("cargo:rustc-env=UEFI_IMAGE={}", uefi_path.display()); println!("cargo:rustc-env=BIOS_IMAGE={}", bios_path.display()); -} \ No newline at end of file +} diff --git a/docs/planning/PCI_MSI_NETWORKING_PLAN.md b/docs/planning/PCI_MSI_NETWORKING_PLAN.md new file mode 100644 index 00000000..06398bc6 --- /dev/null +++ b/docs/planning/PCI_MSI_NETWORKING_PLAN.md @@ -0,0 +1,267 @@ +# PCI MSI Interrupt-Driven Networking + +## Problem + +ARM64 network drivers (VirtIO net PCI on Parallels, e1000 on VMware) rely on +timer-based polling at 100Hz (every 10ms). This adds 5-10ms latency per +network round-trip, which compounds across DNS, TCP handshake, and HTTP +response phases. On x86, the e1000 has a proper IRQ 11 handler that processes +packets immediately via softirq. + +## Goal + +Replace timer-based polling with interrupt-driven packet processing on ARM64, +achieving sub-millisecond packet delivery latency. 
+ +--- + +## Phase 1: VirtIO Net PCI MSI on Parallels (Priority: Immediate) + +### Why This Is Easy + +All infrastructure already exists and is proven working: +- **GIC driver** (`gic.rs`): `enable_spi()`, `disable_spi()`, + `configure_spi_edge_triggered()`, `clear_spi_pending()` — all present +- **PCI driver** (`pci.rs`): `find_msi_capability()`, `configure_msi()`, + `disable_intx()` — all present +- **GICv2m MSI** (`platform_config.rs`): `probe_gicv2m()`, + `allocate_msi_spi()` — already used by xHCI and GPU PCI drivers on Parallels +- **net_pci.rs** already has `handle_interrupt()` (line 552) that reads ISR + and raises NetRx softirq — it's just never called from the interrupt path + +### Files to Modify + +#### 1. `kernel/src/drivers/virtio/net_pci.rs` + +Add MSI setup following the exact pattern from `xhci.rs:setup_xhci_msi()`: + +```rust +static NET_PCI_IRQ: AtomicU32 = AtomicU32::new(0); + +pub fn get_irq() -> Option { + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq != 0 { Some(irq) } else { None } +} + +fn setup_net_pci_msi(pci_dev: &pci::Device) -> Option { + // 1. Find MSI capability (cap ID 0x05) + let cap_offset = pci_dev.find_msi_capability()?; + // 2. Probe GICv2m (already probed by xHCI, returns cached value) + let gicv2m_base = platform_config::gicv2m_base_phys()?; + // 3. Allocate SPI from GICv2m pool + let spi = platform_config::allocate_msi_spi()?; + // 4. Program MSI: address = GICv2m doorbell, data = SPI number + pci_dev.configure_msi(cap_offset, gicv2m_base + 0x40, spi); + // 5. Disable INTx (MSI replaces it) + pci_dev.disable_intx(); + // 6. Configure GIC: edge-triggered, enable SPI + gic::configure_spi_edge_triggered(spi); + gic::enable_spi(spi); + Some(spi) +} +``` + +In `init()`, after device setup: call `setup_net_pci_msi()`, store result in +`NET_PCI_IRQ`. 
+ +Update `handle_interrupt()` with disable/clear/ack/enable SPI pattern (matching +the xHCI and GPU handlers): + +```rust +pub fn handle_interrupt() { + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq != 0 { + gic::disable_spi(irq); + gic::clear_spi_pending(irq); + } + // Read ISR status register (existing code — auto-acks on read for legacy VirtIO) + // Raise NetRx softirq (existing code) + if irq != 0 { + gic::enable_spi(irq); + } +} +``` + +#### 2. `kernel/src/arch_impl/aarch64/exception.rs` + +Add dispatch entry in the SPI match arm (32..=1019), alongside existing GPU +PCI handler: + +```rust +if let Some(net_pci_irq) = crate::drivers::virtio::net_pci::get_irq() { + if irq_id == net_pci_irq { + crate::drivers::virtio::net_pci::handle_interrupt(); + } +} +``` + +#### 3. `kernel/src/arch_impl/aarch64/timer_interrupt.rs` + +Conditionalize polling — only poll when no MSI IRQ is configured: + +```rust +if !crate::drivers::virtio::net_pci::get_irq().is_some() + && (net_pci::is_initialized() || e1000::is_initialized()) + && _count % 10 == 0 +{ + raise_softirq(SoftirqType::NetRx); +} +``` + +### Verification + +- DNS resolution should complete in <200ms (was 4-5 seconds) +- HTTP fetch should complete in <2 seconds (was 10 seconds) +- `cat /proc/interrupts` or trace counters should show NIC interrupts firing + +--- + +## Phase 2: E1000 MSI on VMware (Priority: Next) + +VMware Fusion uses GICv3 with ITS (Interrupt Translation Service), not GICv2m. +This is a different MSI delivery mechanism. + +### Approach A: GICv3 ITS (Correct, Complex) + +The ITS provides MSI translation for GICv3 systems: + +1. **Discover ITS**: Parse ACPI MADT for ITS entry, or scan GIC redistributor + space. ITS is typically at a well-known address (e.g., 0x0801_0000 on + VMware virt). + +2. **Initialize ITS**: + - Allocate command queue (4KB aligned, mapped uncacheable) + - Allocate device table and collection table + - Enable ITS via GITS_CTLR + +3. 
**Per-device setup**: + - `MAPD` command: map device ID to interrupt table + - `MAPTI` command: map event ID to LPI (physical interrupt) + - `MAPI` command: map interrupt to collection (target CPU) + - `INV` command: invalidate cached translation + +4. **MSI configuration**: + - MSI address = `GITS_TRANSLATER` physical address + - MSI data = device-specific event ID + - Program via `pci_dev.configure_msi(cap, its_translater, event_id)` + +5. **IRQ handling**: LPIs are delivered via GICv3 ICC_IAR1_EL1, same as SPIs. + Dispatch by LPI number in exception.rs. + +**Estimated effort**: 200-400 lines of new code for ITS initialization + per-device +setup. Most complex part is the command queue protocol. + +### Approach B: INTx via ACPI _PRT (Simpler, Limited) + +Parse the ACPI DSDT for PCI interrupt routing: + +1. **Parse ACPI _PRT**: The PCI Routing Table maps (slot, pin) -> GIC SPI. + Breenix already has basic ACPI parsing for MADT/SPCR. Extend to parse + DSDT for _PRT entries. + +2. **Configure SPI**: Once the SPI number is known from _PRT, configure it as + level-triggered (INTx is level, not edge), enable in GIC. + +3. **Shared interrupt handling**: INTx lines may be shared between devices. + Handler must check each device's ISR before claiming the interrupt. + +**Estimated effort**: 100-200 lines for _PRT parsing + level-triggered handler. + +### Approach C: VMware-Specific Probe (Pragmatic) + +If VMware always maps e1000 INTx to a known SPI (discoverable from the device +tree or hardcoded for the vmware-aarch64 machine model), we could: + +1. Read `interrupt_line` from PCI config space (currently 0xFF on ARM64) +2. Use VMware's DT to find the actual SPI mapping +3. Hardcode the mapping as a platform quirk if it's stable + +**Estimated effort**: 20-50 lines, but fragile. + +### Recommendation + +Start with Approach B (_PRT parsing) since ACPI infrastructure partially exists. +Defer ITS to Phase 3 when multiple PCI devices need independent MSI vectors. 
+ +--- + +## Phase 3: Generic PCI Interrupt Framework (Priority: Future) + +### Dynamic IRQ Dispatch Table + +Replace the chain of `if let Some(irq)` in exception.rs with a registration- +based dispatch: + +```rust +static PCI_IRQ_HANDLERS: Mutex<[(u32, fn()); 16]>; + +pub fn register_pci_irq(spi: u32, handler: fn()) { ... } +``` + +This allows any PCI driver to register its own handler without modifying +exception.rs. + +### Full ITS Support + +For GICv3 platforms (VMware, newer QEMU configs, real hardware): +- ITS command queue management +- LPI configuration tables (PROPBASER, PENDBASER) +- Per-device interrupt translation +- Multi-CPU interrupt routing via collections + +### QEMU Virt INTx Mapping + +QEMU virt machine maps PCI INTx to fixed SPIs: +- INTA -> SPI 3 (GIC INTID 35) +- INTB -> SPI 4 (GIC INTID 36) +- INTC -> SPI 5 (GIC INTID 37) +- INTD -> SPI 6 (GIC INTID 38) + +With swizzling: `actual_pin = (slot + pin - 1) % 4` + +These are level-triggered and shared, requiring ISR checks per device. + +--- + +## Architecture Reference + +### Current Packet Receive Path (Polling) + +``` +Timer interrupt (1000Hz) + -> every 10th tick: raise_softirq(NetRx) + -> net_rx_softirq_handler() + -> process_rx() + -> net_pci::receive() / e1000::receive() + -> process_packet() + -> udp::enqueue_packet() / tcp::handle_segment() + -> wake blocked thread +``` + +Latency: 0-10ms (mean 5ms) per packet. + +### Target Packet Receive Path (MSI) + +``` +NIC MSI interrupt -> GIC SPI + -> exception.rs handle_irq() + -> net_pci::handle_interrupt() + -> read ISR (auto-ack) + -> raise_softirq(NetRx) + -> net_rx_softirq_handler() + -> process_rx() + -> ... (same as above) +``` + +Latency: <100us per packet (GIC + softirq overhead). + +### MSI Delivery on Parallels (GICv2m) + +``` +Device writes MSI data to GICv2m doorbell address: + addr = GICV2M_BASE + 0x40 (MSI_SETSPI_NS) + data = allocated SPI number + +GICv2m translates write to GIC SPI assertion. 
+GIC delivers SPI to target CPU via ICC_IAR1_EL1. +``` diff --git a/docs/planning/gpu-rendering-attack-plan.md b/docs/planning/gpu-rendering-attack-plan.md new file mode 100644 index 00000000..089b8ab9 --- /dev/null +++ b/docs/planning/gpu-rendering-attack-plan.md @@ -0,0 +1,164 @@ +# GPU-Only Rendering Attack Plan + +## Problem + +The current rendering pipeline wastes CPU on work the GPU should do: + +1. **BWM compositing**: CPU-blits window pixels into compositor texture row-by-row + (`blit_client_pixels`), then does TRANSFER_TO_HOST_3D to upload to GPU. Linux ftrace + proved this transfer is unnecessary: Mesa's per-frame path is just + **SUBMIT_3D -> SET_SCANOUT -> RESOURCE_FLUSH** with zero CPU transfers. + +2. **Bounce (and all Breengel clients)**: Software-renders pixels into shared memory + buffers. Bounce draws circles pixel-by-pixel on CPU. All rendering should use VirGL + GPU primitives (DRAW_VBO with shaders). + +3. **Per-window texture "limitation" was a bug**: The note "per-window VirGL textures + DON'T work" was a bug in our resource creation, not a Parallels limitation. Linux + probe VM proved multiple VirGL textures work correctly on identical hardware. + +## Target Architecture + +``` +Client (bounce, bterm, etc.) BWM Compositor + | | + | VirGL SUBMIT_3D | VirGL SUBMIT_3D + | (draw geometry into | (draw textured quads for + | per-window texture) | each window texture onto + | | compositor surface) + v v + GPU renders to GPU composites all windows + window texture -> SET_SCANOUT -> RESOURCE_FLUSH +``` + +Zero CPU pixel copying. Zero TRANSFER_TO_HOST_3D per frame. + +## Phase 1: Fix Per-Window VirGL Textures + +**Goal**: Create multiple VirGL TEXTURE_2D resources that can be rendered to and sampled from. + +### Debugging Approach (Linux-first, per proven methodology) + +1. 
On Linux probe VM, write a test program that: + - Creates 2+ RESOURCE_CREATE_3D textures (TEXTURE_2D, B8G8R8X8_UNORM) + - ATTACH_BACKING with paged scatter-gather for each + - SUBMIT_3D: render different colors into each texture (set as render target, CLEAR) + - SUBMIT_3D: sample from both textures as textured quads onto a third surface + - SET_SCANOUT + RESOURCE_FLUSH + - Verify both textures display correctly + +2. If it works on Linux (expected), capture the exact VirGL byte sequence with + virgl_intercept.c LD_PRELOAD. + +3. Port the exact bytes to Breenix. If it fails, diff against the Linux bytes to find + the resource creation/backing bug. + +### Likely Bug Candidates + +- Missing ATTACH_BACKING on new resources (paged scatter-gather required) +- Missing CTX_ATTACH_RESOURCE for new resources +- Missing "priming" TRANSFER_TO_HOST_3D (required once per resource, not per frame) +- Wrong bind flags (need RENDER_TARGET | SAMPLER_VIEW at minimum) +- Handle collisions in virglrenderer hash table (handles must be globally unique) + +### Files +- `kernel/src/drivers/virtio/gpu_pci.rs` — resource creation, backing attachment +- `kernel/src/drivers/virtio/virgl.rs` — VirGL command encoding + +## Phase 2: GPU-Based BWM Compositing + +**Goal**: BWM composites windows using GPU textured quads instead of CPU blit. + +### Architecture + +1. Each registered window gets a VirGL TEXTURE_2D resource (created once) +2. Window pixel data lives in the texture's backing pages (MAP_SHARED to client) +3. Per-frame, BWM issues one SUBMIT_3D batch: + - For each visible window: create_sampler_view on window texture, bind as FS input, + DRAW_VBO textured quad at window position + - Background quad rendered first, windows in z-order on top +4. SET_SCANOUT + RESOURCE_FLUSH (matches Linux per-frame sequence) + +### Key Change: No TRANSFER_TO_HOST_3D Per Frame + +The current pipeline does TRANSFER_TO_HOST_3D every frame to upload pixel data. 
Linux +proves this is unnecessary — the host reads directly from the GPU texture's backing +pages when rendering via SUBMIT_3D. The one-time "priming" TRANSFER_TO_HOST_3D at +resource creation is sufficient. + +### Window Dirty Tracking + +When a client calls mark_window_dirty, BWM knows to include that window in the next +SUBMIT_3D batch. Clean windows can be skipped (their texture is already on the GPU +from the previous frame). + +### Files +- `userspace/programs/src/bwm.rs` — compositor main loop, replace blit_client_pixels +- `kernel/src/syscall/graphics.rs` — window buffer syscalls, texture resource management +- `kernel/src/drivers/virtio/gpu_pci.rs` — per-window resource creation + +## Phase 3: Client-Side GPU Rendering (Bounce) + +**Goal**: Bounce renders spheres using VirGL DRAW_VBO instead of CPU pixel pushing. + +### Architecture + +1. Bounce creates its window (gets a VirGL texture resource as render target) +2. Each frame, bounce issues VirGL commands via a new syscall: + - Set window texture as render target + - CLEAR background + - For each sphere: DRAW_VBO with colored vertices (triangle fan or instanced quad + with circle fragment shader) +3. Calls mark_window_dirty to trigger BWM compositing + +### New API: Breengel GPU Drawing + +Breengel needs a GPU drawing API so clients don't need to encode raw VirGL: + +```rust +// Proposed Breengel GPU API +impl Window { + fn begin_frame(&mut self); + fn clear(&mut self, color: Color); + fn draw_circle(&mut self, cx: i32, cy: i32, radius: i32, color: Color); + fn draw_rect(&mut self, x: i32, y: i32, w: i32, h: i32, color: Color); + fn draw_text(&mut self, text: &[u8], x: i32, y: i32, color: Color); + fn end_frame(&mut self); // triggers SUBMIT_3D + mark_dirty +} +``` + +Under the hood, these accumulate VirGL commands and submit in one batch. 
+ +### Files +- `libs/breengel/src/lib.rs` — GPU drawing API +- `userspace/programs/src/bounce.rs` — convert to GPU rendering +- `kernel/src/syscall/graphics.rs` — new syscall for client SUBMIT_3D + +## Phase 4: Text Rendering on GPU + +**Goal**: bterm, bcheck, btop render text using GPU textured quads with a font atlas. + +### Architecture + +1. Upload bitmap font as a VirGL texture (one-time) +2. Each glyph = textured quad sampling from the font atlas +3. Text rendering becomes a batch of DRAW_VBO calls with texture coordinates + +This eliminates the biggest CPU cost in terminal rendering — drawing characters +pixel-by-pixel into framebuffers. + +## Verification + +Each phase should be verified independently: + +- **Phase 1**: Create 2 textures, render different colors, sample both in one frame +- **Phase 2**: BWM composites without CPU blit, no TRANSFER_TO_HOST_3D per frame +- **Phase 3**: Bounce renders at 60+ FPS with ~0% CPU (only physics simulation) +- **Phase 4**: bterm scrolls smoothly with minimal CPU + +## Priority Order + +Phase 1 (fix per-window textures) unblocks everything else. Start there. +Phase 2 (GPU compositing) gives the biggest immediate win — eliminates the CPU blit. +Phase 3 (client GPU rendering) makes bounce truly GPU-rendered. +Phase 4 (text on GPU) is the final polish for terminal/text apps. 
diff --git a/kernel/src/arch_impl/aarch64/exception.rs b/kernel/src/arch_impl/aarch64/exception.rs index 48f8cd3f..78208da1 100644 --- a/kernel/src/arch_impl/aarch64/exception.rs +++ b/kernel/src/arch_impl/aarch64/exception.rs @@ -1051,6 +1051,12 @@ pub extern "C" fn handle_irq() { crate::drivers::virtio::gpu_pci::handle_interrupt(); } } + // VirtIO network PCI interrupt dispatch (GICv2m MSI) + if let Some(net_pci_irq) = crate::drivers::virtio::net_pci::get_irq() { + if irq_id == net_pci_irq { + crate::drivers::virtio::net_pci::handle_interrupt(); + } + } } // Should not happen - GIC filters invalid IDs (1020+) diff --git a/kernel/src/arch_impl/aarch64/timer_interrupt.rs b/kernel/src/arch_impl/aarch64/timer_interrupt.rs index 314817b7..420896ee 100644 --- a/kernel/src/arch_impl/aarch64/timer_interrupt.rs +++ b/kernel/src/arch_impl/aarch64/timer_interrupt.rs @@ -280,9 +280,9 @@ pub extern "C" fn timer_interrupt_handler() { crate::drivers::usb::ehci::poll_keyboard(); // Poll XHCI USB HID events (needed when PCI interrupt routing isn't available) crate::drivers::usb::xhci::poll_hid_events(); - // Poll network RX for incoming packets (PCI INTx routing not wired up) - // Covers both VirtIO net PCI (Parallels) and e1000 (VMware) - // Poll every 10th tick (~100Hz at 1000Hz timer) for responsive networking + // Poll network RX as a safety net alongside MSI-X interrupts. + // MSI-X provides sub-ms latency; this 100Hz fallback ensures packets + // are still processed if MSI-X delivery fails for any reason. 
if (crate::drivers::virtio::net_pci::is_initialized() || crate::drivers::e1000::is_initialized()) && _count % 10 == 0 diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs index c42d0eaf..b0695b25 100644 --- a/kernel/src/drivers/virtio/gpu_pci.rs +++ b/kernel/src/drivers/virtio/gpu_pci.rs @@ -614,6 +614,50 @@ fn init_composite_texture(width: u32, height: u32) -> Result<(), &'static str> { COMPOSITE_TEX_READY.store(true, Ordering::Release); crate::serial_println!("[virgl-composite] Texture resource initialized (id={})", RESOURCE_COMPOSITE_TEX_ID); + + // ── Pre-allocate per-window texture pool ── + // Parallels requires resources to be created BEFORE the first SUBMIT_3D. + // Resources created after SUBMIT_3D has been called don't get their + // TRANSFER_TO_HOST_3D data. Pre-allocate all slots now with display-sized + // backing so they're ready when windows appear. + let pool_w = width; + let pool_h = height; + let pool_size = (pool_w as usize) * (pool_h as usize) * 4; + let mut pool_count = 0usize; + for slot in 0..MAX_WIN_TEX_SLOTS { + let res_id = RESOURCE_WIN_TEX_BASE + slot as u32; + let layout = alloc::alloc::Layout::from_size_align(pool_size, 4096) + .map_err(|_| "win texture pool: layout error")?; + let ptr = unsafe { alloc::alloc::alloc_zeroed(layout) }; + if ptr.is_null() { + crate::serial_println!("[virgl-pool] slot {} alloc failed, pool stopped at {}", slot, slot); + break; + } + + with_device_state(|state| { + virgl_resource_create_3d_cmd( + state, res_id, pipe::TEXTURE_2D, vfmt::B8G8R8X8_UNORM, + pipe::BIND_SAMPLER_VIEW | pipe::BIND_SCANOUT, + pool_w, pool_h, 1, 1, + ) + })?; + with_device_state(|state| { + virgl_attach_backing_paged(state, res_id, ptr, pool_size) + })?; + with_device_state(|state| { + virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, res_id) + })?; + dma_cache_clean(ptr, pool_size); + with_device_state(|state| { + transfer_to_host_3d(state, res_id, 0, 0, pool_w, pool_h, pool_w * 4) + })?; + + 
unsafe { WIN_TEX_BACKING[slot] = (ptr, pool_size); } + pool_count += 1; + } + crate::serial_println!("[virgl-pool] Pre-allocated {}/{} window texture slots ({}x{}, {}KB each)", + pool_count, MAX_WIN_TEX_SLOTS, pool_w, pool_h, pool_size / 1024); + Ok(()) } @@ -2227,7 +2271,7 @@ fn virgl_attach_backing_from_pages( /// Base resource ID for per-window VirGL textures. Window slot N → resource (10 + N). const RESOURCE_WIN_TEX_BASE: u32 = 10; -const MAX_WIN_TEX_SLOTS: usize = 16; +const MAX_WIN_TEX_SLOTS: usize = 8; /// Per-window contiguous backing buffers for VirGL textures. /// Parallels requires contiguous physical backing for TRANSFER_TO_HOST_3D to work. @@ -2246,81 +2290,72 @@ pub fn init_window_texture( width: u32, height: u32, _page_phys_addrs: &[u64], - total_len: usize, + _total_len: usize, ) -> Result { - use super::virgl::{format as vfmt, pipe}; if slot_index >= MAX_WIN_TEX_SLOTS { return Err("init_window_texture: slot_index out of range"); } let resource_id = RESOURCE_WIN_TEX_BASE + slot_index as u32; - crate::serial_println!( - "[virgl-win] init_window_texture: slot={}, res_id={}, {}x{}, {} bytes (contiguous backing)", - slot_index, resource_id, width, height, total_len - ); - // Allocate contiguous, page-aligned heap buffer for VirGL backing - let backing_layout = alloc::alloc::Layout::from_size_align(total_len, 4096) - .map_err(|_| "init_window_texture: invalid backing layout")?; - let backing_ptr = unsafe { alloc::alloc::alloc_zeroed(backing_layout) }; - if backing_ptr.is_null() { - return Err("init_window_texture: failed to allocate contiguous backing"); + // Pool was pre-allocated at init time (before first SUBMIT_3D). + // Just verify the slot exists and return the resource ID. 
+ let (existing_ptr, existing_len) = unsafe { WIN_TEX_BACKING[slot_index] }; + if existing_ptr.is_null() || existing_len == 0 { + return Err("init_window_texture: slot not pre-allocated"); } - unsafe { WIN_TEX_BACKING[slot_index] = (backing_ptr, total_len); } - // Create TEXTURE_2D with SAMPLER_VIEW bind - with_device_state(|state| { - virgl_resource_create_3d_cmd( - state, - resource_id, - pipe::TEXTURE_2D, - vfmt::B8G8R8X8_UNORM, - pipe::BIND_SAMPLER_VIEW, - width, height, 1, 1, - ) - })?; - - // Attach contiguous backing (same method as compositor texture — proven working) - with_device_state(|state| { - virgl_attach_backing_paged(state, resource_id, backing_ptr, total_len) - })?; - - // Attach to VirGL context - with_device_state(|state| { - virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, resource_id) - })?; - - // Prime with TRANSFER_TO_HOST_3D - dma_cache_clean(backing_ptr, total_len); - with_device_state(|state| { - transfer_to_host_3d(state, resource_id, 0, 0, width, height, width * 4) - })?; - - crate::serial_println!("[virgl-win] Window texture initialized (res_id={}, backing={:#x})", - resource_id, backing_ptr as u64); + crate::serial_println!( + "[virgl-win] init_window_texture: slot={} using pre-allocated res={} ({}x{}, backing={:#x})", + slot_index, resource_id, width, height, existing_ptr as u64 + ); Ok(resource_id) } -/// Copy window pixels from MAP_SHARED pages to the contiguous VirGL backing buffer. -/// Must be called before cache clean + TRANSFER_TO_HOST_3D. 
-#[allow(dead_code)] -fn copy_window_pages_to_backing(slot_index: usize, page_phys_addrs: &[u64], total_len: usize) { - let (backing_ptr, backing_len) = unsafe { WIN_TEX_BACKING[slot_index] }; - if backing_ptr.is_null() || backing_len == 0 { return; } - - let phys_mem_offset = crate::memory::physical_memory_offset().as_u64(); - let copy_len = total_len.min(backing_len); - let mut offset = 0usize; - - for &page_phys in page_phys_addrs { - if offset >= copy_len { break; } - let page_ptr = (phys_mem_offset + page_phys) as *const u8; - let chunk = (copy_len - offset).min(4096); - unsafe { - core::ptr::copy_nonoverlapping(page_ptr, backing_ptr.add(offset), chunk); +/// Blit window content from MAP_SHARED pages directly into COMPOSITE_TEX at (x, y). +/// This composites window pixels into the single compositor texture, giving correct +/// z-order when called bottom-to-top. The cursor is drawn AFTER this, so it appears on top. +fn blit_window_to_compositor( + win_x: u32, win_y: u32, + win_w: u32, win_h: u32, + page_phys_addrs: &[u64], + tex_w: u32, tex_h: u32, +) { + let phys_offset = crate::memory::physical_memory_offset().as_u64(); + let row_bytes = (win_w as usize) * 4; + let tex_stride = (tex_w as usize) * 4; + let tex_ptr = unsafe { COMPOSITE_TEX_PTR }; + + for row in 0..win_h as usize { + let dst_y = (win_y as usize) + row; + if dst_y >= tex_h as usize { break; } + let dst_x = win_x as usize; + let copy_w = (win_w as usize).min((tex_w as usize).saturating_sub(dst_x)); + if copy_w == 0 { continue; } + let copy_bytes = copy_w * 4; + + let src_offset = row * row_bytes; + let dst_offset = dst_y * tex_stride + dst_x * 4; + + // Copy from scattered pages, handling page boundaries + let mut copied = 0usize; + while copied < copy_bytes { + let linear_pos = src_offset + copied; + let page_idx = linear_pos / 4096; + let page_off = linear_pos % 4096; + if page_idx >= page_phys_addrs.len() { break; } + let chunk = (4096 - page_off).min(copy_bytes - copied); + let src_ptr = 
(phys_offset + page_phys_addrs[page_idx] + page_off as u64) as *const u8; + unsafe { + core::ptr::copy_nonoverlapping( + src_ptr, + tex_ptr.add(dst_offset + copied), + chunk, + ); + } + copied += chunk; } - offset += chunk; } } @@ -3376,6 +3411,77 @@ pub fn virgl_composite_frame_textured( Ok(()) } +/// Build and submit a single fullscreen textured quad from COMPOSITE_TEX. +/// +/// COMPOSITE_TEX already contains the fully-composited frame: background, window +/// frames/decorations, window content (blitted in z-order), and cursor. +fn virgl_composite_single_quad() -> Result<(), &'static str> { + use super::virgl::{CommandBuffer, format as vfmt, pipe, swizzle}; + + let tex_w = COMPOSITE_TEX_W.load(Ordering::Relaxed); + let tex_h = COMPOSITE_TEX_H.load(Ordering::Relaxed); + let (display_w, display_h) = dimensions().ok_or("GPU not initialized")?; + + let mut cmdbuf = CommandBuffer::new(); + cmdbuf.create_sub_ctx(1); + cmdbuf.set_sub_ctx(1); + cmdbuf.set_tweaks(1, 1); + cmdbuf.set_tweaks(2, display_w); + + cmdbuf.create_surface(10, RESOURCE_3D_ID, vfmt::B8G8R8X8_UNORM, 0, 0); + cmdbuf.set_framebuffer_state(0, &[10]); + cmdbuf.create_blend_simple(11); + cmdbuf.bind_object(11, super::virgl::OBJ_BLEND); + cmdbuf.create_dsa_default(12); + cmdbuf.bind_object(12, super::virgl::OBJ_DSA); + cmdbuf.create_rasterizer_default(13); + cmdbuf.bind_object(13, super::virgl::OBJ_RASTERIZER); + + let tex_vs = b"VERT\nDCL IN[0]\nDCL IN[1]\nDCL OUT[0], POSITION\nDCL OUT[1], GENERIC[0]\n 0: MOV OUT[0], IN[0]\n 1: MOV OUT[1], IN[1]\n 2: END\n"; + cmdbuf.create_shader(14, pipe::SHADER_VERTEX, 300, tex_vs); + cmdbuf.bind_shader(14, pipe::SHADER_VERTEX); + let tex_fs = b"FRAG\nPROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1\nDCL IN[0], GENERIC[0], LINEAR\nDCL OUT[0], COLOR\nDCL SAMP[0]\nDCL SVIEW[0], 2D, FLOAT\n 0: TEX OUT[0], IN[0], SAMP[0], 2D\n 1: END\n"; + cmdbuf.create_shader(15, pipe::SHADER_FRAGMENT, 300, tex_fs); + cmdbuf.bind_shader(15, pipe::SHADER_FRAGMENT); + + 
cmdbuf.create_vertex_elements(16, &[ + (0, 0, 0, vfmt::R32G32B32A32_FLOAT), + (16, 0, 0, vfmt::R32G32B32A32_FLOAT), + ]); + cmdbuf.bind_object(16, super::virgl::OBJ_VERTEX_ELEMENTS); + + cmdbuf.create_sampler_state(18, pipe::TEX_WRAP_CLAMP_TO_EDGE, pipe::TEX_WRAP_CLAMP_TO_EDGE, + pipe::TEX_WRAP_CLAMP_TO_EDGE, pipe::TEX_FILTER_NEAREST, pipe::TEX_MIPFILTER_NONE, + pipe::TEX_FILTER_NEAREST); + cmdbuf.bind_sampler_states(pipe::SHADER_FRAGMENT, 0, &[18]); + cmdbuf.set_min_samples(1); + cmdbuf.set_viewport(display_w as f32, display_h as f32); + + cmdbuf.create_sampler_view(17, RESOURCE_COMPOSITE_TEX_ID, vfmt::B8G8R8X8_UNORM, + pipe::TEXTURE_2D, 0, 0, 0, 0, swizzle::IDENTITY); + cmdbuf.set_sampler_views(pipe::SHADER_FRAGMENT, 0, &[17]); + + let u_max = (tex_w.min(display_w) as f32) / (tex_w as f32); + let v_max = (tex_h.min(display_h) as f32) / (tex_h as f32); + let bg_verts: [u32; 32] = [ + (-1.0f32).to_bits(), (1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + 0f32.to_bits(), 0f32.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + (-1.0f32).to_bits(), (-1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + 0f32.to_bits(), v_max.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + 1.0f32.to_bits(), (-1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + u_max.to_bits(), v_max.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + 1.0f32.to_bits(), (1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + u_max.to_bits(), 0f32.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + ]; + cmdbuf.resource_inline_write(RESOURCE_VB_ID, 0, 128, &bg_verts); + cmdbuf.set_vertex_buffers(&[(32, 0, RESOURCE_VB_ID)]); + cmdbuf.draw_vbo(0, 4, pipe::PRIM_TRIANGLE_FAN, 3); + + virgl_submit_sync(cmdbuf.as_slice())?; + with_device_state(|state| set_scanout_resource(state, RESOURCE_3D_ID))?; + with_device_state(|state| resource_flush_3d(state, RESOURCE_3D_ID)) +} + /// Multi-window GPU compositor. 
/// /// Uploads dirty textures (background + per-window), then renders all windows @@ -3478,6 +3584,24 @@ pub fn virgl_composite_windows( } } + // Step 2: Blit window content from MAP_SHARED pages into COMPOSITE_TEX. + // Windows are composited in z-order (bottom first in the array, top last) + // so higher-z windows correctly overwrite lower-z windows where they overlap. + // This must happen BEFORE cursor drawing so the cursor appears on top. + if bg_dirty || any_window_dirty { + for win in windows.iter() { + if win.page_phys_addrs.is_empty() || win.width == 0 || win.height == 0 { + continue; + } + blit_window_to_compositor( + win.x as u32, win.y as u32, + win.width, win.height, + &win.page_phys_addrs, + tex_w, tex_h, + ); + } + } + // ── Step 3: Cursor rendering ──────────────────────────────────────────── // Draw the mouse cursor directly into COMPOSITE_TEX so it appears in the // composited output without requiring a full 4.9MB upload from userspace. @@ -3690,11 +3814,10 @@ pub fn virgl_composite_windows( } // ========================================================================= - // Phase B+C: Direct scanout on COMPOSITE_TEX (skip SUBMIT_3D entirely) + // Phase B+C: Single fullscreen SUBMIT_3D quad + display // ========================================================================= - // Instead of building a VirGL 3D pipeline (shaders, textured quad, SUBMIT_3D) - // to copy COMPOSITE_TEX onto RESOURCE_3D_ID, we set scanout directly on - // COMPOSITE_TEX_ID. This eliminates the SUBMIT_3D round-trip. + // Window content was already blitted into COMPOSITE_TEX in z-order (step 2), + // so a single textured quad correctly displays everything including cursor. // Perf: timestamp before display phase #[cfg(target_arch = "aarch64")] @@ -3704,20 +3827,7 @@ pub fn virgl_composite_windows( v }; - // Direct scanout on COMPOSITE_TEX — skip SUBMIT_3D entirely. - // TRANSFER_TO_HOST_3D already pushed pixels to the host texture. 
- // SET_SCANOUT + RESOURCE_FLUSH displays it directly. - static SCANOUT_ESTABLISHED: core::sync::atomic::AtomicBool = - core::sync::atomic::AtomicBool::new(false); - if !SCANOUT_ESTABLISHED.load(Ordering::Relaxed) { - with_device_state(|state| { - set_scanout_resource(state, RESOURCE_COMPOSITE_TEX_ID) - })?; - SCANOUT_ESTABLISHED.store(true, Ordering::Relaxed); - } - with_device_state(|state| { - resource_flush_3d(state, RESOURCE_COMPOSITE_TEX_ID) - })?; + virgl_composite_single_quad()?; // Perf: end of frame #[cfg(target_arch = "aarch64")] @@ -3756,10 +3866,8 @@ pub fn virgl_composite_windows( let avg_display = to_us(PERF_DISPLAY_TICKS.swap(0, Ordering::Relaxed)); let avg_total = to_us(PERF_TOTAL_TICKS.swap(0, Ordering::Relaxed)); - crate::serial_println!( - "[gpu-perf] frame={} avg/frame: compose={}us display={}us TOTAL={}us", - frame, avg_compose, avg_display, avg_total - ); + // GPU perf counters available via GDB: PERF_COMPOSE_TICKS, PERF_DISPLAY_TICKS, PERF_TOTAL_TICKS + let _ = (avg_compose, avg_display, avg_total); } } diff --git a/kernel/src/drivers/virtio/net_pci.rs b/kernel/src/drivers/virtio/net_pci.rs index 856cd524..03ec17fb 100644 --- a/kernel/src/drivers/virtio/net_pci.rs +++ b/kernel/src/drivers/virtio/net_pci.rs @@ -17,7 +17,7 @@ use crate::drivers::pci; use core::ptr::{read_volatile, write_volatile}; -use core::sync::atomic::{fence, AtomicBool, Ordering}; +use core::sync::atomic::{fence, AtomicBool, AtomicU32, Ordering}; // Legacy VirtIO PCI register offsets (from BAR0) const REG_DEVICE_FEATURES: usize = 0x00; @@ -67,6 +67,12 @@ struct VirtqDesc { const DESC_F_WRITE: u16 = 2; +/// When set in avail.flags, tells the device NOT to send interrupts (MSIs) +/// when it adds entries to the used ring. Used for NAPI-style interrupt +/// coalescing: handler sets this to suppress MSI storm, softirq clears it +/// after draining the used ring. +const VRING_AVAIL_F_NO_INTERRUPT: u16 = 1; + /// Legacy VirtIO queue size — must match what the device reports. 
/// Parallels reports 256; the driver can't change it on legacy transport. const VIRTQ_SIZE: usize = 256; @@ -174,6 +180,8 @@ struct NetPciState { static mut NET_PCI_STATE: Option = None; static DEVICE_INITIALIZED: AtomicBool = AtomicBool::new(false); +static NET_PCI_IRQ: AtomicU32 = AtomicU32::new(0); +static NET_PCI_MSI_COUNT: AtomicU32 = AtomicU32::new(0); // Legacy register access helpers #[inline(always)] @@ -211,6 +219,144 @@ fn virt_to_phys(addr: u64) -> u64 { addr - crate::memory::physical_memory_offset().as_u64() } +/// Get the GIC INTID for the VirtIO PCI net interrupt, if MSI is enabled. +pub fn get_irq() -> Option { + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq != 0 { Some(irq) } else { None } +} + +/// VirtIO legacy MSI-X register offsets (present when MSI-X is enabled at PCI level). +/// These replace the device config at BAR0+0x14; device config shifts to 0x18. +const MSIX_CONFIG_VECTOR: usize = 0x14; +const MSIX_QUEUE_VECTOR: usize = 0x16; + +/// Resolve a GICv2m doorbell address. Returns the MSI_SETSPI_NS physical address. +fn resolve_gicv2m_doorbell() -> Option { + const PARALLELS_GICV2M_BASE: u64 = 0x0225_0000; + let gicv2m_base = crate::platform_config::gicv2m_base_phys(); + let base = if gicv2m_base != 0 { + gicv2m_base + } else if crate::platform_config::probe_gicv2m(PARALLELS_GICV2M_BASE) { + PARALLELS_GICV2M_BASE + } else { + return None; + }; + Some(base + 0x40) +} + +/// Set up PCI MSI or MSI-X delivery for the VirtIO network device through GICv2m. 
+fn setup_net_pci_msi(pci_dev: &crate::drivers::pci::Device) { + use crate::arch_impl::aarch64::gic; + + pci_dev.dump_capabilities(); + + // Try plain MSI first (some VirtIO devices have this) + if let Some(cap_offset) = pci_dev.find_msi_capability() { + crate::serial_println!("[virtio-net-pci] Found MSI capability at offset {:#x}", cap_offset); + if let Some(doorbell) = resolve_gicv2m_doorbell() { + let spi = crate::platform_config::allocate_msi_spi(); + if spi != 0 { + pci_dev.configure_msi(cap_offset, doorbell as u32, spi as u16); + pci_dev.disable_intx(); + gic::configure_spi_edge_triggered(spi); + NET_PCI_IRQ.store(spi, Ordering::Relaxed); + gic::enable_spi(spi); + crate::serial_println!("[virtio-net-pci] MSI enabled: SPI {} doorbell={:#x}", spi, doorbell); + return; + } + } + crate::serial_println!("[virtio-net-pci] MSI setup failed — trying MSI-X"); + } + + // Try MSI-X (Parallels VirtIO net PCI 1af4:1000 has MSI-X with 3 vectors) + let msix_cap = match pci_dev.find_msix_capability() { + Some(cap) => cap, + None => { + crate::serial_println!("[virtio-net-pci] No MSI or MSI-X capability — polling fallback"); + return; + } + }; + + let table_size = pci_dev.msix_table_size(msix_cap); + crate::serial_println!("[virtio-net-pci] MSI-X cap at {:#x}: {} vectors", msix_cap, table_size); + + let doorbell = match resolve_gicv2m_doorbell() { + Some(d) => d, + None => { + crate::serial_println!("[virtio-net-pci] GICv2m not available — polling fallback"); + return; + } + }; + + let spi = crate::platform_config::allocate_msi_spi(); + if spi == 0 { + crate::serial_println!("[virtio-net-pci] Failed to allocate MSI SPI — polling fallback"); + return; + } + + // Program all MSI-X table entries with the same SPI (single-vector mode). + for v in 0..table_size { + pci_dev.configure_msix_entry(msix_cap, v, doorbell, spi); + } + + gic::configure_spi_edge_triggered(spi); + // Store IRQ but do NOT enable the SPI yet. 
The SPI is enabled by + // enable_msi_spi() after init_common() completes its synchronous + // ARP/ICMP polling. This avoids the GICv2m level-triggered SPI storm + // during init (the device fires MSIs for ARP replies, and the level + // stays asserted through EOI). + NET_PCI_IRQ.store(spi, Ordering::Release); + + // Enable MSI-X at PCI level and disable legacy INTx + pci_dev.enable_msix(msix_cap); + pci_dev.disable_intx(); + + // Assign VirtIO-level MSI-X vectors. + let bar0_virt = unsafe { + let ptr = &raw const NET_PCI_STATE; + match (*ptr).as_ref() { + Some(s) => s.bar0_virt, + None => { + crate::serial_println!("[virtio-net-pci] MSI-X: device state not available"); + return; + } + } + }; + + // Config change → no interrupt (0xFFFF). Avoids spurious config-change + // MSIs that could cause an interrupt storm unrelated to packet RX. + reg_write_u16(bar0_virt, MSIX_CONFIG_VECTOR, 0xFFFF); + let cfg_rb = reg_read_u16(bar0_virt, MSIX_CONFIG_VECTOR); + + // RX queue (0) → vector 0 + reg_write_u16(bar0_virt, REG_QUEUE_SELECT, 0); + reg_write_u16(bar0_virt, MSIX_QUEUE_VECTOR, 0); + let rx_rb = reg_read_u16(bar0_virt, MSIX_QUEUE_VECTOR); + + // TX queue (1) → no interrupt + reg_write_u16(bar0_virt, REG_QUEUE_SELECT, 1); + reg_write_u16(bar0_virt, MSIX_QUEUE_VECTOR, 0xFFFF); + + crate::serial_println!( + "[virtio-net-pci] MSI-X vector assignments: cfg={:#x} rx={:#x}", + cfg_rb, rx_rb + ); + + // Only RX vector must succeed; config vector is intentionally 0xFFFF + if rx_rb == 0xFFFF { + crate::serial_println!("[virtio-net-pci] MSI-X: device rejected RX vector — polling fallback"); + pci_dev.disable_msix(msix_cap); + pci_dev.enable_intx(); + NET_PCI_IRQ.store(0, Ordering::Relaxed); + return; + } + + crate::serial_println!( + "[virtio-net-pci] MSI-X enabled: SPI {} doorbell={:#x} vectors={}", + spi, doorbell, table_size + ); +} + /// Initialize the VirtIO network device via PCI legacy transport. 
pub fn init() -> Result<(), &'static str> { crate::serial_println!("[virtio-net-pci] Searching for VirtIO network device on PCI bus..."); @@ -311,6 +457,7 @@ pub fn init() -> Result<(), &'static str> { post_rx_buffers()?; DEVICE_INITIALIZED.store(true, Ordering::Release); + setup_net_pci_msi(pci_dev); crate::serial_println!("[virtio-net-pci] Network device initialized successfully"); Ok(()) } @@ -548,12 +695,115 @@ pub fn mac_address() -> Option<[u8; 6]> { } } -/// Interrupt handler for VirtIO network PCI device. +/// Get the MSI interrupt count (for diagnostics). +pub fn msi_interrupt_count() -> u32 { + NET_PCI_MSI_COUNT.load(Ordering::Relaxed) +} + +/// Interrupt handler for VirtIO network PCI device (MSI-X). +/// +/// Uses NAPI-style two-level suppression to prevent GICv2m SPI storms: +/// 1. Device-level: sets VRING_AVAIL_F_NO_INTERRUPT so the device stops +/// writing MSIs to GICv2m entirely. +/// 2. GIC-level: disables the SPI as a safety net. +/// +/// Does NOT process packets or raise softirq (locks in the packet +/// processing path could deadlock with the interrupted thread). +/// Timer-based NetRx softirq handles packet processing and calls +/// re_enable_irq() to re-arm both levels. pub fn handle_interrupt() { - if !DEVICE_INITIALIZED.load(Ordering::Acquire) { + use crate::arch_impl::aarch64::gic; + + NET_PCI_MSI_COUNT.fetch_add(1, Ordering::Relaxed); + + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq == 0 { return; } + // Suppress at the device level FIRST — prevents new MSI writes to GICv2m. + unsafe { + let q = &raw mut PCI_RX_QUEUE; + write_volatile(&mut (*q).avail.flags, VRING_AVAIL_F_NO_INTERRUPT); + fence(Ordering::SeqCst); + } + + // Mask SPI at the GIC — belt-and-suspenders with device-level suppression. + gic::disable_spi(irq); + gic::clear_spi_pending(irq); + + // Read ISR to clear the VirtIO device's internal interrupt condition. 
+ let state = &raw const NET_PCI_STATE; + unsafe { + if let Some(ref s) = *state { + let _isr = reg_read_u8(s.bar0_virt, REG_ISR_STATUS); + } + } + + // Both levels stay suppressed — re_enable_irq() called from timer softirq. +} + +/// Re-enable the network device's MSI-X interrupt after softirq processing. +/// +/// Called by the NetRx softirq handler after draining the used ring. +/// Follows the Linux virtqueue_enable_cb() pattern: +/// 1. Read ISR to clear any pending device interrupt condition +/// 2. Re-enable device-level interrupts (clear NO_INTERRUPT flag) +/// 3. Memory barrier + check for new used ring entries +/// 4. If more work: re-suppress and let next softirq handle it +/// 5. If clean: clear GIC pending + enable SPI +pub fn re_enable_irq() { + use crate::arch_impl::aarch64::gic; + + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq == 0 { + return; + } + + // Read ISR to clear any pending device interrupt condition before re-enabling. + let state_ptr = &raw const NET_PCI_STATE; + unsafe { + if let Some(ref s) = *state_ptr { + let _isr = reg_read_u8(s.bar0_virt, REG_ISR_STATUS); + } + } + + // Re-enable device-level interrupts (Linux: virtqueue_enable_cb) + unsafe { + let q = &raw mut PCI_RX_QUEUE; + write_volatile(&mut (*q).avail.flags, 0); + fence(Ordering::SeqCst); + } + + // Check if more work arrived while we were processing (race window). + // If so, re-suppress and let the next timer softirq cycle handle it. + let has_more = unsafe { + let q = &raw const PCI_RX_QUEUE; + let used_idx = read_volatile(&(*q).used.idx); + if let Some(ref s) = *state_ptr { + used_idx != s.rx_last_used_idx + } else { + false + } + }; + + if has_more { + // More work arrived — re-suppress device interrupts, don't enable SPI. + unsafe { + let q = &raw mut PCI_RX_QUEUE; + write_volatile(&mut (*q).avail.flags, VRING_AVAIL_F_NO_INTERRUPT); + fence(Ordering::SeqCst); + } + return; + } + + // Used ring is drained — safe to re-enable the GIC SPI. 
+ gic::clear_spi_pending(irq); + gic::enable_spi(irq); +} + +/// Diagnostic: dump RX queue state for debugging MSI-X issues. +pub fn dump_rx_state() { let state = unsafe { let ptr = &raw const NET_PCI_STATE; match (*ptr).as_ref() { @@ -562,10 +812,43 @@ pub fn handle_interrupt() { } }; - // Reading ISR status auto-acknowledges on legacy PCI - let _isr = reg_read_u8(state.bar0_virt, REG_ISR_STATUS); + let isr = reg_read_u8(state.bar0_virt, REG_ISR_STATUS); + let (used_idx, avail_idx) = unsafe { + let q = &raw const PCI_RX_QUEUE; + (read_volatile(&(*q).used.idx), read_volatile(&(*q).avail.idx)) + }; + let msi_count = NET_PCI_MSI_COUNT.load(Ordering::Relaxed); + crate::serial_println!( + "[virtio-net-pci] RX diag: used_idx={} last_used={} avail_idx={} isr={:#x} msi_count={}", + used_idx, state.rx_last_used_idx, avail_idx, isr, msi_count + ); +} + +/// Enable the MSI-X SPI at the GIC after init polling is complete. +/// +/// During init, the ARP/ICMP polling loop processes RX via timer-based softirq. +/// The SPI must NOT be enabled during init because the GICv2m level-triggered +/// storm would prevent the main thread from making progress. After init drains +/// all used ring entries, it's safe to enable the SPI for interrupt-driven RX. 
+pub fn enable_msi_spi() { + use crate::arch_impl::aarch64::gic; + + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq == 0 { + return; + } + + // Read ISR to clear any pending device interrupt from init polling + let state_ptr = &raw const NET_PCI_STATE; + unsafe { + if let Some(ref s) = *state_ptr { + let _isr = reg_read_u8(s.bar0_virt, REG_ISR_STATUS); + } + } - crate::task::softirqd::raise_softirq(crate::task::softirqd::SoftirqType::NetRx); + gic::clear_spi_pending(irq); + gic::enable_spi(irq); + crate::serial_println!("[virtio-net-pci] MSI-X SPI {} enabled (post-init)", irq); } /// Whether the PCI net device is initialized diff --git a/kernel/src/fs/procfs/mod.rs b/kernel/src/fs/procfs/mod.rs index ded541f7..ce13b779 100644 --- a/kernel/src/fs/procfs/mod.rs +++ b/kernel/src/fs/procfs/mod.rs @@ -774,6 +774,11 @@ fn generate_stat() -> String { GPU_FULL_UPLOADS.aggregate(), GPU_PARTIAL_UPLOADS.aggregate(), ); + #[cfg(target_arch = "aarch64")] + { + let _ = write!(out, "net_msi_irqs {}\n", + crate::drivers::virtio::net_pci::msi_interrupt_count()); + } out } diff --git a/kernel/src/net/arp.rs b/kernel/src/net/arp.rs index 548dacc1..a9c60180 100644 --- a/kernel/src/net/arp.rs +++ b/kernel/src/net/arp.rs @@ -218,14 +218,19 @@ pub fn handle_arp(eth_frame: &EthernetFrame, arp: &ArpPacket) { } } -/// Update the ARP cache with a new entry +/// Update the ARP cache with a new entry. +/// IRQ-safe: disables interrupts to prevent deadlock with softirq handler +/// which also calls update_cache via process_rx → handle_arp. 
fn update_cache(ip: &[u8; 4], mac: &[u8; 6]) { + let saved = super::irq_save(); let mut cache = ARP_CACHE.lock(); // First, check if entry already exists for entry in cache.iter_mut() { if entry.valid && entry.ip == *ip { entry.mac = *mac; + drop(cache); + super::irq_restore(saved); return; } } @@ -236,6 +241,8 @@ fn update_cache(ip: &[u8; 4], mac: &[u8; 6]) { entry.ip = *ip; entry.mac = *mac; entry.valid = true; + drop(cache); + super::irq_restore(saved); return; } } @@ -244,18 +251,27 @@ fn update_cache(ip: &[u8; 4], mac: &[u8; 6]) { cache[0].ip = *ip; cache[0].mac = *mac; cache[0].valid = true; + drop(cache); + super::irq_restore(saved); } -/// Look up a MAC address in the ARP cache +/// Look up a MAC address in the ARP cache. +/// IRQ-safe: disables interrupts to prevent deadlock with softirq handler. pub fn lookup(ip: &[u8; 4]) -> Option<[u8; 6]> { + let saved = super::irq_save(); let cache = ARP_CACHE.lock(); for entry in cache.iter() { if entry.valid && entry.ip == *ip { - return Some(entry.mac); + let mac = entry.mac; + drop(cache); + super::irq_restore(saved); + return Some(mac); } } + drop(cache); + super::irq_restore(saved); None } diff --git a/kernel/src/net/mod.rs b/kernel/src/net/mod.rs index eb3cc68f..fe0f4cfb 100644 --- a/kernel/src/net/mod.rs +++ b/kernel/src/net/mod.rs @@ -31,6 +31,41 @@ use crate::drivers::virtio::net_pci; use crate::task::softirqd::{register_softirq_handler, SoftirqType}; +/// Disable IRQs and return saved DAIF state. Prevents timer interrupt → +/// softirq → process_rx from deadlocking on shared locks (ARP_CACHE, +/// NET_CONFIG) that the interrupted thread may hold. 
+#[cfg(target_arch = "aarch64")] +#[inline(always)] +pub(crate) fn irq_save() -> u64 { + let daif: u64; + unsafe { + core::arch::asm!("mrs {}, daif", out(reg) daif, options(nomem, nostack)); + core::arch::asm!("msr daifset, #2", options(nomem, nostack)); + } + daif +} + +#[cfg(target_arch = "aarch64")] +#[inline(always)] +pub(crate) fn irq_restore(saved: u64) { + unsafe { + core::arch::asm!("msr daif, {}", in(reg) saved, options(nomem, nostack)); + } +} + +#[cfg(target_arch = "x86_64")] +#[inline(always)] +pub(crate) fn irq_save() -> u64 { 0 } + +#[cfg(target_arch = "x86_64")] +#[inline(always)] +pub(crate) fn irq_restore(_: u64) {} + +/// Re-entrancy guard for process_rx() on aarch64. Prevents the softirq handler +/// from re-entering process_rx() while the ARP polling loop is already inside it. +#[cfg(target_arch = "aarch64")] +static RX_PROCESSING: core::sync::atomic::AtomicBool = core::sync::atomic::AtomicBool::new(false); + // Logging macros that work on both architectures #[cfg(target_arch = "x86_64")] macro_rules! net_log { @@ -189,10 +224,19 @@ pub fn drain_loopback_queue() { } } -/// Softirq handler for network RX processing -/// Called from softirq context when NetRx softirq is raised by network interrupt handler +/// Softirq handler for network RX processing. +/// Called from softirq context when NetRx softirq is raised by the timer (every 10ms). +/// +/// The MSI handler does NOT raise softirq (to avoid lock contention in +/// exception context). Instead, the timer raises NetRx every 10ms. This handler +/// processes packets and then re-enables the MSI-X SPI so new interrupts can fire. fn net_rx_softirq_handler(_softirq: SoftirqType) { process_rx(); + + #[cfg(target_arch = "aarch64")] + if net_pci::is_initialized() { + net_pci::re_enable_irq(); + } } /// Re-register the network softirq handler. 
@@ -232,12 +276,18 @@ pub fn init() { // Auto-detect platform: PCI net = Parallels, e1000 = VMware, MMIO net = QEMU if net_pci::is_initialized() { crate::serial_println!("[net] Using VirtIO net PCI driver (Parallels)"); + let saved = irq_save(); let mut config = NET_CONFIG.lock(); *config = PARALLELS_CONFIG; + drop(config); + irq_restore(saved); } else if e1000::is_initialized() { crate::serial_println!("[net] Using Intel e1000 driver (VMware)"); + let saved = irq_save(); let mut config = NET_CONFIG.lock(); *config = VMWARE_CONFIG; + drop(config); + irq_restore(saved); } if let Some(mac) = get_mac_address() { @@ -262,13 +312,15 @@ fn init_common() { return; } + let saved = irq_save(); let config = NET_CONFIG.lock(); - net_log!("NET: IP address: {}.{}.{}.{}", - config.ip_addr[0], config.ip_addr[1], config.ip_addr[2], config.ip_addr[3] - ); - net_log!("NET: Gateway: {}.{}.{}.{}", - config.gateway[0], config.gateway[1], config.gateway[2], config.gateway[3] - ); + let ip = config.ip_addr; + let gw = config.gateway; + drop(config); + irq_restore(saved); + + net_log!("NET: IP address: {}.{}.{}.{}", ip[0], ip[1], ip[2], ip[3]); + net_log!("NET: Gateway: {}.{}.{}.{}", gw[0], gw[1], gw[2], gw[3]); // Initialize ARP cache arp::init(); @@ -276,8 +328,7 @@ fn init_common() { net_log!("Network stack initialized"); // Send ARP request for gateway to test network connectivity - let gateway = config.gateway; - drop(config); // Release lock before calling arp::request + let gateway = gw; net_log!("NET: Sending ARP request for gateway {}.{}.{}.{}", gateway[0], gateway[1], gateway[2], gateway[3]); if let Err(e) = arp::request(&gateway) { @@ -288,12 +339,17 @@ fn init_common() { // Wait for ARP reply (poll RX a few times to get the gateway MAC) // The reply comes via interrupt, so we just need to give it time to arrive - for _ in 0..100 { + for _i in 0..100 { process_rx(); - // Delay to let packets arrive and interrupts fire + // Delay to let packets arrive and timer-based polling 
process them for _ in 0..1_000_000 { core::hint::spin_loop(); } + // Diagnostic: dump RX queue state on first few iterations + #[cfg(target_arch = "aarch64")] + if _i < 5 || _i % 20 == 0 { + net_pci::dump_rx_state(); + } // Check if we got the ARP reply yet if let Some(gateway_mac) = arp::lookup(&gateway) { net_log!("NET: ARP resolved gateway MAC: {:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", @@ -333,16 +389,24 @@ fn init_common() { // interrupt-driven RX doesn't interfere with the polling. #[cfg(target_arch = "aarch64")] { - if !net_pci::is_initialized() { + if net_pci::is_initialized() { + // Enable MSI-X SPI at GIC now that the used ring is drained. + // During init, timer-based polling handled RX. Now switch to + // interrupt-driven NAPI-style processing. + net_pci::enable_msi_spi(); + } else { net_mmio::enable_net_irq(); } - // PCI net uses polling mode (no GIC IRQ needed — softirq handles packet processing) } } -/// Get the current network configuration +/// Get the current network configuration. +/// IRQ-safe: disables interrupts to prevent deadlock with softirq handler. pub fn config() -> NetConfig { - *NET_CONFIG.lock() + let saved = irq_save(); + let c = *NET_CONFIG.lock(); + irq_restore(saved); + c } /// Process incoming packets (called from interrupt handler or polling loop) @@ -361,8 +425,19 @@ pub fn process_rx() { } /// Process incoming packets (ARM64 - polling or interrupt driven) +/// +/// Protected by RX_PROCESSING atomic to prevent re-entrancy. When MSI-X is +/// active, the softirq handler can preempt the ARP polling loop and try to +/// call process_rx() re-entrantly — the guard skips the nested call. #[cfg(target_arch = "aarch64")] pub fn process_rx() { + // Re-entrancy guard: if we're already inside process_rx (e.g., ARP polling + // loop interrupted by MSI-X → softirq → process_rx), skip this call. 
+ use core::sync::atomic::Ordering; + if RX_PROCESSING.compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed).is_err() { + return; + } + // Try PCI driver first (Parallels), then e1000 (VMware), then MMIO (QEMU) if net_pci::is_initialized() { let mut processed = false; @@ -393,6 +468,12 @@ pub fn process_rx() { net_mmio::recycle_rx_buffers(); } } + + // Do NOT re-enable SPI here — the softirq handler does it after process_rx + // returns, regardless of whether we processed packets or bailed on re-entrancy. + // This avoids re-enabling from multiple code paths. + + RX_PROCESSING.store(false, Ordering::Release); } /// Process a received Ethernet frame diff --git a/kernel/src/process/manager.rs b/kernel/src/process/manager.rs index fb746e3a..59bd3714 100644 --- a/kernel/src/process/manager.rs +++ b/kernel/src/process/manager.rs @@ -948,10 +948,40 @@ impl ProcessManager { self.current_pid = None; } - // TODO: Clean up process resources - // - Unmap memory pages - // - Close file descriptors - // - Reparent children to init + // Free heavy resources immediately rather than waiting for waitpid reap. + // CoW refcounts were already decremented by terminate() -> cleanup_cow_frames(), + // so it's safe to drop the page table now. 
+ process.page_table.take(); + process.stack.take(); + process.pending_old_page_tables.clear(); + + // Clean up window buffers so the compositor stops reading freed pages + #[cfg(target_arch = "aarch64")] + crate::syscall::graphics::cleanup_windows_for_pid(pid.as_u64()); + } + + // Reparent children to init (PID 1) + let init_pid = ProcessId::new(1); + if pid != init_pid { + let children: Vec = self + .processes + .get(&pid) + .map(|p| p.children.clone()) + .unwrap_or_default(); + + if !children.is_empty() { + for &child_pid in &children { + if let Some(child) = self.processes.get_mut(&child_pid) { + child.parent = Some(init_pid); + } + } + if let Some(init) = self.processes.get_mut(&init_pid) { + init.children.extend(children.iter()); + } + if let Some(exiting) = self.processes.get_mut(&pid) { + exiting.children.clear(); + } + } } // Send SIGCHLD to the parent process (if any) diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs index d100f269..af99cf6b 100644 --- a/kernel/src/syscall/graphics.rs +++ b/kernel/src/syscall/graphics.rs @@ -70,6 +70,19 @@ pub fn wake_compositor_if_waiting() { } } +/// Clean up all window buffers owned by a terminated process. +/// Removes entries from the registry and wakes the compositor so it +/// discovers the removal and repaints. +#[cfg(target_arch = "aarch64")] +pub fn cleanup_windows_for_pid(pid: u64) { + let mut reg = WINDOW_REGISTRY.lock(); + if reg.remove_for_pid(pid) { + REGISTRY_GENERATION.fetch_add(1, core::sync::atomic::Ordering::Release); + drop(reg); + wake_compositor_if_waiting(); + } +} + /// Restore TTBR0 to the current process's page tables after blocking. /// /// After a blocking syscall (mark_window_dirty), TTBR0 may point to a different @@ -275,6 +288,21 @@ impl WindowRegistry { }) } + /// Remove all window buffers owned by a given process. + /// Returns true if any buffers were removed. 
+ fn remove_for_pid(&mut self, pid: u64) -> bool { + let mut removed = false; + for slot in &mut self.buffers { + if let Some(ref buf) = slot { + if buf.owner_pid == pid { + *slot = None; + removed = true; + } + } + } + removed + } + fn registered_windows(&self) -> alloc::vec::Vec { let mut result = alloc::vec::Vec::new(); for slot in &self.buffers { @@ -1291,6 +1319,7 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult { }; // Collect window info and waiting thread IDs under lock, then release. + // Also lazy-initialize VirGL textures for windows that don't have them yet. let mut threads_to_wake: [Option; MAX_WINDOW_BUFFERS] = [None; MAX_WINDOW_BUFFERS]; let windows: alloc::vec::Vec = { let mut reg = WINDOW_REGISTRY.lock(); @@ -1301,6 +1330,28 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult { if !buf.registered { continue; } if buf.width == 0 || buf.height == 0 { continue; } + // Lazy VirGL texture init: create per-window GPU texture on first composite + if !buf.virgl_initialized && !buf.page_phys_addrs.is_empty() + && matches!(crate::graphics::compositor_backend(), + crate::graphics::CompositorBackend::VirGL) + { + let slot_idx = (buf.id as usize).saturating_sub(1) % 16; + match crate::drivers::virtio::gpu_pci::init_window_texture( + slot_idx, buf.width, buf.height, &buf.page_phys_addrs, buf.size + ) { + Ok(res_id) => { + buf.virgl_resource_id = res_id; + buf.virgl_initialized = true; + crate::serial_println!("[composite] Window {} got VirGL texture (res={})", + buf.id, res_id); + } + Err(e) => { + crate::serial_println!("[composite] Window {} texture init failed: {}", + buf.id, e); + } + } + } + let dirty = buf.generation > buf.last_uploaded_gen; result.push(WindowCompositeInfo { diff --git a/kernel/src/task/process_task.rs b/kernel/src/task/process_task.rs index b22dbb8e..ecc2378d 100644 --- a/kernel/src/task/process_task.rs +++ b/kernel/src/task/process_task.rs @@ -77,6 +77,7 @@ impl ProcessScheduler { if let Some((pid, process)) = 
manager.find_process_by_thread_mut(thread_id) { let parent_pid = process.parent; let process_name = process.name.clone(); + let children = core::mem::take(&mut process.children); // Mark terminated and extract FDs without closing them process.terminate_minimal(exit_code); @@ -85,6 +86,11 @@ impl ProcessScheduler { process.cleanup_cow_frames(); process.drain_old_page_tables(); + // Free heavy resources immediately (CoW refcounts already decremented) + process.page_table.take(); + process.stack.take(); + process.pending_old_page_tables.clear(); + #[cfg(feature = "btrt")] crate::test_framework::btrt::on_process_exit(pid.as_u64(), exit_code); @@ -101,6 +107,20 @@ impl ProcessScheduler { None }; + // Reparent children to init (PID 1) + if !children.is_empty() { + use crate::process::ProcessId; + let init_pid = ProcessId::new(1); + for &child_pid in &children { + if let Some(child) = manager.get_process_mut(child_pid) { + child.parent = Some(init_pid); + } + } + if let Some(init) = manager.get_process_mut(init_pid) { + init.children.extend(children.iter()); + } + } + Some((pid, process_name, fd_entries, parent_tid)) } else { None @@ -115,6 +135,10 @@ impl ProcessScheduler { // Close FDs outside PM lock (pipe close_write wakes readers, etc.) close_extracted_fds(fd_entries); + // Clean up window buffers so the compositor stops reading freed pages + #[cfg(target_arch = "aarch64")] + crate::syscall::graphics::cleanup_windows_for_pid(pid.as_u64()); + // Wake parent thread if blocked on waitpid or pause() if let Some(parent_tid) = parent_tid { scheduler::with_scheduler(|sched| { diff --git a/scripts/parallels/virgl_multi_texture_test.c b/scripts/parallels/virgl_multi_texture_test.c new file mode 100644 index 00000000..7baae572 --- /dev/null +++ b/scripts/parallels/virgl_multi_texture_test.c @@ -0,0 +1,1387 @@ +/* + * virgl_multi_texture_test.c — Multi-texture VirGL compositing test + * + * Proves that multiple VirGL TEXTURE_2D resources can be: + * 1. 
Created independently + * 2. Rendered to via separate SUBMIT_3D batches (CLEAR to different colors) + * 3. Sampled from in a compositing pass that draws textured quads + * + * The final display shows: + * - Dark gray background + * - RED rectangle on the left (texture A, pixels 100-500 x 100-400) + * - BLUE rectangle on the right (texture B, pixels 600-1000 x 100-400) + * + * Pixel readback verifies the composited result. + * + * Build: gcc -O2 -o virgl_multi_texture_test virgl_multi_texture_test.c -ldrm + * Run: sudo ./virgl_multi_texture_test + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ========================================================================= + * VirtGPU DRM ioctl definitions (from linux/virtgpu_drm.h) + * ========================================================================= */ + +struct drm_virtgpu_resource_create { + uint32_t target; + uint32_t format; + uint32_t bind; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t array_size; + uint32_t last_level; + uint32_t nr_samples; + uint32_t flags; + uint32_t bo_handle; /* output */ + uint32_t res_handle; /* output */ + uint32_t size; /* output */ + uint32_t stride; /* output */ +}; + +struct drm_virtgpu_execbuffer { + uint32_t flags; + uint32_t size; + uint64_t command; + uint64_t bo_handles; + uint32_t num_bo_handles; + int32_t fence_fd; +}; + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 + +struct drm_virtgpu_map { + uint32_t handle; + uint32_t pad; + uint64_t offset; /* output: mmap offset */ +}; + +struct drm_virtgpu_3d_transfer_to_host { + uint32_t bo_handle; + uint32_t pad; + uint64_t offset; + uint32_t level; + uint32_t stride; + uint32_t layer_stride; + struct { + uint32_t x, y, z, w, h, d; + } 
box; +}; + +/* TRANSFER_FROM_HOST uses the same struct layout */ +typedef struct drm_virtgpu_3d_transfer_to_host drm_virtgpu_3d_transfer_from_host; + +struct drm_virtgpu_3d_wait { + uint32_t handle; + uint32_t flags; +}; + +#define DRM_IOCTL_VIRTGPU_MAP \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, \ + struct drm_virtgpu_map) + +#define DRM_IOCTL_VIRTGPU_EXECBUFFER \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER, \ + struct drm_virtgpu_execbuffer) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \ + struct drm_virtgpu_resource_create) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \ + drm_virtgpu_3d_transfer_from_host) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \ + struct drm_virtgpu_3d_transfer_to_host) + +#define DRM_IOCTL_VIRTGPU_WAIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \ + struct drm_virtgpu_3d_wait) + +/* ========================================================================= + * VirGL constants — must match kernel/src/drivers/virtio/virgl.rs exactly + * ========================================================================= */ + +/* Command types */ +#define VIRGL_CCMD_NOP 0 +#define VIRGL_CCMD_CREATE_OBJECT 1 +#define VIRGL_CCMD_BIND_OBJECT 2 +#define VIRGL_CCMD_SET_VIEWPORT_STATE 4 +#define VIRGL_CCMD_SET_FRAMEBUFFER_STATE 5 +#define VIRGL_CCMD_SET_VERTEX_BUFFERS 6 +#define VIRGL_CCMD_CLEAR 7 +#define VIRGL_CCMD_DRAW_VBO 8 +#define VIRGL_CCMD_RESOURCE_INLINE_WRITE 9 +#define VIRGL_CCMD_SET_SAMPLER_VIEWS 10 +#define VIRGL_CCMD_SET_SCISSOR_STATE 15 +#define VIRGL_CCMD_SET_SUB_CTX 28 +#define VIRGL_CCMD_CREATE_SUB_CTX 29 +#define VIRGL_CCMD_BIND_SHADER 31 +#define VIRGL_CCMD_SET_TWEAKS 46 + +/* Object types */ +#define VIRGL_OBJ_BLEND 1 +#define VIRGL_OBJ_RASTERIZER 2 +#define VIRGL_OBJ_DSA 3 +#define VIRGL_OBJ_SHADER 4 +#define 
VIRGL_OBJ_VERTEX_ELEMENTS 5 +#define VIRGL_OBJ_SAMPLER_VIEW 6 +#define VIRGL_OBJ_SAMPLER_STATE 7 +#define VIRGL_OBJ_SURFACE 8 + +/* Pipe constants */ +#define PIPE_BUFFER 0 +#define PIPE_TEXTURE_2D 2 +#define PIPE_PRIM_TRIANGLE_STRIP 5 + +#define PIPE_FORMAT_B8G8R8X8_UNORM 2 +#define PIPE_FORMAT_R32G32B32A32_FLOAT 31 + +#define PIPE_BIND_RENDER_TARGET 0x002 +#define PIPE_BIND_SAMPLER_VIEW 0x008 +#define PIPE_BIND_VERTEX_BUFFER 0x010 +#define PIPE_BIND_SCANOUT 0x40000 +#define PIPE_BIND_SHARED 0x100000 + +#define PIPE_CLEAR_COLOR0 0x04 + +#define PIPE_SHADER_VERTEX 0 +#define PIPE_SHADER_FRAGMENT 1 + +#define PIPE_TEX_FILTER_LINEAR 1 + +/* ========================================================================= + * VirGL command buffer builder + * ========================================================================= */ + +#define CMD_BUF_MAX 8192 + +static uint32_t cmd_buf[CMD_BUF_MAX]; +static int cmd_len; + +static void cmd_reset(void) { cmd_len = 0; } + +static void cmd_push(uint32_t v) +{ + if (cmd_len < CMD_BUF_MAX) + cmd_buf[cmd_len++] = v; + else { + fprintf(stderr, "FATAL: cmd_buf overflow at DWORD %d\n", cmd_len); + exit(1); + } +} + +/* Build VirGL command header: + * bits [7:0] = command opcode + * bits [15:8] = object type (for create/bind commands) + * bits [31:16] = payload length in DWORDs (not including this header) + */ +static uint32_t cmd0(uint32_t cmd, uint32_t obj, uint32_t len) +{ + return cmd | (obj << 8) | (len << 16); +} + +static uint32_t f32_bits(float f) +{ + uint32_t u; + memcpy(&u, &f, 4); + return u; +} + +/* Pack TGSI text into DWORDs (little-endian, null-terminated, zero-padded). + * Returns number of DWORDs pushed. 
*/ +static int push_tgsi_text(const char *text) +{ + int text_len = strlen(text) + 1; /* include null terminator */ + int text_dwords = (text_len + 3) / 4; + for (int i = 0; i < text_dwords; i++) { + uint32_t dw = 0; + for (int b = 0; b < 4; b++) { + int idx = i * 4 + b; + if (idx < text_len) + dw |= ((uint32_t)(unsigned char)text[idx]) << (b * 8); + } + cmd_push(dw); + } + return text_dwords; +} + +/* ------------------------------------------------------------------------- + * VirGL command builders + * ------------------------------------------------------------------------- */ + +static void cmd_create_sub_ctx(uint32_t id) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_SUB_CTX, 0, 1)); + cmd_push(id); +} + +static void cmd_set_sub_ctx(uint32_t id) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_SUB_CTX, 0, 1)); + cmd_push(id); +} + +static void cmd_set_tweaks(uint32_t id, uint32_t value) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_TWEAKS, 0, 2)); + cmd_push(id); + cmd_push(value); +} + +/* Create shader with num_tokens=300 (Mesa default). + * CRITICAL: num_tokens=0 silently corrupts the VirGL context. 
*/ +static void cmd_create_shader(uint32_t handle, uint32_t shader_type, const char *tgsi) +{ + int text_len = strlen(tgsi) + 1; + int text_dwords = (text_len + 3) / 4; + int payload_len = 5 + text_dwords; + + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SHADER, payload_len)); + cmd_push(handle); + cmd_push(shader_type); + cmd_push(text_len); /* bit 31 clear = first/only chunk */ + cmd_push(300); /* NUM_TOKENS = 300 (Mesa default, MUST be nonzero) */ + cmd_push(0); /* num_so_outputs */ + push_tgsi_text(tgsi); +} + +static void cmd_create_blend_simple(uint32_t handle) +{ + /* S0=0x04 (dither), S2[0]=0x78000000 (colormask=0xF<<27) — matches Mesa */ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_BLEND, 11)); + cmd_push(handle); + cmd_push(0x00000004); /* S0: dither enabled */ + cmd_push(0); /* S1: logicop_func */ + cmd_push(0x78000000); /* S2[0]: colormask=0xF<<27, blend disabled */ + cmd_push(0); cmd_push(0); cmd_push(0); /* S2[1..3] */ + cmd_push(0); cmd_push(0); cmd_push(0); /* S2[4..6] */ + cmd_push(0); /* S2[7] */ +} + +static void cmd_create_dsa_disabled(uint32_t handle) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_DSA, 5)); + cmd_push(handle); + cmd_push(0); /* S0: depth/alpha test disabled */ + cmd_push(0); /* S1: front stencil disabled */ + cmd_push(0); /* S2: back stencil disabled */ + cmd_push(0); /* alpha_ref = 0.0f */ +} + +static void cmd_create_rasterizer_default(uint32_t handle) +{ + /* 0x60008082: depth_clip_near | point_quad | front_ccw | half_pixel | bottom_edge */ + uint32_t s0 = (1 << 1) | (1 << 7) | (1 << 15) | (1 << 29) | (1 << 30); + + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_RASTERIZER, 9)); + cmd_push(handle); + cmd_push(s0); /* 0x60008082 */ + cmd_push(f32_bits(1.0f)); /* point_size */ + cmd_push(0); /* sprite_coord_enable */ + cmd_push(0x0000FFFF); /* clip_plane_enable = all */ + cmd_push(f32_bits(1.0f)); /* line_width */ + cmd_push(0); /* offset_units */ + cmd_push(0); /* offset_scale */ + cmd_push(0); /* 
offset_clamp */ +} + +static void cmd_create_vertex_elements(uint32_t handle, int count, + uint32_t offsets[], uint32_t divisors[], + uint32_t vb_indices[], uint32_t formats[]) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_VERTEX_ELEMENTS, + 4 * count + 1)); + cmd_push(handle); + for (int i = 0; i < count; i++) { + cmd_push(offsets[i]); + cmd_push(divisors[i]); + cmd_push(vb_indices[i]); + cmd_push(formats[i]); + } +} + +static void cmd_bind_object(uint32_t handle, uint32_t obj_type) +{ + cmd_push(cmd0(VIRGL_CCMD_BIND_OBJECT, obj_type, 1)); + cmd_push(handle); +} + +static void cmd_bind_shader(uint32_t handle, uint32_t shader_type) +{ + cmd_push(cmd0(VIRGL_CCMD_BIND_SHADER, 0, 2)); + cmd_push(handle); + cmd_push(shader_type); +} + +static void cmd_set_viewport(float width, float height) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_VIEWPORT_STATE, 0, 7)); + cmd_push(0); /* start_slot */ + cmd_push(f32_bits(width / 2.0f)); /* scale_x */ + cmd_push(f32_bits(-height / 2.0f)); /* scale_y (neg for GL Y-up) */ + cmd_push(f32_bits(0.5f)); /* scale_z */ + cmd_push(f32_bits(width / 2.0f)); /* translate_x */ + cmd_push(f32_bits(height / 2.0f)); /* translate_y */ + cmd_push(f32_bits(0.5f)); /* translate_z */ +} + +static void cmd_create_surface(uint32_t handle, uint32_t res_handle, + uint32_t fmt, uint32_t level, uint32_t layers) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SURFACE, 5)); + cmd_push(handle); + cmd_push(res_handle); + cmd_push(fmt); + cmd_push(level); + cmd_push(layers); /* first_layer | (last_layer << 16) */ +} + +static void cmd_set_framebuffer_state(uint32_t zsurf_handle, + int nr_cbufs, uint32_t cbuf_handles[]) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_FRAMEBUFFER_STATE, 0, nr_cbufs + 2)); + cmd_push(nr_cbufs); + cmd_push(zsurf_handle); + for (int i = 0; i < nr_cbufs; i++) + cmd_push(cbuf_handles[i]); +} + +static void cmd_clear_color(float r, float g, float b, float a) +{ + cmd_push(cmd0(VIRGL_CCMD_CLEAR, 0, 8)); + cmd_push(PIPE_CLEAR_COLOR0); /* buffers = 
0x04 */ + cmd_push(f32_bits(r)); + cmd_push(f32_bits(g)); + cmd_push(f32_bits(b)); + cmd_push(f32_bits(a)); + cmd_push(0x00000000); /* depth f64 low */ + cmd_push(0x3FF00000); /* depth f64 high = 1.0 */ + cmd_push(0); /* stencil */ +} + +/* Create sampler view for a TEXTURE_2D resource. + * CRITICAL: bits [24:31] of the format DWORD must contain PIPE_TEXTURE_2D << 24. + * Without this, the host creates a BUFFER-targeted sampler view and you get BLACK. */ +static void cmd_create_sampler_view(uint32_t handle, uint32_t res_handle, + uint32_t format, uint32_t first_level, + uint32_t last_level, uint32_t swizzle_r, + uint32_t swizzle_g, uint32_t swizzle_b, + uint32_t swizzle_a) +{ + /* Format DWORD encoding: + * bits [5:0] = PIPE_FORMAT + * bits [24:31] = texture target (PIPE_TEXTURE_2D = 2) + * Swizzle DWORD encoding: + * bits [2:0] = swizzle_r + * bits [5:3] = swizzle_g + * bits [8:6] = swizzle_b + * bits [11:9] = swizzle_a + */ + uint32_t format_dw = format | (PIPE_TEXTURE_2D << 24); + uint32_t swizzle_dw = swizzle_r | (swizzle_g << 3) | (swizzle_b << 6) | (swizzle_a << 9); + + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SAMPLER_VIEW, 6)); + cmd_push(handle); + cmd_push(res_handle); + cmd_push(format_dw); + cmd_push(first_level | (last_level << 8)); /* first_element / first_level + last_element / last_level */ + cmd_push(swizzle_dw); + cmd_push(0); /* buffer_offset (unused for TEXTURE_2D) */ +} + +/* Bind sampler views to a shader stage */ +static void cmd_set_sampler_views(uint32_t shader_type, int count, + uint32_t view_handles[]) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_SAMPLER_VIEWS, 0, count + 2)); + cmd_push(shader_type); + cmd_push(0); /* start_slot */ + for (int i = 0; i < count; i++) + cmd_push(view_handles[i]); +} + +/* Create sampler state (texture filtering) */ +static void cmd_create_sampler_state(uint32_t handle, + uint32_t wrap_s, uint32_t wrap_t, uint32_t wrap_r, + uint32_t min_filter, uint32_t mag_filter, + uint32_t mip_filter) +{ + /* S0 encoding 
(from virglrenderer):
+ * bits [2:0] = wrap_s
+ * bits [5:3] = wrap_t
+ * bits [8:6] = wrap_r
+ * bits [11:9] = min_img_filter
+ * bits [14:12] = min_mip_filter
+ * bits [17:15] = mag_img_filter
+ * bits [20:18] = compare_mode
+ * bits [23:21] = compare_func
+ * bit 24 = seamless_cube_map
+ */
+ uint32_t s0 = wrap_s | (wrap_t << 3) | (wrap_r << 6)
+ | (min_filter << 9) | (mip_filter << 12) | (mag_filter << 15);
+
+ cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SAMPLER_STATE, 5));
+ cmd_push(handle);
+ cmd_push(s0);
+ cmd_push(0); /* lod_bias (float) */
+ cmd_push(0); /* min_lod (float) */
+ cmd_push(f32_bits(1000.0f)); /* max_lod */
+}
+
+/* Bind sampler states */
+static void cmd_bind_sampler_states(uint32_t shader_type, int count,
+ uint32_t state_handles[])
+{
+ /* Dedicated command VIRGL_CCMD_BIND_SAMPLER_STATES (opcode 3) — not a
+ * CREATE/BIND_OBJECT; payload: shader_type, start_slot, then handles. */
+ cmd_push(cmd0(3, 0, count + 2)); /* VIRGL_CCMD_BIND_SAMPLER_STATES = 3 */
+ cmd_push(shader_type);
+ cmd_push(0); /* start_slot */
+ for (int i = 0; i < count; i++)
+ cmd_push(state_handles[i]);
+}
+
+/* RESOURCE_INLINE_WRITE: write data directly into a VirGL resource.
+ * Used for vertex buffer data.
*/ +static void cmd_resource_inline_write(uint32_t res_handle, uint32_t level, + uint32_t usage, uint32_t stride, + uint32_t layer_stride, + uint32_t x, uint32_t y, uint32_t z, + uint32_t w, uint32_t h, uint32_t d, + const void *data, uint32_t data_bytes) +{ + uint32_t data_dwords = (data_bytes + 3) / 4; + cmd_push(cmd0(VIRGL_CCMD_RESOURCE_INLINE_WRITE, 0, 11 + data_dwords)); + cmd_push(res_handle); + cmd_push(level); + cmd_push(usage); + cmd_push(stride); + cmd_push(layer_stride); + cmd_push(x); + cmd_push(y); + cmd_push(z); + cmd_push(w); + cmd_push(h); + cmd_push(d); + /* Copy data as DWORDs */ + const uint8_t *bytes = (const uint8_t *)data; + for (uint32_t i = 0; i < data_dwords; i++) { + uint32_t dw = 0; + for (int b = 0; b < 4; b++) { + uint32_t idx = i * 4 + b; + if (idx < data_bytes) + dw |= ((uint32_t)bytes[idx]) << (b * 8); + } + cmd_push(dw); + } +} + +/* SET_VERTEX_BUFFERS: bind vertex buffers for drawing */ +static void cmd_set_vertex_buffers(int count, uint32_t strides[], + uint32_t offsets[], uint32_t res_handles[]) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_VERTEX_BUFFERS, 0, count * 3)); + for (int i = 0; i < count; i++) { + cmd_push(strides[i]); + cmd_push(offsets[i]); + cmd_push(res_handles[i]); + } +} + +/* DRAW_VBO */ +static void cmd_draw_vbo(uint32_t start, uint32_t count, uint32_t mode, + uint32_t indexed, uint32_t instance_count, + uint32_t min_index, uint32_t max_index) +{ + cmd_push(cmd0(VIRGL_CCMD_DRAW_VBO, 0, 12)); + cmd_push(start); + cmd_push(count); + cmd_push(mode); + cmd_push(indexed); + cmd_push(instance_count); + cmd_push(0); /* index_bias */ + cmd_push(0); /* start_instance */ + cmd_push(0); /* primitive_restart */ + cmd_push(0); /* restart_index */ + cmd_push(min_index); + cmd_push(max_index); + cmd_push(0); /* cso (unused) */ +} + +/* ========================================================================= + * Hex dump + * ========================================================================= */ + +static void hex_dump_dwords(const 
char *label, const uint32_t *data, int count) +{ + printf("[hex-dump] %s (%d DWORDs, %d bytes):\n", label, count, count * 4); + for (int i = 0; i < count; i++) { + printf("[hex-dump] %s +%03d (0x%03X): 0x%08X\n", label, i * 4, i * 4, data[i]); + } + printf("[hex-dump] %s END\n\n", label); +} + +static void hex_dump_resource_create(const char *label, + const struct drm_virtgpu_resource_create *rc) +{ + printf("[hex-dump] %s:\n", label); + printf("[hex-dump] target = 0x%08X (%u)\n", rc->target, rc->target); + printf("[hex-dump] format = 0x%08X (%u)\n", rc->format, rc->format); + printf("[hex-dump] bind = 0x%08X\n", rc->bind); + printf("[hex-dump] width = %u\n", rc->width); + printf("[hex-dump] height = %u\n", rc->height); + printf("[hex-dump] depth = %u\n", rc->depth); + printf("[hex-dump] array_size = %u\n", rc->array_size); + printf("[hex-dump] last_level = %u\n", rc->last_level); + printf("[hex-dump] nr_samples = %u\n", rc->nr_samples); + printf("[hex-dump] flags = 0x%08X\n", rc->flags); + printf("[hex-dump] bo_handle = %u (output)\n", rc->bo_handle); + printf("[hex-dump] res_handle = %u (output)\n", rc->res_handle); + printf("[hex-dump] size = %u (output)\n", rc->size); + printf("[hex-dump] stride = %u (output)\n", rc->stride); + printf("\n"); +} + +/* ========================================================================= + * DRM helpers + * ========================================================================= */ + +static int drm_fd = -1; +static uint32_t conn_id, crtc_id; +static drmModeModeInfo mode; +static drmModeCrtcPtr saved_crtc; + +static int find_drm_device(void) +{ + const char *cards[] = {"/dev/dri/card0", "/dev/dri/card1", NULL}; + + for (int i = 0; cards[i]; i++) { + int fd = open(cards[i], O_RDWR | O_CLOEXEC); + if (fd < 0) + continue; + + if (drmSetMaster(fd) < 0) { + close(fd); + continue; + } + + drmModeResPtr res = drmModeGetResources(fd); + if (!res) { + close(fd); + continue; + } + + /* Find connected connector */ + drmModeConnectorPtr 
conn = NULL; + for (int c = 0; c < res->count_connectors; c++) { + conn = drmModeGetConnector(fd, res->connectors[c]); + if (conn && conn->connection == DRM_MODE_CONNECTED && + conn->count_modes > 0) { + break; + } + if (conn) drmModeFreeConnector(conn); + conn = NULL; + } + + if (!conn) { + drmModeFreeResources(res); + close(fd); + continue; + } + + conn_id = conn->connector_id; + mode = conn->modes[0]; /* preferred mode */ + + /* Find CRTC */ + drmModeEncoderPtr enc = NULL; + if (conn->encoder_id) + enc = drmModeGetEncoder(fd, conn->encoder_id); + if (!enc && res->count_encoders > 0) + enc = drmModeGetEncoder(fd, res->encoders[0]); + + if (enc) { + crtc_id = enc->crtc_id; + if (!crtc_id && res->count_crtcs > 0) + crtc_id = res->crtcs[0]; + drmModeFreeEncoder(enc); + } else if (res->count_crtcs > 0) { + crtc_id = res->crtcs[0]; + } + + saved_crtc = drmModeGetCrtc(fd, crtc_id); + + printf("DRM: %s -- %s %ux%u@%u\n", cards[i], + conn->connector_type_id ? "connected" : "?", + mode.hdisplay, mode.vdisplay, mode.vrefresh); + printf("DRM: connector=%u, crtc=%u\n", conn_id, crtc_id); + + drmModeFreeConnector(conn); + drmModeFreeResources(res); + drm_fd = fd; + return 0; + } + + fprintf(stderr, "No DRM device found\n"); + return -1; +} + +/* ========================================================================= + * VirtGPU resource + execbuffer wrappers + * ========================================================================= */ + +static int virtgpu_resource_create(struct drm_virtgpu_resource_create *rc) +{ + int ret = drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE, rc); + if (ret < 0) { + fprintf(stderr, "RESOURCE_CREATE failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +static int virtgpu_execbuffer(uint32_t *cmds, int dword_count, + uint32_t *bo_handles, int num_bos) +{ + struct drm_virtgpu_execbuffer eb; + memset(&eb, 0, sizeof(eb)); + eb.size = dword_count * 4; + eb.command = (uint64_t)(uintptr_t)cmds; + if (num_bos > 0) { + eb.bo_handles 
= (uint64_t)(uintptr_t)bo_handles; + eb.num_bo_handles = num_bos; + } + eb.fence_fd = -1; + + int ret = drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &eb); + if (ret < 0) { + fprintf(stderr, "EXECBUFFER failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +static int virtgpu_wait(uint32_t bo_handle) +{ + struct drm_virtgpu_3d_wait wait; + memset(&wait, 0, sizeof(wait)); + wait.handle = bo_handle; + wait.flags = 0; + return drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_WAIT, &wait); +} + +static int virtgpu_transfer_from_host(uint32_t bo_handle, uint32_t stride, + uint32_t width, uint32_t height) +{ + drm_virtgpu_3d_transfer_from_host xfer; + memset(&xfer, 0, sizeof(xfer)); + xfer.bo_handle = bo_handle; + xfer.stride = stride; + xfer.box.w = width; + xfer.box.h = height; + xfer.box.d = 1; + return drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST, &xfer); +} + +static int virtgpu_transfer_to_host(uint32_t bo_handle, uint32_t stride, + uint32_t width, uint32_t height) +{ + struct drm_virtgpu_3d_transfer_to_host xfer; + memset(&xfer, 0, sizeof(xfer)); + xfer.bo_handle = bo_handle; + xfer.stride = stride; + xfer.box.w = width; + xfer.box.h = height; + xfer.box.d = 1; + return drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST, &xfer); +} + +/* ========================================================================= + * Texture dimensions and quad positions + * ========================================================================= */ + +#define TEX_W 400 +#define TEX_H 300 + +/* Quad A: pixels (100,100) to (500,400) — shows texture A (RED) */ +#define QUAD_A_X0 100 +#define QUAD_A_Y0 100 +#define QUAD_A_X1 500 +#define QUAD_A_Y1 400 + +/* Quad B: pixels (600,100) to (1000,400) — shows texture B (BLUE) */ +#define QUAD_B_X0 600 +#define QUAD_B_Y0 100 +#define QUAD_B_X1 1000 +#define QUAD_B_Y1 400 + +/* Pixel sample points for verification */ +#define SAMPLE_RED_X 300 /* center of quad A */ +#define SAMPLE_RED_Y 250 +#define SAMPLE_BLUE_X 800 /* center of 
quad B */ +#define SAMPLE_BLUE_Y 250 +#define SAMPLE_GRAY_X 50 /* background area */ +#define SAMPLE_GRAY_Y 50 + +/* ========================================================================= + * VirGL object handle allocation + * + * CRITICAL: VirGL object handles must be globally unique across ALL types. + * virglrenderer uses a single hash table per sub-context. + * + * We use separate ranges to avoid collisions: + * Surfaces: 1-10 + * Blend: 11 + * DSA: 12 + * Rasterizer: 13 + * VS: 14 + * FS (color): 15 (for clear batches — unused in composite) + * FS (texture): 16 + * Vertex elements: 17 + * Sampler view A: 18 + * Sampler view B: 19 + * Sampler state: 20 + * VB resource: created via DRM as resource 4 + * ========================================================================= */ + +#define HANDLE_SURFACE_A 1 /* surface for texture A (render-to) */ +#define HANDLE_SURFACE_B 2 /* surface for texture B (render-to) */ +#define HANDLE_SURFACE_DISP 3 /* surface for display resource (composite target) */ +#define HANDLE_BLEND 11 +#define HANDLE_DSA 12 +#define HANDLE_RASTERIZER 13 +#define HANDLE_VS 14 +#define HANDLE_FS_TEXTURE 16 +#define HANDLE_VE 17 +#define HANDLE_SAMPLER_VIEW_A 18 +#define HANDLE_SAMPLER_VIEW_B 19 +#define HANDLE_SAMPLER_STATE 20 + +/* ========================================================================= + * Vertex data helpers + * ========================================================================= */ + +/* Convert pixel coordinates to NDC (-1 to +1). + * Note: Y is flipped (OpenGL convention: bottom = -1, top = +1). 
+ * ndc_x = (pixel_x / screen_w) * 2.0 - 1.0 + * ndc_y = 1.0 - (pixel_y / screen_h) * 2.0 + */ +typedef struct { + float pos[4]; /* x, y, z, w */ + float tex[4]; /* s, t, 0, 1 */ +} vertex_t; + +static void make_quad_vertices(vertex_t verts[4], + float px0, float py0, float px1, float py1, + float screen_w, float screen_h) +{ + float x0 = (px0 / screen_w) * 2.0f - 1.0f; + float x1 = (px1 / screen_w) * 2.0f - 1.0f; + float y0 = 1.0f - (py0 / screen_h) * 2.0f; /* top (higher Y in pixels = lower in NDC) */ + float y1 = 1.0f - (py1 / screen_h) * 2.0f; /* bottom */ + + /* TRIANGLE_STRIP order: top-left, top-right, bottom-left, bottom-right */ + /* Vertex 0: top-left */ + verts[0] = (vertex_t){{ x0, y0, 0.0f, 1.0f }, { 0.0f, 0.0f, 0.0f, 1.0f }}; + /* Vertex 1: top-right */ + verts[1] = (vertex_t){{ x1, y0, 0.0f, 1.0f }, { 1.0f, 0.0f, 0.0f, 1.0f }}; + /* Vertex 2: bottom-left */ + verts[2] = (vertex_t){{ x0, y1, 0.0f, 1.0f }, { 0.0f, 1.0f, 0.0f, 1.0f }}; + /* Vertex 3: bottom-right */ + verts[3] = (vertex_t){{ x1, y1, 0.0f, 1.0f }, { 1.0f, 1.0f, 0.0f, 1.0f }}; +} + +/* ========================================================================= + * main + * ========================================================================= */ + +int main(void) +{ + printf("=== VirGL Multi-Texture Compositing Test ===\n\n"); + + /* Step 1: Find DRM device */ + if (find_drm_device() < 0) + return 1; + + uint32_t width = mode.hdisplay; + uint32_t height = mode.vdisplay; + printf("Resolution: %ux%u\n\n", width, height); + + /* ===================================================================== + * Step 2: Create resources + * ===================================================================== */ + + /* Resource 1: Display surface (composited output) — 1920x1200, SCANOUT */ + struct drm_virtgpu_resource_create rc_disp; + memset(&rc_disp, 0, sizeof(rc_disp)); + rc_disp.target = PIPE_TEXTURE_2D; + rc_disp.format = PIPE_FORMAT_B8G8R8X8_UNORM; + rc_disp.bind = PIPE_BIND_RENDER_TARGET | 
PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED; + rc_disp.width = width; + rc_disp.height = height; + rc_disp.depth = 1; + rc_disp.array_size = 1; + + printf("=== Creating display resource (res 1: %ux%u) ===\n", width, height); + hex_dump_resource_create("RESOURCE_CREATE display", &rc_disp); + if (virtgpu_resource_create(&rc_disp) < 0) return 1; + printf("Display resource: bo=%u res=%u stride=%u size=%u\n\n", + rc_disp.bo_handle, rc_disp.res_handle, rc_disp.stride, rc_disp.size); + + /* Resource 2: Texture A (RED window) — 400x300, no SCANOUT */ + struct drm_virtgpu_resource_create rc_texA; + memset(&rc_texA, 0, sizeof(rc_texA)); + rc_texA.target = PIPE_TEXTURE_2D; + rc_texA.format = PIPE_FORMAT_B8G8R8X8_UNORM; + rc_texA.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; + rc_texA.width = TEX_W; + rc_texA.height = TEX_H; + rc_texA.depth = 1; + rc_texA.array_size = 1; + + printf("=== Creating texture A (res 2: %ux%u) ===\n", TEX_W, TEX_H); + hex_dump_resource_create("RESOURCE_CREATE texA", &rc_texA); + if (virtgpu_resource_create(&rc_texA) < 0) return 1; + printf("Texture A: bo=%u res=%u stride=%u size=%u\n\n", + rc_texA.bo_handle, rc_texA.res_handle, rc_texA.stride, rc_texA.size); + + /* Resource 3: Texture B (BLUE window) — 400x300, no SCANOUT */ + struct drm_virtgpu_resource_create rc_texB; + memset(&rc_texB, 0, sizeof(rc_texB)); + rc_texB.target = PIPE_TEXTURE_2D; + rc_texB.format = PIPE_FORMAT_B8G8R8X8_UNORM; + rc_texB.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; + rc_texB.width = TEX_W; + rc_texB.height = TEX_H; + rc_texB.depth = 1; + rc_texB.array_size = 1; + + printf("=== Creating texture B (res 3: %ux%u) ===\n", TEX_W, TEX_H); + hex_dump_resource_create("RESOURCE_CREATE texB", &rc_texB); + if (virtgpu_resource_create(&rc_texB) < 0) return 1; + printf("Texture B: bo=%u res=%u stride=%u size=%u\n\n", + rc_texB.bo_handle, rc_texB.res_handle, rc_texB.stride, rc_texB.size); + + /* Resource 4: Vertex buffer (PIPE_BUFFER, 
VERTEX_BUFFER bind) */ + struct drm_virtgpu_resource_create rc_vb; + memset(&rc_vb, 0, sizeof(rc_vb)); + rc_vb.target = PIPE_BUFFER; + rc_vb.format = PIPE_FORMAT_R32G32B32A32_FLOAT; /* doesn't matter for buffers, but Mesa uses this */ + rc_vb.bind = PIPE_BIND_VERTEX_BUFFER; + rc_vb.width = 4096; /* size in bytes (width for PIPE_BUFFER) */ + rc_vb.height = 1; + rc_vb.depth = 1; + rc_vb.array_size = 1; + + printf("=== Creating vertex buffer resource (res 4: buffer, 4096 bytes) ===\n"); + hex_dump_resource_create("RESOURCE_CREATE VB", &rc_vb); + if (virtgpu_resource_create(&rc_vb) < 0) return 1; + printf("VB resource: bo=%u res=%u\n\n", + rc_vb.bo_handle, rc_vb.res_handle); + + /* Collect all BO handles for EXECBUFFER */ + uint32_t all_bos[4] = { + rc_disp.bo_handle, + rc_texA.bo_handle, + rc_texB.bo_handle, + rc_vb.bo_handle + }; + + /* ===================================================================== + * Step 2b: Prime all TEXTURE_2D resources with TRANSFER_TO_HOST + * + * CRITICAL: Parallels requires an initial TRANSFER_TO_HOST_3D to + * establish the host-side buffer before any VirGL rendering will + * produce visible results. Without this "priming" step, SUBMIT_3D + * rendering targets a non-existent host buffer and produces black. + * ===================================================================== */ + printf("=== Priming resources with TRANSFER_TO_HOST ===\n"); + { + uint32_t disp_stride = rc_disp.stride; + if (disp_stride == 0) disp_stride = width * 4; + int r1 = virtgpu_transfer_to_host(rc_disp.bo_handle, disp_stride, width, height); + printf(" Prime display (res %u, bo %u): %s\n", rc_disp.res_handle, rc_disp.bo_handle, + r1 < 0 ? "FAILED" : "OK"); + + uint32_t tex_stride = rc_texA.stride; + if (tex_stride == 0) tex_stride = TEX_W * 4; + int r2 = virtgpu_transfer_to_host(rc_texA.bo_handle, tex_stride, TEX_W, TEX_H); + printf(" Prime texA (res %u, bo %u): %s\n", rc_texA.res_handle, rc_texA.bo_handle, + r2 < 0 ? 
"FAILED" : "OK"); + + int r3 = virtgpu_transfer_to_host(rc_texB.bo_handle, tex_stride, TEX_W, TEX_H); + printf(" Prime texB (res %u, bo %u): %s\n", rc_texB.res_handle, rc_texB.bo_handle, + r3 < 0 ? "FAILED" : "OK"); + } + printf("\n"); + + /* ===================================================================== + * Step 3: Render to Texture A (RED) + * + * Each SUBMIT_3D batch must start with create_sub_ctx(1) + set_sub_ctx(1). + * Objects do NOT survive create_sub_ctx — must recreate everything. + * ===================================================================== */ + + printf("=== Batch 1: Render RED to Texture A ===\n"); + cmd_reset(); + + cmd_create_sub_ctx(1); + cmd_set_sub_ctx(1); + cmd_set_tweaks(1, 1); + cmd_set_tweaks(2, TEX_W); + + /* Create surface for texture A's resource, set as framebuffer, clear RED */ + cmd_create_surface(HANDLE_SURFACE_A, rc_texA.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, 0, 0); + { + uint32_t cbufs[] = { HANDLE_SURFACE_A }; + cmd_set_framebuffer_state(0, 1, cbufs); + } + cmd_clear_color(1.0f, 0.0f, 0.0f, 1.0f); /* RED */ + + hex_dump_dwords("BATCH_1_CLEAR_RED", cmd_buf, cmd_len); + + if (virtgpu_execbuffer(cmd_buf, cmd_len, all_bos, 4) < 0) return 1; + virtgpu_wait(rc_texA.bo_handle); + printf("Batch 1 (RED clear to texA): OK\n\n"); + + /* ===================================================================== + * Step 4: Render to Texture B (BLUE) + * ===================================================================== */ + + printf("=== Batch 2: Render BLUE to Texture B ===\n"); + cmd_reset(); + + cmd_create_sub_ctx(1); + cmd_set_sub_ctx(1); + cmd_set_tweaks(1, 1); + cmd_set_tweaks(2, TEX_W); + + /* Create surface for texture B's resource, set as framebuffer, clear BLUE */ + cmd_create_surface(HANDLE_SURFACE_B, rc_texB.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, 0, 0); + { + uint32_t cbufs[] = { HANDLE_SURFACE_B }; + cmd_set_framebuffer_state(0, 1, cbufs); + } + cmd_clear_color(0.0f, 0.0f, 1.0f, 1.0f); /* BLUE */ + + 
hex_dump_dwords("BATCH_2_CLEAR_BLUE", cmd_buf, cmd_len); + + if (virtgpu_execbuffer(cmd_buf, cmd_len, all_bos, 4) < 0) return 1; + virtgpu_wait(rc_texB.bo_handle); + printf("Batch 2 (BLUE clear to texB): OK\n\n"); + + /* ===================================================================== + * Step 5: Composite both textures onto display resource + * + * This is the key batch that proves multi-texture sampling works: + * 1. Clear display to dark gray + * 2. Draw textured quad sampling from texture A at left position + * 3. Switch sampler view to texture B, draw quad at right position + * ===================================================================== */ + + printf("=== Batch 3: Composite both textures onto display ===\n"); + cmd_reset(); + + /* --- Sub-context setup --- */ + cmd_create_sub_ctx(1); + cmd_set_sub_ctx(1); + cmd_set_tweaks(1, 1); + cmd_set_tweaks(2, width); + + /* --- Create display surface and set as framebuffer --- */ + cmd_create_surface(HANDLE_SURFACE_DISP, rc_disp.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, 0, 0); + { + uint32_t cbufs[] = { HANDLE_SURFACE_DISP }; + cmd_set_framebuffer_state(0, 1, cbufs); + } + + /* --- Clear display to dark gray background (0.2, 0.2, 0.2) --- */ + cmd_clear_color(0.2f, 0.2f, 0.2f, 1.0f); + + /* --- Create pipeline state objects --- */ + cmd_create_blend_simple(HANDLE_BLEND); + cmd_bind_object(HANDLE_BLEND, VIRGL_OBJ_BLEND); + + cmd_create_dsa_disabled(HANDLE_DSA); + cmd_bind_object(HANDLE_DSA, VIRGL_OBJ_DSA); + + cmd_create_rasterizer_default(HANDLE_RASTERIZER); + cmd_bind_object(HANDLE_RASTERIZER, VIRGL_OBJ_RASTERIZER); + + /* --- Create and bind shaders --- */ + /* Vertex shader: passthrough position + texcoord */ + const char *vs_text = + "VERT\n" + "DCL IN[0]\n" + "DCL IN[1]\n" + "DCL OUT[0], POSITION\n" + "DCL OUT[1], GENERIC[0]\n" + "MOV OUT[0], IN[0]\n" + "MOV OUT[1], IN[1]\n" + "END\n"; + + /* Fragment shader: sample texture and output */ + const char *fs_text = + "FRAG\n" + "PROPERTY 
FS_COLOR0_WRITES_ALL_CBUFS 1\n" + "DCL IN[0], GENERIC[0], PERSPECTIVE\n" + "DCL OUT[0], COLOR\n" + "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" + "TEX OUT[0], IN[0], SAMP[0], 2D\n" + "END\n"; + + cmd_create_shader(HANDLE_VS, PIPE_SHADER_VERTEX, vs_text); + cmd_bind_shader(HANDLE_VS, PIPE_SHADER_VERTEX); + + cmd_create_shader(HANDLE_FS_TEXTURE, PIPE_SHADER_FRAGMENT, fs_text); + cmd_bind_shader(HANDLE_FS_TEXTURE, PIPE_SHADER_FRAGMENT); + + /* --- Create vertex elements (2 attributes: position + texcoord) --- + * Each vertex has 8 floats: 4 for position, 4 for texcoord. + * Attribute 0: offset=0, format=R32G32B32A32_FLOAT (position) + * Attribute 1: offset=16, format=R32G32B32A32_FLOAT (texcoord) + */ + { + uint32_t offsets[] = { 0, 16 }; + uint32_t divisors[] = { 0, 0 }; + uint32_t vb_indices[] = { 0, 0 }; + uint32_t formats[] = { PIPE_FORMAT_R32G32B32A32_FLOAT, + PIPE_FORMAT_R32G32B32A32_FLOAT }; + cmd_create_vertex_elements(HANDLE_VE, 2, offsets, divisors, vb_indices, formats); + } + cmd_bind_object(HANDLE_VE, VIRGL_OBJ_VERTEX_ELEMENTS); + + /* --- Set viewport to full display --- */ + cmd_set_viewport((float)width, (float)height); + + /* --- Create sampler state (LINEAR filtering) --- */ + /* wrap modes: CLAMP_TO_EDGE = 2 */ + cmd_create_sampler_state(HANDLE_SAMPLER_STATE, 2, 2, 2, + PIPE_TEX_FILTER_LINEAR, PIPE_TEX_FILTER_LINEAR, 0); + { + uint32_t states[] = { HANDLE_SAMPLER_STATE }; + cmd_bind_sampler_states(PIPE_SHADER_FRAGMENT, 1, states); + } + + /* --- Bind vertex buffer resource --- */ + { + uint32_t strides[] = { sizeof(vertex_t) }; /* 32 bytes per vertex */ + uint32_t offsets[] = { 0 }; + uint32_t res_handles[] = { rc_vb.res_handle }; + cmd_set_vertex_buffers(1, strides, offsets, res_handles); + } + + /* ---- Draw Quad A (texture A = RED) at left position ---- */ + + /* Create sampler view for texture A. 
+ * Swizzle: identity (R=0, G=1, B=2, A=3) */ + cmd_create_sampler_view(HANDLE_SAMPLER_VIEW_A, rc_texA.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, + 0, 0, /* first_level, last_level */ + 0, 1, 2, 3); /* RGBA identity swizzle */ + { + uint32_t views[] = { HANDLE_SAMPLER_VIEW_A }; + cmd_set_sampler_views(PIPE_SHADER_FRAGMENT, 1, views); + } + + /* Upload vertex data for quad A via RESOURCE_INLINE_WRITE */ + { + vertex_t verts[4]; + make_quad_vertices(verts, + (float)QUAD_A_X0, (float)QUAD_A_Y0, + (float)QUAD_A_X1, (float)QUAD_A_Y1, + (float)width, (float)height); + + printf("Quad A vertices (NDC):\n"); + for (int i = 0; i < 4; i++) { + printf(" v%d: pos=(%.4f, %.4f, %.4f, %.4f) tex=(%.4f, %.4f, %.4f, %.4f)\n", + i, verts[i].pos[0], verts[i].pos[1], verts[i].pos[2], verts[i].pos[3], + verts[i].tex[0], verts[i].tex[1], verts[i].tex[2], verts[i].tex[3]); + } + + /* Write quad A vertices at offset 0 in the VB resource */ + cmd_resource_inline_write(rc_vb.res_handle, 0, 0, 0, 0, + 0, 0, 0, /* x, y, z */ + sizeof(verts), 1, 1, /* w, h, d (bytes for buffer) */ + verts, sizeof(verts)); + } + + /* Draw quad A: 4 vertices, TRIANGLE_STRIP */ + cmd_draw_vbo(0, 4, PIPE_PRIM_TRIANGLE_STRIP, 0, 1, 0, 3); + + /* ---- Draw Quad B (texture B = BLUE) at right position ---- */ + + /* Create sampler view for texture B */ + cmd_create_sampler_view(HANDLE_SAMPLER_VIEW_B, rc_texB.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, + 0, 0, + 0, 1, 2, 3); + { + uint32_t views[] = { HANDLE_SAMPLER_VIEW_B }; + cmd_set_sampler_views(PIPE_SHADER_FRAGMENT, 1, views); + } + + /* Upload vertex data for quad B via RESOURCE_INLINE_WRITE */ + { + vertex_t verts[4]; + make_quad_vertices(verts, + (float)QUAD_B_X0, (float)QUAD_B_Y0, + (float)QUAD_B_X1, (float)QUAD_B_Y1, + (float)width, (float)height); + + printf("Quad B vertices (NDC):\n"); + for (int i = 0; i < 4; i++) { + printf(" v%d: pos=(%.4f, %.4f, %.4f, %.4f) tex=(%.4f, %.4f, %.4f, %.4f)\n", + i, verts[i].pos[0], verts[i].pos[1], verts[i].pos[2], 
verts[i].pos[3], + verts[i].tex[0], verts[i].tex[1], verts[i].tex[2], verts[i].tex[3]); + } + + /* Write quad B vertices at offset 128 to avoid overwriting quad A + * (4 vertices * 32 bytes = 128 bytes for quad A) */ + cmd_resource_inline_write(rc_vb.res_handle, 0, 0, 0, 0, + 128, 0, 0, /* x=128 (byte offset), y, z */ + sizeof(verts), 1, 1, /* w, h, d */ + verts, sizeof(verts)); + } + + /* Re-bind vertex buffer with offset 128 for quad B */ + { + uint32_t strides[] = { sizeof(vertex_t) }; + uint32_t offsets[] = { 128 }; + uint32_t res_handles[] = { rc_vb.res_handle }; + cmd_set_vertex_buffers(1, strides, offsets, res_handles); + } + + /* Draw quad B: 4 vertices, TRIANGLE_STRIP */ + cmd_draw_vbo(0, 4, PIPE_PRIM_TRIANGLE_STRIP, 0, 1, 0, 3); + + hex_dump_dwords("BATCH_3_COMPOSITE", cmd_buf, cmd_len); + + if (virtgpu_execbuffer(cmd_buf, cmd_len, all_bos, 4) < 0) return 1; + virtgpu_wait(rc_disp.bo_handle); + printf("Batch 3 (composite both textures): OK\n\n"); + + /* ===================================================================== + * Step 6: Display via DRM KMS + * ===================================================================== */ + + printf("=== Displaying composited result ===\n"); + + /* TRANSFER_FROM_HOST to pull GPU-rendered content into guest backing for DRM display */ + uint32_t disp_stride = rc_disp.stride; + if (disp_stride == 0) disp_stride = width * 4; + + if (virtgpu_transfer_from_host(rc_disp.bo_handle, disp_stride, width, height) < 0) + printf("TRANSFER_FROM_HOST (display readback): failed\n"); + else + printf("TRANSFER_FROM_HOST (display readback): OK\n"); + virtgpu_wait(rc_disp.bo_handle); + + uint32_t fb_id = 0; + int ret = drmModeAddFB(drm_fd, width, height, 24, 32, + disp_stride, rc_disp.bo_handle, &fb_id); + if (ret < 0) { + fprintf(stderr, "drmModeAddFB failed: %s\n", strerror(errno)); + return 1; + } + printf("AddFB: fb_id=%u\n", fb_id); + + ret = drmModeSetCrtc(drm_fd, crtc_id, fb_id, 0, 0, &conn_id, 1, &mode); + if (ret < 0) { + 
fprintf(stderr, "drmModeSetCrtc failed: %s\n", strerror(errno)); + drmModeRmFB(drm_fd, fb_id); + return 1; + } + printf("SetCrtc: OK -- display should show gray background + RED left + BLUE right\n\n"); + + /* Mark dirty to trigger display update */ + { + drmModeClip clip = { 0, 0, (uint16_t)width, (uint16_t)height }; + drmModeDirtyFB(drm_fd, fb_id, &clip, 1); + } + + /* ===================================================================== + * Step 7: Readback + pixel verification + * ===================================================================== */ + + printf("=== Pixel readback verification ===\n"); + + /* TRANSFER_FROM_HOST to get rendered pixels into guest backing */ + if (virtgpu_transfer_from_host(rc_disp.bo_handle, disp_stride, width, height) < 0) { + printf("TRANSFER_FROM_HOST: FAILED\n"); + } else { + printf("TRANSFER_FROM_HOST: OK\n"); + } + virtgpu_wait(rc_disp.bo_handle); + + /* MAP the display resource */ + struct drm_virtgpu_map vmap; + memset(&vmap, 0, sizeof(vmap)); + vmap.handle = rc_disp.bo_handle; + uint32_t *pixels = NULL; + uint32_t map_size = disp_stride * height; + + if (drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_MAP, &vmap) < 0) { + printf("VIRTGPU_MAP: FAILED -- %s\n", strerror(errno)); + } else { + pixels = mmap(NULL, map_size, PROT_READ | PROT_WRITE, + MAP_SHARED, drm_fd, vmap.offset); + if (pixels == MAP_FAILED) { + printf("mmap: FAILED -- %s\n", strerror(errno)); + pixels = NULL; + } else { + printf("mmap: OK (%u bytes at %p)\n", map_size, (void *)pixels); + } + } + + int pass_count = 0; + int fail_count = 0; + + if (pixels) { + uint32_t stride_px = disp_stride / 4; + + /* Sample pixel at center of quad A — should be RED. + * B8G8R8X8_UNORM byte order: B, G, R, X in memory. 
+ * RED = B=0x00, G=0x00, R=0xFF, X=0xFF => LE u32 = 0xFF0000FF + * Or X might be 0x00 => 0x000000FF + * Actually in B8G8R8X8: byte[0]=B, byte[1]=G, byte[2]=R, byte[3]=X + * As LE uint32: (X << 24) | (R << 16) | (G << 8) | B + * RED: B=0, G=0, R=0xFF => 0x??FF0000 where ?? depends on X channel */ + uint32_t px_red = pixels[SAMPLE_RED_Y * stride_px + SAMPLE_RED_X]; + uint32_t px_blue = pixels[SAMPLE_BLUE_Y * stride_px + SAMPLE_BLUE_X]; + uint32_t px_gray = pixels[SAMPLE_GRAY_Y * stride_px + SAMPLE_GRAY_X]; + + printf("\nPixel samples (B8G8R8X8_UNORM as LE uint32):\n"); + printf(" (%d,%d) = 0x%08X (expect RED: R channel high, B/G low)\n", + SAMPLE_RED_X, SAMPLE_RED_Y, px_red); + printf(" (%d,%d) = 0x%08X (expect BLUE: B channel high, R/G low)\n", + SAMPLE_BLUE_X, SAMPLE_BLUE_Y, px_blue); + printf(" (%d,%d) = 0x%08X (expect GRAY: R=G=B ~0x33)\n", + SAMPLE_GRAY_X, SAMPLE_GRAY_Y, px_gray); + + /* Extract channels from B8G8R8X8_UNORM (LE): + * B = byte 0 = bits [7:0] + * G = byte 1 = bits [15:8] + * R = byte 2 = bits [23:16] + * X = byte 3 = bits [31:24] + */ + #define GET_B(px) ((px) & 0xFF) + #define GET_G(px) (((px) >> 8) & 0xFF) + #define GET_R(px) (((px) >> 16) & 0xFF) + + /* Check RED pixel: R should be high (>= 0xC0), B and G should be low (<= 0x40) */ + uint8_t r_r = GET_R(px_red), r_g = GET_G(px_red), r_b = GET_B(px_red); + printf("\n RED check: R=%u G=%u B=%u ", r_r, r_g, r_b); + if (r_r >= 0xC0 && r_g <= 0x40 && r_b <= 0x40) { + printf("PASS\n"); + pass_count++; + } else { + printf("FAIL\n"); + fail_count++; + } + + /* Check BLUE pixel: B should be high, R and G should be low */ + uint8_t b_r = GET_R(px_blue), b_g = GET_G(px_blue), b_b = GET_B(px_blue); + printf(" BLUE check: R=%u G=%u B=%u ", b_r, b_g, b_b); + if (b_b >= 0xC0 && b_r <= 0x40 && b_g <= 0x40) { + printf("PASS\n"); + pass_count++; + } else { + printf("FAIL\n"); + fail_count++; + } + + /* Check GRAY pixel: R, G, B should all be similar and in ~0x20-0x40 range + * 0.2 * 255 = 51 = 0x33 */ + uint8_t 
g_r = GET_R(px_gray), g_g = GET_G(px_gray), g_b = GET_B(px_gray); + printf(" GRAY check: R=%u G=%u B=%u ", g_r, g_g, g_b); + if (g_r >= 0x20 && g_r <= 0x50 && + g_g >= 0x20 && g_g <= 0x50 && + g_b >= 0x20 && g_b <= 0x50 && + abs((int)g_r - (int)g_g) < 0x10 && + abs((int)g_r - (int)g_b) < 0x10) { + printf("PASS\n"); + pass_count++; + } else { + printf("FAIL\n"); + fail_count++; + } + + /* Print additional diagnostic pixels */ + printf("\nAdditional pixel samples:\n"); + /* Top-left of quad A */ + printf(" (%d,%d) = 0x%08X (quad A top-left)\n", + QUAD_A_X0 + 5, QUAD_A_Y0 + 5, + pixels[(QUAD_A_Y0 + 5) * stride_px + QUAD_A_X0 + 5]); + /* Top-left of quad B */ + printf(" (%d,%d) = 0x%08X (quad B top-left)\n", + QUAD_B_X0 + 5, QUAD_B_Y0 + 5, + pixels[(QUAD_B_Y0 + 5) * stride_px + QUAD_B_X0 + 5]); + /* Between the quads (should be gray) */ + printf(" (550,250) = 0x%08X (between quads, expect gray)\n", + pixels[250 * stride_px + 550]); + /* Bottom-right corner (should be gray) */ + printf(" (%u,%u) = 0x%08X (bottom-right corner)\n", + width - 5, height - 5, + pixels[(height - 5) * stride_px + width - 5]); + + munmap(pixels, map_size); + } else { + printf("Cannot verify pixels -- MAP failed\n"); + fail_count = 3; + } + + /* ===================================================================== + * Final verdict + * ===================================================================== */ + + printf("\n========================================\n"); + if (fail_count == 0 && pass_count == 3) { + printf("MULTI-TEXTURE TEST: PASS (%d/3 checks passed)\n", pass_count); + } else { + printf("MULTI-TEXTURE TEST: FAIL (%d passed, %d failed)\n", pass_count, fail_count); + } + printf("========================================\n\n"); + + /* Hold display for 5 seconds */ + printf("Holding display for 5 seconds...\n"); + sleep(5); + + /* Cleanup */ + if (saved_crtc) { + drmModeSetCrtc(drm_fd, saved_crtc->crtc_id, saved_crtc->buffer_id, + saved_crtc->x, saved_crtc->y, &conn_id, 1, + 
&saved_crtc->mode); + drmModeFreeCrtc(saved_crtc); + } + drmModeRmFB(drm_fd, fb_id); + close(drm_fd); + + printf("Done.\n"); + return (fail_count == 0) ? 0 : 1; +} diff --git a/userspace/programs/src/bcheck.rs b/userspace/programs/src/bcheck.rs index 208a3781..33e42a79 100644 --- a/userspace/programs/src/bcheck.rs +++ b/userspace/programs/src/bcheck.rs @@ -432,22 +432,29 @@ fn main() { let total_h = content_height(&tests); let max_scroll = (total_h - visible_h).max(0); let mut scroll_offset: i32 = 0; + let sleep_ts = libbreenix::types::Timespec { tv_sec: 0, tv_nsec: 50_000_000 }; // 50ms loop { + let mut need_redraw = false; for event in win.poll_events() { match event { Event::KeyPress { keycode, .. } => { match keycode { - 0x52 => scroll_offset = (scroll_offset - ROW_H).max(0), // Up - 0x51 => scroll_offset = (scroll_offset + ROW_H).min(max_scroll), // Down + 0x52 => { scroll_offset = (scroll_offset - ROW_H).max(0); need_redraw = true; } + 0x51 => { scroll_offset = (scroll_offset + ROW_H).min(max_scroll); need_redraw = true; } _ => {} } } + Event::CloseRequested => std::process::exit(0), _ => {} } } - render(win.framebuf(), &tests, scroll_offset); - let _ = win.present(); + if need_redraw { + render(win.framebuf(), &tests, scroll_offset); + let _ = win.present(); + } else { + let _ = time::nanosleep(&sleep_ts); + } } } diff --git a/userspace/programs/src/bwm.rs b/userspace/programs/src/bwm.rs index 4a1e686a..f3b6628c 100644 --- a/userspace/programs/src/bwm.rs +++ b/userspace/programs/src/bwm.rs @@ -188,10 +188,14 @@ struct Window { /// Stable ordering for appbar (assigned at discovery time, never changes) creation_order: u32, /// Direct-mapped pointer to client window's pixel buffer (read-only, MAP_SHARED) + /// Stored for future per-window direct blit (currently compositor uses bulk composite). 
+ #[allow(dead_code)] mapped_ptr: *const u32, /// Client window buffer width (from map_window_buffer) + #[allow(dead_code)] mapped_w: u32, /// Client window buffer height (from map_window_buffer) + #[allow(dead_code)] mapped_h: u32, } @@ -247,11 +251,6 @@ impl Window { } } - -fn rects_overlap(a: (i32, i32, i32, i32), b: (i32, i32, i32, i32)) -> bool { - a.0 < b.2 && a.2 > b.0 && a.1 < b.3 && a.3 > b.1 -} - // ─── Drawing Helpers ───────────────────────────────────────────────────────── fn fill_rect(fb: &mut FrameBuf, x: i32, y: i32, w: usize, h: usize, color: Color) { @@ -624,6 +623,11 @@ fn discover_windows(windows: &mut Vec, screen_w: usize, screen_h: usize, } }; + // Tell kernel where the client content goes on screen (for GPU compositing) + let content_x = cascade_x + BORDER_WIDTH as i32; + let content_y = cascade_y + TITLE_BAR_HEIGHT as i32 + BORDER_WIDTH as i32; + let _ = graphics::set_window_position(info.buffer_id, content_x, content_y); + let order = *next_order; *next_order += 1; windows.push(Window { @@ -640,144 +644,15 @@ fn discover_windows(windows: &mut Vec, screen_w: usize, screen_h: usize, removed || added } -// ─── Client Pixel Blitting ────────────────────────────────────────────────── - -/// Core pixel blit — direct u32 writes to compositor buffer for speed. -/// Bypasses FrameBuf::put_pixel which does per-pixel bounds checking + color conversion. 
-fn blit_pixels_to_fb(fb: &mut FrameBuf, win: &Window, src: &[u32], w: usize, h: usize) { - let cx = win.content_x(); - let cy = win.content_y(); - let cw = win.content_width(); - let ch = win.content_height(); - let pw = w.min(cw); - let ph = h.min(ch); - let fb_w = fb.width; - let fb_h = fb.height; - // Get raw u32 pointer to compositor buffer - let fb_ptr = fb.raw_ptr() as *mut u32; - for row in 0..ph { - let py = (cy + row as i32) as usize; - if py >= fb_h { continue; } - let dst_row_start = py * fb_w; - let src_row_start = row * w; - let x_start = cx.max(0) as usize; - let x_end = ((cx + pw as i32) as usize).min(fb_w); - let src_offset = if cx < 0 { (-cx) as usize } else { 0 }; - if x_start >= x_end { continue; } - let count = x_end - x_start; - let si = src_row_start + src_offset; - if si + count > src.len() { continue; } - unsafe { - core::ptr::copy_nonoverlapping( - src.as_ptr().add(si), - fb_ptr.add(dst_row_start + x_start), - count, - ); - } - } -} - -/// Check if a window has new pixels and blit from mapped memory to compositor. -/// Skips pixels covered by higher-z windows (occluders) so no z-repair is needed. -/// Returns true if new data was available. -fn blit_client_pixels(fb: &mut FrameBuf, win: &Window, - occluders: &[(i32, i32, i32, i32)]) -> bool { - if win.mapped_ptr.is_null() || win.mapped_w == 0 || win.mapped_h == 0 { - return false; - } - let dirty = graphics::check_window_dirty(win.window_id).unwrap_or(false); - if !dirty { return false; } - - if occluders.is_empty() { - blit_mapped_pixels(fb, win); - return true; - } - - // Occluded blit: for each row, skip pixels covered by higher windows. 
- let w = win.mapped_w as usize; - let h = win.mapped_h as usize; - let src = unsafe { core::slice::from_raw_parts(win.mapped_ptr, w * h) }; - - let cx = win.content_x(); - let cy = win.content_y(); - let cw = win.content_width().min(w); - let ch = win.content_height().min(h); - let fb_w = fb.width; - let fb_h = fb.height; - let fb_ptr = fb.raw_ptr() as *mut u32; - - for row in 0..ch { - let py = cy + row as i32; - if py < 0 || py >= fb_h as i32 { continue; } - let row_x_start = cx.max(0) as usize; - let row_x_end = ((cx + cw as i32) as usize).min(fb_w); - if row_x_start >= row_x_end { continue; } - - // Build visible spans by subtracting occluder columns from the full row - let mut spans = [(0usize, 0usize); 8]; - let mut n_spans = 1; - spans[0] = (row_x_start, row_x_end); - - for &(ox0, oy0, ox1, oy1) in occluders { - if py < oy0 || py >= oy1 { continue; } - let os = ox0.max(0) as usize; - let oe = ox1.max(0) as usize; - let mut new_spans = [(0usize, 0usize); 8]; - let mut nc = 0; - for k in 0..n_spans { - let (sx, ex) = spans[k]; - if sx >= ex { continue; } - if oe <= sx || os >= ex { - if nc < 8 { new_spans[nc] = (sx, ex); nc += 1; } - } else { - if sx < os && nc < 8 { new_spans[nc] = (sx, os); nc += 1; } - if ex > oe && nc < 8 { new_spans[nc] = (oe, ex); nc += 1; } - } - } - spans = new_spans; - n_spans = nc; - } - - let src_row = row * w; - let src_col_base = if cx < 0 { (-cx) as usize } else { 0 }; - for k in 0..n_spans { - let (sx, ex) = spans[k]; - if sx >= ex { continue; } - let count = ex - sx; - let si = src_row + src_col_base + (sx - row_x_start); - if si + count > w * h { continue; } - unsafe { - core::ptr::copy_nonoverlapping( - src.as_ptr().add(si), - fb_ptr.add(py as usize * fb_w + sx), - count, - ); - } - } - } - true -} - -/// Blit a window's pixels from its mapped memory to the compositor buffer. 
-fn blit_mapped_pixels(fb: &mut FrameBuf, win: &Window) { - if win.mapped_ptr.is_null() { return; } - let w = win.mapped_w as usize; - let h = win.mapped_h as usize; - let pixel_count = w * h; - let src = unsafe { core::slice::from_raw_parts(win.mapped_ptr, pixel_count) }; - blit_pixels_to_fb(fb, win, src, w, h); -} - /// Redraw all windows in z-order (index 0 = bottom), plus taskbar and app bar. -/// Reads directly from mapped memory (zero-copy from client window pages). +/// Window frames and decorations go into the compositor buffer; GPU compositing +/// handles client content via per-window textured quads. fn redraw_all_windows(fb: &mut FrameBuf, windows: &[Window], focused_win: usize, clock_text: &[u8]) { draw_taskbar(fb, clock_text); for i in 0..windows.len() { if windows[i].minimized { continue; } draw_window_frame(fb, &windows[i], i == focused_win); - if windows[i].window_id != 0 { - blit_mapped_pixels(fb, &windows[i]); - } + // GPU compositing handles client content — don't blit here } draw_appbar(fb, windows, focused_win); } @@ -835,6 +710,7 @@ fn compose_partial_redraw( sbuf[start..end].copy_from_slice(&bg[start..end]); } // 2. 
Redraw UI elements that intersect dirty region + // GPU compositing handles client content — only draw frames/decorations if dy0 < TASKBAR_HEIGHT { draw_taskbar(sfb, clock); } @@ -845,9 +721,6 @@ fn compose_partial_redraw( && (wy1 as usize) > dy0 && (wy0 as usize) < dy1 { draw_window_frame(sfb, &windows[i], i == focused); - if windows[i].window_id != 0 { - blit_mapped_pixels(sfb, &windows[i]); - } } } if dy1 > screen_h - APPBAR_HEIGHT { @@ -861,6 +734,7 @@ fn compose_partial_redraw( } } else { // Non-shadow path: restore bg region, redraw affected windows + // GPU compositing handles client content — only draw frames/decorations for row in dy0..dy1 { let start = row * screen_w + dx0; let end = row * screen_w + dx1; @@ -876,9 +750,6 @@ fn compose_partial_redraw( && (wy1 as usize) > dy0 && (wy0 as usize) < dy1 { draw_window_frame(fb, &windows[i], i == focused); - if windows[i].window_id != 0 { - blit_mapped_pixels(fb, &windows[i]); - } } } if dy1 > screen_h - APPBAR_HEIGHT { @@ -1008,17 +879,6 @@ fn main() { let mut read_buf = [0u8; 512]; let mut poll_fds = [io::PollFd { fd: 0, events: io::poll_events::POLLIN as i16, revents: 0 }]; - // Performance tracing - let mut perf_frame: u64 = 0; - let mut perf_total_ns: u64 = 0; - let mut perf_composites: u64 = 0; - let mut perf_waits: u64 = 0; - - fn mono_ns() -> u64 { - let ts = libbreenix::time::now_monotonic().unwrap_or_default(); - (ts.tv_sec as u64) * 1_000_000_000 + (ts.tv_nsec as u64) - } - // Registry generation tracking for compositor_wait let mut registry_gen: u32 = 0; @@ -1036,9 +896,6 @@ fn main() { // 16ms timeout ensures keyboard input via stdin is checked at least ~60Hz. let (ready, new_reg_gen) = graphics::compositor_wait(16, registry_gen).unwrap_or((0, registry_gen)); registry_gen = new_reg_gen; - perf_waits += 1; - - let t0 = mono_ns(); // ── 1. 
Discover new/removed client windows (only when registry changed) ── if ready & graphics::COMPOSITOR_READY_REGISTRY != 0 { @@ -1121,6 +978,12 @@ fn main() { let (ox0, oy0, ox1, oy1) = windows[win_idx].bounds(); windows[win_idx].x = new_x; windows[win_idx].y = new_y; + // Update kernel window position for GPU compositing + if windows[win_idx].window_id != 0 { + let cx = windows[win_idx].content_x(); + let cy = windows[win_idx].content_y(); + let _ = graphics::set_window_position(windows[win_idx].window_id, cx, cy); + } // Dirty region = union of old and new bounds let (nx0, ny0, nx1, ny1) = windows[win_idx].bounds(); let dr_x0 = ox0.min(nx0).max(0) as usize; @@ -1271,33 +1134,21 @@ fn main() { } } - // ── 5. Blit dirty client window pixels (occluded by higher-z windows) ── + // ── 5. GPU compositing handles window content — just check which are dirty ── // Skip entirely if compositor_wait didn't report dirty content if ready & graphics::COMPOSITOR_READY_DIRTY != 0 { - for i in 0..windows.len().min(16) { - if windows[i].window_id != 0 && !windows[i].minimized { - let mut occ = [(0i32, 0i32, 0i32, 0i32); 16]; - let mut n_occ = 0; - let ib = windows[i].bounds(); - for j in (i + 1)..windows.len().min(16) { - if !windows[j].minimized { - let jb = windows[j].bounds(); - if rects_overlap(ib, jb) && n_occ < 16 { - occ[n_occ] = jb; - n_occ += 1; - } + for i in 0..windows.len().min(16) { + if windows[i].window_id != 0 && !windows[i].minimized { + if graphics::check_window_dirty(windows[i].window_id).unwrap_or(false) { + content_dirty = true; + let (bx0, by0, bx1, by1) = windows[i].bounds(); + dirty_x0 = dirty_x0.min(bx0); + dirty_y0 = dirty_y0.min(by0); + dirty_x1 = dirty_x1.max(bx1); + dirty_y1 = dirty_y1.max(by1); } } - if blit_client_pixels(&mut fb, &windows[i], &occ[..n_occ]) { - content_dirty = true; - let (bx0, by0, bx1, by1) = ib; - dirty_x0 = dirty_x0.min(bx0); - dirty_y0 = dirty_y0.min(by0); - dirty_x1 = dirty_x1.max(bx1); - dirty_y1 = dirty_y1.max(by1); - } } - } } 
// end if DIRTY // ── 5b. Update clock (once per second) ── @@ -1328,7 +1179,6 @@ fn main() { ); full_redraw = false; content_dirty = false; - perf_composites += 1; } else if content_dirty { let sw = screen_w as i32; let sh = screen_h as i32; @@ -1341,7 +1191,6 @@ fn main() { 2, dx, dy, dw, dh, ); content_dirty = false; - perf_composites += 1; } else if mouse_moved_this_frame { // Mouse-only update: no content changed, but kernel draws cursor let _ = graphics::virgl_composite_windows_rect( @@ -1351,19 +1200,5 @@ fn main() { } // No sleep — compositor_wait handles blocking - let t_end = mono_ns(); - - perf_total_ns += t_end.saturating_sub(t0); - perf_frame += 1; - - if perf_frame % 500 == 0 { - let avg_us = perf_total_ns / 1000 / 500; - print!("[bwm-perf] iter={} composites={} waits={} avg_work={}us\n", - perf_frame, perf_composites, perf_waits, avg_us, - ); - perf_total_ns = 0; - perf_composites = 0; - perf_waits = 0; - } } } diff --git a/userspace/programs/src/init.rs b/userspace/programs/src/init.rs index c1616b21..da5d66ce 100644 --- a/userspace/programs/src/init.rs +++ b/userspace/programs/src/init.rs @@ -14,10 +14,10 @@ //! BWM is a pure compositor: it no longer spawns terminals internally. Instead, //! bterm and blog are standalone Breengel GUI apps that register windows with BWM. //! -//! Main loop reaps terminated children with waitpid(WNOHANG) and respawns +//! Main loop blocks on waitpid() until a child exits, then respawns //! crashed services with backoff to prevent tight respawn loops. -use libbreenix::process::{fork, exec, execv, waitpid, getpid, yield_now, ForkResult, WNOHANG}; +use libbreenix::process::{fork, exec, execv, waitpid, getpid, yield_now, ForkResult}; const TELNETD_PATH: &[u8] = b"/sbin/telnetd\0"; const BLOGD_PATH: &[u8] = b"/sbin/blogd\0"; @@ -188,57 +188,61 @@ fn main() { print!("[init] BUSYBOX TEST: cat /etc/passwd\n"); test_busybox_cat(); - // Main loop: reap zombies, respawn crashed services. 
+ // Main loop: block on waitpid until a child exits, then respawn if needed. let mut status: i32 = 0; loop { - match waitpid(-1, &mut status as *mut i32, WNOHANG) { - Ok(reaped_pid) => { - let reaped = reaped_pid.raw() as i64; - if reaped > 0 { - if reaped == bwm_pid { - print!("[init] BWM exited (status {})\n", status); - bwm_pid = try_respawn(BWM_PATH, "bwm", &mut bwm_failures); - if bwm_pid == -1 { - print!("[init] BWM failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == bterm_pid { - print!("[init] bterm exited (status {})\n", status); - bterm_pid = try_respawn(BTERM_PATH, "bterm", &mut bterm_failures); - if bterm_pid == -1 { - print!("[init] bterm failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == blog_pid { - print!("[init] blog exited (status {})\n", status); - blog_pid = try_respawn(BLOG_PATH, "blog", &mut blog_failures); - if blog_pid == -1 { - print!("[init] blog failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == bounce_pid { - print!("[init] bounce exited (status {})\n", status); - bounce_pid = try_respawn(BOUNCE_PATH, "bounce", &mut bounce_failures); - if bounce_pid == -1 { - print!("[init] bounce failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == bcheck_pid { - print!("[init] bcheck exited (status {})\n", status); - // Don't respawn — bcheck runs once then displays results - bcheck_pid = -1; - } else if reaped == blogd_pid { - print!("[init] blogd exited (status {})\n", status); - blogd_pid = try_respawn(BLOGD_PATH, "blogd", &mut blogd_failures); - if blogd_pid == -1 { - print!("[init] blogd failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == telnetd_pid { - telnetd_pid = try_respawn(TELNETD_PATH, "telnetd", &mut telnetd_failures); - if telnetd_pid == -1 && telnetd_failures >= MAX_RESPAWN_FAILURES { - print!("[init] telnetd unavailable, continuing without it\n"); - } - } - } + let reaped = match waitpid(-1, &mut 
status as *mut i32, 0) { + Ok(pid) => pid.raw() as i64, + Err(_) => { + // ECHILD — no children at all. Sleep to avoid spinning. + let ts = libbreenix::types::Timespec { tv_sec: 1, tv_nsec: 0 }; + let _ = libbreenix::time::nanosleep(&ts); + continue; + } + }; + + if reaped <= 0 { + continue; + } + + if reaped == bwm_pid { + print!("[init] BWM exited (status {})\n", status); + bwm_pid = try_respawn(BWM_PATH, "bwm", &mut bwm_failures); + if bwm_pid == -1 { + print!("[init] BWM failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == bterm_pid { + print!("[init] bterm exited (status {})\n", status); + bterm_pid = try_respawn(BTERM_PATH, "bterm", &mut bterm_failures); + if bterm_pid == -1 { + print!("[init] bterm failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == blog_pid { + print!("[init] blog exited (status {})\n", status); + blog_pid = try_respawn(BLOG_PATH, "blog", &mut blog_failures); + if blog_pid == -1 { + print!("[init] blog failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == bounce_pid { + print!("[init] bounce exited (status {})\n", status); + bounce_pid = try_respawn(BOUNCE_PATH, "bounce", &mut bounce_failures); + if bounce_pid == -1 { + print!("[init] bounce failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == bcheck_pid { + print!("[init] bcheck exited (status {})\n", status); + bcheck_pid = -1; + } else if reaped == blogd_pid { + print!("[init] blogd exited (status {})\n", status); + blogd_pid = try_respawn(BLOGD_PATH, "blogd", &mut blogd_failures); + if blogd_pid == -1 { + print!("[init] blogd failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == telnetd_pid { + telnetd_pid = try_respawn(TELNETD_PATH, "telnetd", &mut telnetd_failures); + if telnetd_pid == -1 && telnetd_failures >= MAX_RESPAWN_FAILURES { + print!("[init] telnetd unavailable, continuing without it\n"); } - Err(_) => {} } - let _ = yield_now(); } }