diff --git a/build.rs b/build.rs index f35372b0..edc9f637 100644 --- a/build.rs +++ b/build.rs @@ -26,36 +26,26 @@ fn main() { boot_config.frame_buffer.minimum_framebuffer_height = Some(fb_height); disk_builder.set_boot_config(&boot_config); - println!("cargo:warning=Configured framebuffer: {}x{}", fb_width, fb_height); - // specify output paths let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); let uefi_path = out_dir.join("breenix-uefi.img"); let bios_path = out_dir.join("breenix-bios.img"); // Only create the UEFI image by default. BIOS image can be enabled via env var. - println!("cargo:warning=Creating UEFI disk image at {}", uefi_path.display()); disk_builder .create_uefi_image(&uefi_path) .expect("failed to create UEFI disk image"); let build_bios = env::var("BREENIX_BUILD_BIOS").is_ok(); if build_bios { - println!( - "cargo:warning=BREENIX_BUILD_BIOS set; creating BIOS disk image at {}", - bios_path.display() - ); // New bootloader API removed BIOS builder; use UEFI image as placeholder to keep API surface stable. // If BIOS support is needed, switch to a branch that still exposes create_bios_image or vendor our own. 
- println!("cargo:warning=bootloader no longer provides create_bios_image; duplicating UEFI image for BIOS placeholder"); disk_builder .create_uefi_image(&bios_path) .expect("failed to create BIOS placeholder image"); - } else { - println!("cargo:warning=Skipping BIOS image creation (BREENIX_BUILD_BIOS not set)"); } // pass the disk image paths via environment variables println!("cargo:rustc-env=UEFI_IMAGE={}", uefi_path.display()); println!("cargo:rustc-env=BIOS_IMAGE={}", bios_path.display()); -} \ No newline at end of file +} diff --git a/docs/planning/PCI_MSI_NETWORKING_PLAN.md b/docs/planning/PCI_MSI_NETWORKING_PLAN.md new file mode 100644 index 00000000..06398bc6 --- /dev/null +++ b/docs/planning/PCI_MSI_NETWORKING_PLAN.md @@ -0,0 +1,267 @@ +# PCI MSI Interrupt-Driven Networking + +## Problem + +ARM64 network drivers (VirtIO net PCI on Parallels, e1000 on VMware) rely on +timer-based polling at 100Hz (every 10ms). This adds 5-10ms latency per +network round-trip, which compounds across DNS, TCP handshake, and HTTP +response phases. On x86, the e1000 has a proper IRQ 11 handler that processes +packets immediately via softirq. + +## Goal + +Replace timer-based polling with interrupt-driven packet processing on ARM64, +achieving sub-millisecond packet delivery latency. 
+ +--- + +## Phase 1: VirtIO Net PCI MSI on Parallels (Priority: Immediate) + +### Why This Is Easy + +All infrastructure already exists and is proven working: +- **GIC driver** (`gic.rs`): `enable_spi()`, `disable_spi()`, + `configure_spi_edge_triggered()`, `clear_spi_pending()` — all present +- **PCI driver** (`pci.rs`): `find_msi_capability()`, `configure_msi()`, + `disable_intx()` — all present +- **GICv2m MSI** (`platform_config.rs`): `probe_gicv2m()`, + `allocate_msi_spi()` — already used by xHCI and GPU PCI drivers on Parallels +- **net_pci.rs** already has `handle_interrupt()` (line 552) that reads ISR + and raises NetRx softirq — it's just never called from the interrupt path + +### Files to Modify + +#### 1. `kernel/src/drivers/virtio/net_pci.rs` + +Add MSI setup following the exact pattern from `xhci.rs:setup_xhci_msi()`: + +```rust +static NET_PCI_IRQ: AtomicU32 = AtomicU32::new(0); + +pub fn get_irq() -> Option { + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq != 0 { Some(irq) } else { None } +} + +fn setup_net_pci_msi(pci_dev: &pci::Device) -> Option { + // 1. Find MSI capability (cap ID 0x05) + let cap_offset = pci_dev.find_msi_capability()?; + // 2. Probe GICv2m (already probed by xHCI, returns cached value) + let gicv2m_base = platform_config::gicv2m_base_phys()?; + // 3. Allocate SPI from GICv2m pool + let spi = platform_config::allocate_msi_spi()?; + // 4. Program MSI: address = GICv2m doorbell, data = SPI number + pci_dev.configure_msi(cap_offset, gicv2m_base + 0x40, spi); + // 5. Disable INTx (MSI replaces it) + pci_dev.disable_intx(); + // 6. Configure GIC: edge-triggered, enable SPI + gic::configure_spi_edge_triggered(spi); + gic::enable_spi(spi); + Some(spi) +} +``` + +In `init()`, after device setup: call `setup_net_pci_msi()`, store result in +`NET_PCI_IRQ`. 
+ +Update `handle_interrupt()` with disable/clear/ack/enable SPI pattern (matching +the xHCI and GPU handlers): + +```rust +pub fn handle_interrupt() { + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq != 0 { + gic::disable_spi(irq); + gic::clear_spi_pending(irq); + } + // Read ISR status register (existing code — auto-acks on read for legacy VirtIO) + // Raise NetRx softirq (existing code) + if irq != 0 { + gic::enable_spi(irq); + } +} +``` + +#### 2. `kernel/src/arch_impl/aarch64/exception.rs` + +Add dispatch entry in the SPI match arm (32..=1019), alongside existing GPU +PCI handler: + +```rust +if let Some(net_pci_irq) = crate::drivers::virtio::net_pci::get_irq() { + if irq_id == net_pci_irq { + crate::drivers::virtio::net_pci::handle_interrupt(); + } +} +``` + +#### 3. `kernel/src/arch_impl/aarch64/timer_interrupt.rs` + +Conditionalize polling — only poll when no MSI IRQ is configured: + +```rust +if !crate::drivers::virtio::net_pci::get_irq().is_some() + && (net_pci::is_initialized() || e1000::is_initialized()) + && _count % 10 == 0 +{ + raise_softirq(SoftirqType::NetRx); +} +``` + +### Verification + +- DNS resolution should complete in <200ms (was 4-5 seconds) +- HTTP fetch should complete in <2 seconds (was 10 seconds) +- `cat /proc/interrupts` or trace counters should show NIC interrupts firing + +--- + +## Phase 2: E1000 MSI on VMware (Priority: Next) + +VMware Fusion uses GICv3 with ITS (Interrupt Translation Service), not GICv2m. +This is a different MSI delivery mechanism. + +### Approach A: GICv3 ITS (Correct, Complex) + +The ITS provides MSI translation for GICv3 systems: + +1. **Discover ITS**: Parse ACPI MADT for ITS entry, or scan GIC redistributor + space. ITS is typically at a well-known address (e.g., 0x0801_0000 on + VMware virt). + +2. **Initialize ITS**: + - Allocate command queue (4KB aligned, mapped uncacheable) + - Allocate device table and collection table + - Enable ITS via GITS_CTLR + +3. 
**Per-device setup**: + - `MAPD` command: map device ID to interrupt table + - `MAPTI` command: map event ID to LPI (physical interrupt) + - `MAPI` command: map interrupt to collection (target CPU) + - `INV` command: invalidate cached translation + +4. **MSI configuration**: + - MSI address = `GITS_TRANSLATER` physical address + - MSI data = device-specific event ID + - Program via `pci_dev.configure_msi(cap, its_translater, event_id)` + +5. **IRQ handling**: LPIs are delivered via GICv3 ICC_IAR1_EL1, same as SPIs. + Dispatch by LPI number in exception.rs. + +**Estimated effort**: 200-400 lines of new code for ITS initialization + per-device +setup. Most complex part is the command queue protocol. + +### Approach B: INTx via ACPI _PRT (Simpler, Limited) + +Parse the ACPI DSDT for PCI interrupt routing: + +1. **Parse ACPI _PRT**: The PCI Routing Table maps (slot, pin) -> GIC SPI. + Breenix already has basic ACPI parsing for MADT/SPCR. Extend to parse + DSDT for _PRT entries. + +2. **Configure SPI**: Once the SPI number is known from _PRT, configure it as + level-triggered (INTx is level, not edge), enable in GIC. + +3. **Shared interrupt handling**: INTx lines may be shared between devices. + Handler must check each device's ISR before claiming the interrupt. + +**Estimated effort**: 100-200 lines for _PRT parsing + level-triggered handler. + +### Approach C: VMware-Specific Probe (Pragmatic) + +If VMware always maps e1000 INTx to a known SPI (discoverable from the device +tree or hardcoded for the vmware-aarch64 machine model), we could: + +1. Read `interrupt_line` from PCI config space (currently 0xFF on ARM64) +2. Use VMware's DT to find the actual SPI mapping +3. Hardcode the mapping as a platform quirk if it's stable + +**Estimated effort**: 20-50 lines, but fragile. + +### Recommendation + +Start with Approach B (_PRT parsing) since ACPI infrastructure partially exists. +Defer ITS to Phase 3 when multiple PCI devices need independent MSI vectors. 
+ +--- + +## Phase 3: Generic PCI Interrupt Framework (Priority: Future) + +### Dynamic IRQ Dispatch Table + +Replace the chain of `if let Some(irq)` in exception.rs with a registration- +based dispatch: + +```rust +static PCI_IRQ_HANDLERS: Mutex<[(u32, fn()); 16]>; + +pub fn register_pci_irq(spi: u32, handler: fn()) { ... } +``` + +This allows any PCI driver to register its own handler without modifying +exception.rs. + +### Full ITS Support + +For GICv3 platforms (VMware, newer QEMU configs, real hardware): +- ITS command queue management +- LPI configuration tables (PROPBASER, PENDBASER) +- Per-device interrupt translation +- Multi-CPU interrupt routing via collections + +### QEMU Virt INTx Mapping + +QEMU virt machine maps PCI INTx to fixed SPIs: +- INTA -> SPI 3 (GIC INTID 35) +- INTB -> SPI 4 (GIC INTID 36) +- INTC -> SPI 5 (GIC INTID 37) +- INTD -> SPI 6 (GIC INTID 38) + +With swizzling: `actual_pin = (slot + pin - 1) % 4` + +These are level-triggered and shared, requiring ISR checks per device. + +--- + +## Architecture Reference + +### Current Packet Receive Path (Polling) + +``` +Timer interrupt (1000Hz) + -> every 10th tick: raise_softirq(NetRx) + -> net_rx_softirq_handler() + -> process_rx() + -> net_pci::receive() / e1000::receive() + -> process_packet() + -> udp::enqueue_packet() / tcp::handle_segment() + -> wake blocked thread +``` + +Latency: 0-10ms (mean 5ms) per packet. + +### Target Packet Receive Path (MSI) + +``` +NIC MSI interrupt -> GIC SPI + -> exception.rs handle_irq() + -> net_pci::handle_interrupt() + -> read ISR (auto-ack) + -> raise_softirq(NetRx) + -> net_rx_softirq_handler() + -> process_rx() + -> ... (same as above) +``` + +Latency: <100us per packet (GIC + softirq overhead). + +### MSI Delivery on Parallels (GICv2m) + +``` +Device writes MSI data to GICv2m doorbell address: + addr = GICV2M_BASE + 0x40 (MSI_SETSPI_NS) + data = allocated SPI number + +GICv2m translates write to GIC SPI assertion. 
+GIC delivers SPI to target CPU via ICC_IAR1_EL1. +``` diff --git a/docs/planning/gpu-rendering-attack-plan.md b/docs/planning/gpu-rendering-attack-plan.md new file mode 100644 index 00000000..089b8ab9 --- /dev/null +++ b/docs/planning/gpu-rendering-attack-plan.md @@ -0,0 +1,164 @@ +# GPU-Only Rendering Attack Plan + +## Problem + +The current rendering pipeline wastes CPU on work the GPU should do: + +1. **BWM compositing**: CPU-blits window pixels into compositor texture row-by-row + (`blit_client_pixels`), then does TRANSFER_TO_HOST_3D to upload to GPU. Linux ftrace + proved this transfer is unnecessary: Mesa's per-frame path is just + **SUBMIT_3D -> SET_SCANOUT -> RESOURCE_FLUSH** with zero CPU transfers. + +2. **Bounce (and all Breengel clients)**: Software-renders pixels into shared memory + buffers. Bounce draws circles pixel-by-pixel on CPU. All rendering should use VirGL + GPU primitives (DRAW_VBO with shaders). + +3. **Per-window texture "limitation" was a bug**: The note "per-window VirGL textures + DON'T work" was a bug in our resource creation, not a Parallels limitation. Linux + probe VM proved multiple VirGL textures work correctly on identical hardware. + +## Target Architecture + +``` +Client (bounce, bterm, etc.) BWM Compositor + | | + | VirGL SUBMIT_3D | VirGL SUBMIT_3D + | (draw geometry into | (draw textured quads for + | per-window texture) | each window texture onto + | | compositor surface) + v v + GPU renders to GPU composites all windows + window texture -> SET_SCANOUT -> RESOURCE_FLUSH +``` + +Zero CPU pixel copying. Zero TRANSFER_TO_HOST_3D per frame. + +## Phase 1: Fix Per-Window VirGL Textures + +**Goal**: Create multiple VirGL TEXTURE_2D resources that can be rendered to and sampled from. + +### Debugging Approach (Linux-first, per proven methodology) + +1. 
On Linux probe VM, write a test program that: + - Creates 2+ RESOURCE_CREATE_3D textures (TEXTURE_2D, B8G8R8X8_UNORM) + - ATTACH_BACKING with paged scatter-gather for each + - SUBMIT_3D: render different colors into each texture (set as render target, CLEAR) + - SUBMIT_3D: sample from both textures as textured quads onto a third surface + - SET_SCANOUT + RESOURCE_FLUSH + - Verify both textures display correctly + +2. If it works on Linux (expected), capture the exact VirGL byte sequence with + virgl_intercept.c LD_PRELOAD. + +3. Port the exact bytes to Breenix. If it fails, diff against the Linux bytes to find + the resource creation/backing bug. + +### Likely Bug Candidates + +- Missing ATTACH_BACKING on new resources (paged scatter-gather required) +- Missing CTX_ATTACH_RESOURCE for new resources +- Missing "priming" TRANSFER_TO_HOST_3D (required once per resource, not per frame) +- Wrong bind flags (need RENDER_TARGET | SAMPLER_VIEW at minimum) +- Handle collisions in virglrenderer hash table (handles must be globally unique) + +### Files +- `kernel/src/drivers/virtio/gpu_pci.rs` — resource creation, backing attachment +- `kernel/src/drivers/virtio/virgl.rs` — VirGL command encoding + +## Phase 2: GPU-Based BWM Compositing + +**Goal**: BWM composites windows using GPU textured quads instead of CPU blit. + +### Architecture + +1. Each registered window gets a VirGL TEXTURE_2D resource (created once) +2. Window pixel data lives in the texture's backing pages (MAP_SHARED to client) +3. Per-frame, BWM issues one SUBMIT_3D batch: + - For each visible window: create_sampler_view on window texture, bind as FS input, + DRAW_VBO textured quad at window position + - Background quad rendered first, windows in z-order on top +4. SET_SCANOUT + RESOURCE_FLUSH (matches Linux per-frame sequence) + +### Key Change: No TRANSFER_TO_HOST_3D Per Frame + +The current pipeline does TRANSFER_TO_HOST_3D every frame to upload pixel data. 
Linux +proves this is unnecessary — the host reads directly from the GPU texture's backing +pages when rendering via SUBMIT_3D. The one-time "priming" TRANSFER_TO_HOST_3D at +resource creation is sufficient. + +### Window Dirty Tracking + +When a client calls mark_window_dirty, BWM knows to include that window in the next +SUBMIT_3D batch. Clean windows can be skipped (their texture is already on the GPU +from the previous frame). + +### Files +- `userspace/programs/src/bwm.rs` — compositor main loop, replace blit_client_pixels +- `kernel/src/syscall/graphics.rs` — window buffer syscalls, texture resource management +- `kernel/src/drivers/virtio/gpu_pci.rs` — per-window resource creation + +## Phase 3: Client-Side GPU Rendering (Bounce) + +**Goal**: Bounce renders spheres using VirGL DRAW_VBO instead of CPU pixel pushing. + +### Architecture + +1. Bounce creates its window (gets a VirGL texture resource as render target) +2. Each frame, bounce issues VirGL commands via a new syscall: + - Set window texture as render target + - CLEAR background + - For each sphere: DRAW_VBO with colored vertices (triangle fan or instanced quad + with circle fragment shader) +3. Calls mark_window_dirty to trigger BWM compositing + +### New API: Breengel GPU Drawing + +Breengel needs a GPU drawing API so clients don't need to encode raw VirGL: + +```rust +// Proposed Breengel GPU API +impl Window { + fn begin_frame(&mut self); + fn clear(&mut self, color: Color); + fn draw_circle(&mut self, cx: i32, cy: i32, radius: i32, color: Color); + fn draw_rect(&mut self, x: i32, y: i32, w: i32, h: i32, color: Color); + fn draw_text(&mut self, text: &[u8], x: i32, y: i32, color: Color); + fn end_frame(&mut self); // triggers SUBMIT_3D + mark_dirty +} +``` + +Under the hood, these accumulate VirGL commands and submit in one batch. 
+ +### Files +- `libs/breengel/src/lib.rs` — GPU drawing API +- `userspace/programs/src/bounce.rs` — convert to GPU rendering +- `kernel/src/syscall/graphics.rs` — new syscall for client SUBMIT_3D + +## Phase 4: Text Rendering on GPU + +**Goal**: bterm, bcheck, btop render text using GPU textured quads with a font atlas. + +### Architecture + +1. Upload bitmap font as a VirGL texture (one-time) +2. Each glyph = textured quad sampling from the font atlas +3. Text rendering becomes a batch of DRAW_VBO calls with texture coordinates + +This eliminates the biggest CPU cost in terminal rendering — drawing characters +pixel-by-pixel into framebuffers. + +## Verification + +Each phase should be verified independently: + +- **Phase 1**: Create 2 textures, render different colors, sample both in one frame +- **Phase 2**: BWM composites without CPU blit, no TRANSFER_TO_HOST_3D per frame +- **Phase 3**: Bounce renders at 60+ FPS with ~0% CPU (only physics simulation) +- **Phase 4**: bterm scrolls smoothly with minimal CPU + +## Priority Order + +Phase 1 (fix per-window textures) unblocks everything else. Start there. +Phase 2 (GPU compositing) gives the biggest immediate win — eliminates the CPU blit. +Phase 3 (client GPU rendering) makes bounce truly GPU-rendered. +Phase 4 (text on GPU) is the final polish for terminal/text apps. 
diff --git a/kernel/src/arch_impl/aarch64/exception.rs b/kernel/src/arch_impl/aarch64/exception.rs index 48f8cd3f..78208da1 100644 --- a/kernel/src/arch_impl/aarch64/exception.rs +++ b/kernel/src/arch_impl/aarch64/exception.rs @@ -1051,6 +1051,12 @@ pub extern "C" fn handle_irq() { crate::drivers::virtio::gpu_pci::handle_interrupt(); } } + // VirtIO network PCI interrupt dispatch (GICv2m MSI) + if let Some(net_pci_irq) = crate::drivers::virtio::net_pci::get_irq() { + if irq_id == net_pci_irq { + crate::drivers::virtio::net_pci::handle_interrupt(); + } + } } // Should not happen - GIC filters invalid IDs (1020+) diff --git a/kernel/src/arch_impl/aarch64/timer_interrupt.rs b/kernel/src/arch_impl/aarch64/timer_interrupt.rs index 314817b7..420896ee 100644 --- a/kernel/src/arch_impl/aarch64/timer_interrupt.rs +++ b/kernel/src/arch_impl/aarch64/timer_interrupt.rs @@ -280,9 +280,9 @@ pub extern "C" fn timer_interrupt_handler() { crate::drivers::usb::ehci::poll_keyboard(); // Poll XHCI USB HID events (needed when PCI interrupt routing isn't available) crate::drivers::usb::xhci::poll_hid_events(); - // Poll network RX for incoming packets (PCI INTx routing not wired up) - // Covers both VirtIO net PCI (Parallels) and e1000 (VMware) - // Poll every 10th tick (~100Hz at 1000Hz timer) for responsive networking + // Poll network RX as a safety net alongside MSI-X interrupts. + // MSI-X provides sub-ms latency; this 100Hz fallback ensures packets + // are still processed if MSI-X delivery fails for any reason. 
if (crate::drivers::virtio::net_pci::is_initialized() || crate::drivers::e1000::is_initialized()) && _count % 10 == 0 diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs index c42d0eaf..b0695b25 100644 --- a/kernel/src/drivers/virtio/gpu_pci.rs +++ b/kernel/src/drivers/virtio/gpu_pci.rs @@ -614,6 +614,50 @@ fn init_composite_texture(width: u32, height: u32) -> Result<(), &'static str> { COMPOSITE_TEX_READY.store(true, Ordering::Release); crate::serial_println!("[virgl-composite] Texture resource initialized (id={})", RESOURCE_COMPOSITE_TEX_ID); + + // ── Pre-allocate per-window texture pool ── + // Parallels requires resources to be created BEFORE the first SUBMIT_3D. + // Resources created after SUBMIT_3D has been called don't get their + // TRANSFER_TO_HOST_3D data. Pre-allocate all slots now with display-sized + // backing so they're ready when windows appear. + let pool_w = width; + let pool_h = height; + let pool_size = (pool_w as usize) * (pool_h as usize) * 4; + let mut pool_count = 0usize; + for slot in 0..MAX_WIN_TEX_SLOTS { + let res_id = RESOURCE_WIN_TEX_BASE + slot as u32; + let layout = alloc::alloc::Layout::from_size_align(pool_size, 4096) + .map_err(|_| "win texture pool: layout error")?; + let ptr = unsafe { alloc::alloc::alloc_zeroed(layout) }; + if ptr.is_null() { + crate::serial_println!("[virgl-pool] slot {} alloc failed, pool stopped at {}", slot, slot); + break; + } + + with_device_state(|state| { + virgl_resource_create_3d_cmd( + state, res_id, pipe::TEXTURE_2D, vfmt::B8G8R8X8_UNORM, + pipe::BIND_SAMPLER_VIEW | pipe::BIND_SCANOUT, + pool_w, pool_h, 1, 1, + ) + })?; + with_device_state(|state| { + virgl_attach_backing_paged(state, res_id, ptr, pool_size) + })?; + with_device_state(|state| { + virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, res_id) + })?; + dma_cache_clean(ptr, pool_size); + with_device_state(|state| { + transfer_to_host_3d(state, res_id, 0, 0, pool_w, pool_h, pool_w * 4) + })?; + + 
unsafe { WIN_TEX_BACKING[slot] = (ptr, pool_size); } + pool_count += 1; + } + crate::serial_println!("[virgl-pool] Pre-allocated {}/{} window texture slots ({}x{}, {}KB each)", + pool_count, MAX_WIN_TEX_SLOTS, pool_w, pool_h, pool_size / 1024); + Ok(()) } @@ -2227,7 +2271,7 @@ fn virgl_attach_backing_from_pages( /// Base resource ID for per-window VirGL textures. Window slot N → resource (10 + N). const RESOURCE_WIN_TEX_BASE: u32 = 10; -const MAX_WIN_TEX_SLOTS: usize = 16; +const MAX_WIN_TEX_SLOTS: usize = 8; /// Per-window contiguous backing buffers for VirGL textures. /// Parallels requires contiguous physical backing for TRANSFER_TO_HOST_3D to work. @@ -2246,81 +2290,72 @@ pub fn init_window_texture( width: u32, height: u32, _page_phys_addrs: &[u64], - total_len: usize, + _total_len: usize, ) -> Result { - use super::virgl::{format as vfmt, pipe}; if slot_index >= MAX_WIN_TEX_SLOTS { return Err("init_window_texture: slot_index out of range"); } let resource_id = RESOURCE_WIN_TEX_BASE + slot_index as u32; - crate::serial_println!( - "[virgl-win] init_window_texture: slot={}, res_id={}, {}x{}, {} bytes (contiguous backing)", - slot_index, resource_id, width, height, total_len - ); - // Allocate contiguous, page-aligned heap buffer for VirGL backing - let backing_layout = alloc::alloc::Layout::from_size_align(total_len, 4096) - .map_err(|_| "init_window_texture: invalid backing layout")?; - let backing_ptr = unsafe { alloc::alloc::alloc_zeroed(backing_layout) }; - if backing_ptr.is_null() { - return Err("init_window_texture: failed to allocate contiguous backing"); + // Pool was pre-allocated at init time (before first SUBMIT_3D). + // Just verify the slot exists and return the resource ID. 
+ let (existing_ptr, existing_len) = unsafe { WIN_TEX_BACKING[slot_index] }; + if existing_ptr.is_null() || existing_len == 0 { + return Err("init_window_texture: slot not pre-allocated"); } - unsafe { WIN_TEX_BACKING[slot_index] = (backing_ptr, total_len); } - // Create TEXTURE_2D with SAMPLER_VIEW bind - with_device_state(|state| { - virgl_resource_create_3d_cmd( - state, - resource_id, - pipe::TEXTURE_2D, - vfmt::B8G8R8X8_UNORM, - pipe::BIND_SAMPLER_VIEW, - width, height, 1, 1, - ) - })?; - - // Attach contiguous backing (same method as compositor texture — proven working) - with_device_state(|state| { - virgl_attach_backing_paged(state, resource_id, backing_ptr, total_len) - })?; - - // Attach to VirGL context - with_device_state(|state| { - virgl_ctx_attach_resource_cmd(state, VIRGL_CTX_ID, resource_id) - })?; - - // Prime with TRANSFER_TO_HOST_3D - dma_cache_clean(backing_ptr, total_len); - with_device_state(|state| { - transfer_to_host_3d(state, resource_id, 0, 0, width, height, width * 4) - })?; - - crate::serial_println!("[virgl-win] Window texture initialized (res_id={}, backing={:#x})", - resource_id, backing_ptr as u64); + crate::serial_println!( + "[virgl-win] init_window_texture: slot={} using pre-allocated res={} ({}x{}, backing={:#x})", + slot_index, resource_id, width, height, existing_ptr as u64 + ); Ok(resource_id) } -/// Copy window pixels from MAP_SHARED pages to the contiguous VirGL backing buffer. -/// Must be called before cache clean + TRANSFER_TO_HOST_3D. 
-#[allow(dead_code)] -fn copy_window_pages_to_backing(slot_index: usize, page_phys_addrs: &[u64], total_len: usize) { - let (backing_ptr, backing_len) = unsafe { WIN_TEX_BACKING[slot_index] }; - if backing_ptr.is_null() || backing_len == 0 { return; } - - let phys_mem_offset = crate::memory::physical_memory_offset().as_u64(); - let copy_len = total_len.min(backing_len); - let mut offset = 0usize; - - for &page_phys in page_phys_addrs { - if offset >= copy_len { break; } - let page_ptr = (phys_mem_offset + page_phys) as *const u8; - let chunk = (copy_len - offset).min(4096); - unsafe { - core::ptr::copy_nonoverlapping(page_ptr, backing_ptr.add(offset), chunk); +/// Blit window content from MAP_SHARED pages directly into COMPOSITE_TEX at (x, y). +/// This composites window pixels into the single compositor texture, giving correct +/// z-order when called bottom-to-top. The cursor is drawn AFTER this, so it appears on top. +fn blit_window_to_compositor( + win_x: u32, win_y: u32, + win_w: u32, win_h: u32, + page_phys_addrs: &[u64], + tex_w: u32, tex_h: u32, +) { + let phys_offset = crate::memory::physical_memory_offset().as_u64(); + let row_bytes = (win_w as usize) * 4; + let tex_stride = (tex_w as usize) * 4; + let tex_ptr = unsafe { COMPOSITE_TEX_PTR }; + + for row in 0..win_h as usize { + let dst_y = (win_y as usize) + row; + if dst_y >= tex_h as usize { break; } + let dst_x = win_x as usize; + let copy_w = (win_w as usize).min((tex_w as usize).saturating_sub(dst_x)); + if copy_w == 0 { continue; } + let copy_bytes = copy_w * 4; + + let src_offset = row * row_bytes; + let dst_offset = dst_y * tex_stride + dst_x * 4; + + // Copy from scattered pages, handling page boundaries + let mut copied = 0usize; + while copied < copy_bytes { + let linear_pos = src_offset + copied; + let page_idx = linear_pos / 4096; + let page_off = linear_pos % 4096; + if page_idx >= page_phys_addrs.len() { break; } + let chunk = (4096 - page_off).min(copy_bytes - copied); + let src_ptr = 
(phys_offset + page_phys_addrs[page_idx] + page_off as u64) as *const u8; + unsafe { + core::ptr::copy_nonoverlapping( + src_ptr, + tex_ptr.add(dst_offset + copied), + chunk, + ); + } + copied += chunk; } - offset += chunk; } } @@ -3376,6 +3411,77 @@ pub fn virgl_composite_frame_textured( Ok(()) } +/// Build and submit a single fullscreen textured quad from COMPOSITE_TEX. +/// +/// COMPOSITE_TEX already contains the fully-composited frame: background, window +/// frames/decorations, window content (blitted in z-order), and cursor. +fn virgl_composite_single_quad() -> Result<(), &'static str> { + use super::virgl::{CommandBuffer, format as vfmt, pipe, swizzle}; + + let tex_w = COMPOSITE_TEX_W.load(Ordering::Relaxed); + let tex_h = COMPOSITE_TEX_H.load(Ordering::Relaxed); + let (display_w, display_h) = dimensions().ok_or("GPU not initialized")?; + + let mut cmdbuf = CommandBuffer::new(); + cmdbuf.create_sub_ctx(1); + cmdbuf.set_sub_ctx(1); + cmdbuf.set_tweaks(1, 1); + cmdbuf.set_tweaks(2, display_w); + + cmdbuf.create_surface(10, RESOURCE_3D_ID, vfmt::B8G8R8X8_UNORM, 0, 0); + cmdbuf.set_framebuffer_state(0, &[10]); + cmdbuf.create_blend_simple(11); + cmdbuf.bind_object(11, super::virgl::OBJ_BLEND); + cmdbuf.create_dsa_default(12); + cmdbuf.bind_object(12, super::virgl::OBJ_DSA); + cmdbuf.create_rasterizer_default(13); + cmdbuf.bind_object(13, super::virgl::OBJ_RASTERIZER); + + let tex_vs = b"VERT\nDCL IN[0]\nDCL IN[1]\nDCL OUT[0], POSITION\nDCL OUT[1], GENERIC[0]\n 0: MOV OUT[0], IN[0]\n 1: MOV OUT[1], IN[1]\n 2: END\n"; + cmdbuf.create_shader(14, pipe::SHADER_VERTEX, 300, tex_vs); + cmdbuf.bind_shader(14, pipe::SHADER_VERTEX); + let tex_fs = b"FRAG\nPROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1\nDCL IN[0], GENERIC[0], LINEAR\nDCL OUT[0], COLOR\nDCL SAMP[0]\nDCL SVIEW[0], 2D, FLOAT\n 0: TEX OUT[0], IN[0], SAMP[0], 2D\n 1: END\n"; + cmdbuf.create_shader(15, pipe::SHADER_FRAGMENT, 300, tex_fs); + cmdbuf.bind_shader(15, pipe::SHADER_FRAGMENT); + + 
cmdbuf.create_vertex_elements(16, &[ + (0, 0, 0, vfmt::R32G32B32A32_FLOAT), + (16, 0, 0, vfmt::R32G32B32A32_FLOAT), + ]); + cmdbuf.bind_object(16, super::virgl::OBJ_VERTEX_ELEMENTS); + + cmdbuf.create_sampler_state(18, pipe::TEX_WRAP_CLAMP_TO_EDGE, pipe::TEX_WRAP_CLAMP_TO_EDGE, + pipe::TEX_WRAP_CLAMP_TO_EDGE, pipe::TEX_FILTER_NEAREST, pipe::TEX_MIPFILTER_NONE, + pipe::TEX_FILTER_NEAREST); + cmdbuf.bind_sampler_states(pipe::SHADER_FRAGMENT, 0, &[18]); + cmdbuf.set_min_samples(1); + cmdbuf.set_viewport(display_w as f32, display_h as f32); + + cmdbuf.create_sampler_view(17, RESOURCE_COMPOSITE_TEX_ID, vfmt::B8G8R8X8_UNORM, + pipe::TEXTURE_2D, 0, 0, 0, 0, swizzle::IDENTITY); + cmdbuf.set_sampler_views(pipe::SHADER_FRAGMENT, 0, &[17]); + + let u_max = (tex_w.min(display_w) as f32) / (tex_w as f32); + let v_max = (tex_h.min(display_h) as f32) / (tex_h as f32); + let bg_verts: [u32; 32] = [ + (-1.0f32).to_bits(), (1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + 0f32.to_bits(), 0f32.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + (-1.0f32).to_bits(), (-1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + 0f32.to_bits(), v_max.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + 1.0f32.to_bits(), (-1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + u_max.to_bits(), v_max.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + 1.0f32.to_bits(), (1.0f32).to_bits(), 0f32.to_bits(), 1.0f32.to_bits(), + u_max.to_bits(), 0f32.to_bits(), 0f32.to_bits(), 0f32.to_bits(), + ]; + cmdbuf.resource_inline_write(RESOURCE_VB_ID, 0, 128, &bg_verts); + cmdbuf.set_vertex_buffers(&[(32, 0, RESOURCE_VB_ID)]); + cmdbuf.draw_vbo(0, 4, pipe::PRIM_TRIANGLE_FAN, 3); + + virgl_submit_sync(cmdbuf.as_slice())?; + with_device_state(|state| set_scanout_resource(state, RESOURCE_3D_ID))?; + with_device_state(|state| resource_flush_3d(state, RESOURCE_3D_ID)) +} + /// Multi-window GPU compositor. 
/// /// Uploads dirty textures (background + per-window), then renders all windows @@ -3478,6 +3584,24 @@ pub fn virgl_composite_windows( } } + // Step 2: Blit window content from MAP_SHARED pages into COMPOSITE_TEX. + // Windows are composited in z-order (bottom first in the array, top last) + // so higher-z windows correctly overwrite lower-z windows where they overlap. + // This must happen BEFORE cursor drawing so the cursor appears on top. + if bg_dirty || any_window_dirty { + for win in windows.iter() { + if win.page_phys_addrs.is_empty() || win.width == 0 || win.height == 0 { + continue; + } + blit_window_to_compositor( + win.x as u32, win.y as u32, + win.width, win.height, + &win.page_phys_addrs, + tex_w, tex_h, + ); + } + } + // ── Step 3: Cursor rendering ──────────────────────────────────────────── // Draw the mouse cursor directly into COMPOSITE_TEX so it appears in the // composited output without requiring a full 4.9MB upload from userspace. @@ -3690,11 +3814,10 @@ pub fn virgl_composite_windows( } // ========================================================================= - // Phase B+C: Direct scanout on COMPOSITE_TEX (skip SUBMIT_3D entirely) + // Phase B+C: Single fullscreen SUBMIT_3D quad + display // ========================================================================= - // Instead of building a VirGL 3D pipeline (shaders, textured quad, SUBMIT_3D) - // to copy COMPOSITE_TEX onto RESOURCE_3D_ID, we set scanout directly on - // COMPOSITE_TEX_ID. This eliminates the SUBMIT_3D round-trip. + // Window content was already blitted into COMPOSITE_TEX in z-order (step 2), + // so a single textured quad correctly displays everything including cursor. // Perf: timestamp before display phase #[cfg(target_arch = "aarch64")] @@ -3704,20 +3827,7 @@ pub fn virgl_composite_windows( v }; - // Direct scanout on COMPOSITE_TEX — skip SUBMIT_3D entirely. - // TRANSFER_TO_HOST_3D already pushed pixels to the host texture. 
- // SET_SCANOUT + RESOURCE_FLUSH displays it directly. - static SCANOUT_ESTABLISHED: core::sync::atomic::AtomicBool = - core::sync::atomic::AtomicBool::new(false); - if !SCANOUT_ESTABLISHED.load(Ordering::Relaxed) { - with_device_state(|state| { - set_scanout_resource(state, RESOURCE_COMPOSITE_TEX_ID) - })?; - SCANOUT_ESTABLISHED.store(true, Ordering::Relaxed); - } - with_device_state(|state| { - resource_flush_3d(state, RESOURCE_COMPOSITE_TEX_ID) - })?; + virgl_composite_single_quad()?; // Perf: end of frame #[cfg(target_arch = "aarch64")] @@ -3756,10 +3866,8 @@ pub fn virgl_composite_windows( let avg_display = to_us(PERF_DISPLAY_TICKS.swap(0, Ordering::Relaxed)); let avg_total = to_us(PERF_TOTAL_TICKS.swap(0, Ordering::Relaxed)); - crate::serial_println!( - "[gpu-perf] frame={} avg/frame: compose={}us display={}us TOTAL={}us", - frame, avg_compose, avg_display, avg_total - ); + // GPU perf counters available via GDB: PERF_COMPOSE_TICKS, PERF_DISPLAY_TICKS, PERF_TOTAL_TICKS + let _ = (avg_compose, avg_display, avg_total); } } diff --git a/kernel/src/drivers/virtio/net_pci.rs b/kernel/src/drivers/virtio/net_pci.rs index 856cd524..03ec17fb 100644 --- a/kernel/src/drivers/virtio/net_pci.rs +++ b/kernel/src/drivers/virtio/net_pci.rs @@ -17,7 +17,7 @@ use crate::drivers::pci; use core::ptr::{read_volatile, write_volatile}; -use core::sync::atomic::{fence, AtomicBool, Ordering}; +use core::sync::atomic::{fence, AtomicBool, AtomicU32, Ordering}; // Legacy VirtIO PCI register offsets (from BAR0) const REG_DEVICE_FEATURES: usize = 0x00; @@ -67,6 +67,12 @@ struct VirtqDesc { const DESC_F_WRITE: u16 = 2; +/// When set in avail.flags, tells the device NOT to send interrupts (MSIs) +/// when it adds entries to the used ring. Used for NAPI-style interrupt +/// coalescing: handler sets this to suppress MSI storm, softirq clears it +/// after draining the used ring. +const VRING_AVAIL_F_NO_INTERRUPT: u16 = 1; + /// Legacy VirtIO queue size — must match what the device reports. 
/// Parallels reports 256; the driver can't change it on legacy transport. const VIRTQ_SIZE: usize = 256; @@ -174,6 +180,8 @@ struct NetPciState { static mut NET_PCI_STATE: Option = None; static DEVICE_INITIALIZED: AtomicBool = AtomicBool::new(false); +static NET_PCI_IRQ: AtomicU32 = AtomicU32::new(0); +static NET_PCI_MSI_COUNT: AtomicU32 = AtomicU32::new(0); // Legacy register access helpers #[inline(always)] @@ -211,6 +219,144 @@ fn virt_to_phys(addr: u64) -> u64 { addr - crate::memory::physical_memory_offset().as_u64() } +/// Get the GIC INTID for the VirtIO PCI net interrupt, if MSI is enabled. +pub fn get_irq() -> Option { + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq != 0 { Some(irq) } else { None } +} + +/// VirtIO legacy MSI-X register offsets (present when MSI-X is enabled at PCI level). +/// These replace the device config at BAR0+0x14; device config shifts to 0x18. +const MSIX_CONFIG_VECTOR: usize = 0x14; +const MSIX_QUEUE_VECTOR: usize = 0x16; + +/// Resolve a GICv2m doorbell address. Returns the MSI_SETSPI_NS physical address. +fn resolve_gicv2m_doorbell() -> Option { + const PARALLELS_GICV2M_BASE: u64 = 0x0225_0000; + let gicv2m_base = crate::platform_config::gicv2m_base_phys(); + let base = if gicv2m_base != 0 { + gicv2m_base + } else if crate::platform_config::probe_gicv2m(PARALLELS_GICV2M_BASE) { + PARALLELS_GICV2M_BASE + } else { + return None; + }; + Some(base + 0x40) +} + +/// Set up PCI MSI or MSI-X delivery for the VirtIO network device through GICv2m. 
+fn setup_net_pci_msi(pci_dev: &crate::drivers::pci::Device) { + use crate::arch_impl::aarch64::gic; + + pci_dev.dump_capabilities(); + + // Try plain MSI first (some VirtIO devices have this) + if let Some(cap_offset) = pci_dev.find_msi_capability() { + crate::serial_println!("[virtio-net-pci] Found MSI capability at offset {:#x}", cap_offset); + if let Some(doorbell) = resolve_gicv2m_doorbell() { + let spi = crate::platform_config::allocate_msi_spi(); + if spi != 0 { + pci_dev.configure_msi(cap_offset, doorbell as u32, spi as u16); + pci_dev.disable_intx(); + gic::configure_spi_edge_triggered(spi); + NET_PCI_IRQ.store(spi, Ordering::Relaxed); + gic::enable_spi(spi); + crate::serial_println!("[virtio-net-pci] MSI enabled: SPI {} doorbell={:#x}", spi, doorbell); + return; + } + } + crate::serial_println!("[virtio-net-pci] MSI setup failed — trying MSI-X"); + } + + // Try MSI-X (Parallels VirtIO net PCI 1af4:1000 has MSI-X with 3 vectors) + let msix_cap = match pci_dev.find_msix_capability() { + Some(cap) => cap, + None => { + crate::serial_println!("[virtio-net-pci] No MSI or MSI-X capability — polling fallback"); + return; + } + }; + + let table_size = pci_dev.msix_table_size(msix_cap); + crate::serial_println!("[virtio-net-pci] MSI-X cap at {:#x}: {} vectors", msix_cap, table_size); + + let doorbell = match resolve_gicv2m_doorbell() { + Some(d) => d, + None => { + crate::serial_println!("[virtio-net-pci] GICv2m not available — polling fallback"); + return; + } + }; + + let spi = crate::platform_config::allocate_msi_spi(); + if spi == 0 { + crate::serial_println!("[virtio-net-pci] Failed to allocate MSI SPI — polling fallback"); + return; + } + + // Program all MSI-X table entries with the same SPI (single-vector mode). + for v in 0..table_size { + pci_dev.configure_msix_entry(msix_cap, v, doorbell, spi); + } + + gic::configure_spi_edge_triggered(spi); + // Store IRQ but do NOT enable the SPI yet. 
The SPI is enabled by + // enable_msi_spi() after init_common() completes its synchronous + // ARP/ICMP polling. This avoids the GICv2m level-triggered SPI storm + // during init (the device fires MSIs for ARP replies, and the level + // stays asserted through EOI). + NET_PCI_IRQ.store(spi, Ordering::Release); + + // Enable MSI-X at PCI level and disable legacy INTx + pci_dev.enable_msix(msix_cap); + pci_dev.disable_intx(); + + // Assign VirtIO-level MSI-X vectors. + let bar0_virt = unsafe { + let ptr = &raw const NET_PCI_STATE; + match (*ptr).as_ref() { + Some(s) => s.bar0_virt, + None => { + crate::serial_println!("[virtio-net-pci] MSI-X: device state not available"); + return; + } + } + }; + + // Config change → no interrupt (0xFFFF). Avoids spurious config-change + // MSIs that could cause an interrupt storm unrelated to packet RX. + reg_write_u16(bar0_virt, MSIX_CONFIG_VECTOR, 0xFFFF); + let cfg_rb = reg_read_u16(bar0_virt, MSIX_CONFIG_VECTOR); + + // RX queue (0) → vector 0 + reg_write_u16(bar0_virt, REG_QUEUE_SELECT, 0); + reg_write_u16(bar0_virt, MSIX_QUEUE_VECTOR, 0); + let rx_rb = reg_read_u16(bar0_virt, MSIX_QUEUE_VECTOR); + + // TX queue (1) → no interrupt + reg_write_u16(bar0_virt, REG_QUEUE_SELECT, 1); + reg_write_u16(bar0_virt, MSIX_QUEUE_VECTOR, 0xFFFF); + + crate::serial_println!( + "[virtio-net-pci] MSI-X vector assignments: cfg={:#x} rx={:#x}", + cfg_rb, rx_rb + ); + + // Only RX vector must succeed; config vector is intentionally 0xFFFF + if rx_rb == 0xFFFF { + crate::serial_println!("[virtio-net-pci] MSI-X: device rejected RX vector — polling fallback"); + pci_dev.disable_msix(msix_cap); + pci_dev.enable_intx(); + NET_PCI_IRQ.store(0, Ordering::Relaxed); + return; + } + + crate::serial_println!( + "[virtio-net-pci] MSI-X enabled: SPI {} doorbell={:#x} vectors={}", + spi, doorbell, table_size + ); +} + /// Initialize the VirtIO network device via PCI legacy transport. 
pub fn init() -> Result<(), &'static str> { crate::serial_println!("[virtio-net-pci] Searching for VirtIO network device on PCI bus..."); @@ -311,6 +457,7 @@ pub fn init() -> Result<(), &'static str> { post_rx_buffers()?; DEVICE_INITIALIZED.store(true, Ordering::Release); + setup_net_pci_msi(pci_dev); crate::serial_println!("[virtio-net-pci] Network device initialized successfully"); Ok(()) } @@ -548,12 +695,115 @@ pub fn mac_address() -> Option<[u8; 6]> { } } -/// Interrupt handler for VirtIO network PCI device. +/// Get the MSI interrupt count (for diagnostics). +pub fn msi_interrupt_count() -> u32 { + NET_PCI_MSI_COUNT.load(Ordering::Relaxed) +} + +/// Interrupt handler for VirtIO network PCI device (MSI-X). +/// +/// Uses NAPI-style two-level suppression to prevent GICv2m SPI storms: +/// 1. Device-level: sets VRING_AVAIL_F_NO_INTERRUPT so the device stops +/// writing MSIs to GICv2m entirely. +/// 2. GIC-level: disables the SPI as a safety net. +/// +/// Does NOT process packets or raise softirq (locks in the packet +/// processing path could deadlock with the interrupted thread). +/// Timer-based NetRx softirq handles packet processing and calls +/// re_enable_irq() to re-arm both levels. pub fn handle_interrupt() { - if !DEVICE_INITIALIZED.load(Ordering::Acquire) { + use crate::arch_impl::aarch64::gic; + + NET_PCI_MSI_COUNT.fetch_add(1, Ordering::Relaxed); + + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq == 0 { return; } + // Suppress at the device level FIRST — prevents new MSI writes to GICv2m. + unsafe { + let q = &raw mut PCI_RX_QUEUE; + write_volatile(&mut (*q).avail.flags, VRING_AVAIL_F_NO_INTERRUPT); + fence(Ordering::SeqCst); + } + + // Mask SPI at the GIC — belt-and-suspenders with device-level suppression. + gic::disable_spi(irq); + gic::clear_spi_pending(irq); + + // Read ISR to clear the VirtIO device's internal interrupt condition. 
+ let state = &raw const NET_PCI_STATE; + unsafe { + if let Some(ref s) = *state { + let _isr = reg_read_u8(s.bar0_virt, REG_ISR_STATUS); + } + } + + // Both levels stay suppressed — re_enable_irq() called from timer softirq. +} + +/// Re-enable the network device's MSI-X interrupt after softirq processing. +/// +/// Called by the NetRx softirq handler after draining the used ring. +/// Follows the Linux virtqueue_enable_cb() pattern: +/// 1. Read ISR to clear any pending device interrupt condition +/// 2. Re-enable device-level interrupts (clear NO_INTERRUPT flag) +/// 3. Memory barrier + check for new used ring entries +/// 4. If more work: re-suppress and let next softirq handle it +/// 5. If clean: clear GIC pending + enable SPI +pub fn re_enable_irq() { + use crate::arch_impl::aarch64::gic; + + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq == 0 { + return; + } + + // Read ISR to clear any pending device interrupt condition before re-enabling. + let state_ptr = &raw const NET_PCI_STATE; + unsafe { + if let Some(ref s) = *state_ptr { + let _isr = reg_read_u8(s.bar0_virt, REG_ISR_STATUS); + } + } + + // Re-enable device-level interrupts (Linux: virtqueue_enable_cb) + unsafe { + let q = &raw mut PCI_RX_QUEUE; + write_volatile(&mut (*q).avail.flags, 0); + fence(Ordering::SeqCst); + } + + // Check if more work arrived while we were processing (race window). + // If so, re-suppress and let the next timer softirq cycle handle it. + let has_more = unsafe { + let q = &raw const PCI_RX_QUEUE; + let used_idx = read_volatile(&(*q).used.idx); + if let Some(ref s) = *state_ptr { + used_idx != s.rx_last_used_idx + } else { + false + } + }; + + if has_more { + // More work arrived — re-suppress device interrupts, don't enable SPI. + unsafe { + let q = &raw mut PCI_RX_QUEUE; + write_volatile(&mut (*q).avail.flags, VRING_AVAIL_F_NO_INTERRUPT); + fence(Ordering::SeqCst); + } + return; + } + + // Used ring is drained — safe to re-enable the GIC SPI. 
+ gic::clear_spi_pending(irq); + gic::enable_spi(irq); +} + +/// Diagnostic: dump RX queue state for debugging MSI-X issues. +pub fn dump_rx_state() { let state = unsafe { let ptr = &raw const NET_PCI_STATE; match (*ptr).as_ref() { @@ -562,10 +812,43 @@ pub fn handle_interrupt() { } }; - // Reading ISR status auto-acknowledges on legacy PCI - let _isr = reg_read_u8(state.bar0_virt, REG_ISR_STATUS); + let isr = reg_read_u8(state.bar0_virt, REG_ISR_STATUS); + let (used_idx, avail_idx) = unsafe { + let q = &raw const PCI_RX_QUEUE; + (read_volatile(&(*q).used.idx), read_volatile(&(*q).avail.idx)) + }; + let msi_count = NET_PCI_MSI_COUNT.load(Ordering::Relaxed); + crate::serial_println!( + "[virtio-net-pci] RX diag: used_idx={} last_used={} avail_idx={} isr={:#x} msi_count={}", + used_idx, state.rx_last_used_idx, avail_idx, isr, msi_count + ); +} + +/// Enable the MSI-X SPI at the GIC after init polling is complete. +/// +/// During init, the ARP/ICMP polling loop processes RX via timer-based softirq. +/// The SPI must NOT be enabled during init because the GICv2m level-triggered +/// storm would prevent the main thread from making progress. After init drains +/// all used ring entries, it's safe to enable the SPI for interrupt-driven RX. 
+pub fn enable_msi_spi() { + use crate::arch_impl::aarch64::gic; + + let irq = NET_PCI_IRQ.load(Ordering::Relaxed); + if irq == 0 { + return; + } + + // Read ISR to clear any pending device interrupt from init polling + let state_ptr = &raw const NET_PCI_STATE; + unsafe { + if let Some(ref s) = *state_ptr { + let _isr = reg_read_u8(s.bar0_virt, REG_ISR_STATUS); + } + } - crate::task::softirqd::raise_softirq(crate::task::softirqd::SoftirqType::NetRx); + gic::clear_spi_pending(irq); + gic::enable_spi(irq); + crate::serial_println!("[virtio-net-pci] MSI-X SPI {} enabled (post-init)", irq); } /// Whether the PCI net device is initialized diff --git a/kernel/src/fs/procfs/mod.rs b/kernel/src/fs/procfs/mod.rs index ded541f7..ce13b779 100644 --- a/kernel/src/fs/procfs/mod.rs +++ b/kernel/src/fs/procfs/mod.rs @@ -774,6 +774,11 @@ fn generate_stat() -> String { GPU_FULL_UPLOADS.aggregate(), GPU_PARTIAL_UPLOADS.aggregate(), ); + #[cfg(target_arch = "aarch64")] + { + let _ = write!(out, "net_msi_irqs {}\n", + crate::drivers::virtio::net_pci::msi_interrupt_count()); + } out } diff --git a/kernel/src/net/arp.rs b/kernel/src/net/arp.rs index 548dacc1..a9c60180 100644 --- a/kernel/src/net/arp.rs +++ b/kernel/src/net/arp.rs @@ -218,14 +218,19 @@ pub fn handle_arp(eth_frame: &EthernetFrame, arp: &ArpPacket) { } } -/// Update the ARP cache with a new entry +/// Update the ARP cache with a new entry. +/// IRQ-safe: disables interrupts to prevent deadlock with softirq handler +/// which also calls update_cache via process_rx → handle_arp. 
fn update_cache(ip: &[u8; 4], mac: &[u8; 6]) { + let saved = super::irq_save(); let mut cache = ARP_CACHE.lock(); // First, check if entry already exists for entry in cache.iter_mut() { if entry.valid && entry.ip == *ip { entry.mac = *mac; + drop(cache); + super::irq_restore(saved); return; } } @@ -236,6 +241,8 @@ fn update_cache(ip: &[u8; 4], mac: &[u8; 6]) { entry.ip = *ip; entry.mac = *mac; entry.valid = true; + drop(cache); + super::irq_restore(saved); return; } } @@ -244,18 +251,27 @@ fn update_cache(ip: &[u8; 4], mac: &[u8; 6]) { cache[0].ip = *ip; cache[0].mac = *mac; cache[0].valid = true; + drop(cache); + super::irq_restore(saved); } -/// Look up a MAC address in the ARP cache +/// Look up a MAC address in the ARP cache. +/// IRQ-safe: disables interrupts to prevent deadlock with softirq handler. pub fn lookup(ip: &[u8; 4]) -> Option<[u8; 6]> { + let saved = super::irq_save(); let cache = ARP_CACHE.lock(); for entry in cache.iter() { if entry.valid && entry.ip == *ip { - return Some(entry.mac); + let mac = entry.mac; + drop(cache); + super::irq_restore(saved); + return Some(mac); } } + drop(cache); + super::irq_restore(saved); None } diff --git a/kernel/src/net/mod.rs b/kernel/src/net/mod.rs index eb3cc68f..fe0f4cfb 100644 --- a/kernel/src/net/mod.rs +++ b/kernel/src/net/mod.rs @@ -31,6 +31,41 @@ use crate::drivers::virtio::net_pci; use crate::task::softirqd::{register_softirq_handler, SoftirqType}; +/// Disable IRQs and return saved DAIF state. Prevents timer interrupt → +/// softirq → process_rx from deadlocking on shared locks (ARP_CACHE, +/// NET_CONFIG) that the interrupted thread may hold. 
+#[cfg(target_arch = "aarch64")] +#[inline(always)] +pub(crate) fn irq_save() -> u64 { + let daif: u64; + unsafe { + core::arch::asm!("mrs {}, daif", out(reg) daif, options(nomem, nostack)); + core::arch::asm!("msr daifset, #2", options(nomem, nostack)); + } + daif +} + +#[cfg(target_arch = "aarch64")] +#[inline(always)] +pub(crate) fn irq_restore(saved: u64) { + unsafe { + core::arch::asm!("msr daif, {}", in(reg) saved, options(nomem, nostack)); + } +} + +#[cfg(target_arch = "x86_64")] +#[inline(always)] +pub(crate) fn irq_save() -> u64 { 0 } + +#[cfg(target_arch = "x86_64")] +#[inline(always)] +pub(crate) fn irq_restore(_: u64) {} + +/// Re-entrancy guard for process_rx() on aarch64. Prevents the softirq handler +/// from re-entering process_rx() while the ARP polling loop is already inside it. +#[cfg(target_arch = "aarch64")] +static RX_PROCESSING: core::sync::atomic::AtomicBool = core::sync::atomic::AtomicBool::new(false); + // Logging macros that work on both architectures #[cfg(target_arch = "x86_64")] macro_rules! net_log { @@ -189,10 +224,19 @@ pub fn drain_loopback_queue() { } } -/// Softirq handler for network RX processing -/// Called from softirq context when NetRx softirq is raised by network interrupt handler +/// Softirq handler for network RX processing. +/// Called from softirq context when NetRx softirq is raised by the timer (every 10ms). +/// +/// The MSI handler does NOT raise softirq (to avoid lock contention in +/// exception context). Instead, the timer raises NetRx every 10ms. This handler +/// processes packets and then re-enables the MSI-X SPI so new interrupts can fire. fn net_rx_softirq_handler(_softirq: SoftirqType) { process_rx(); + + #[cfg(target_arch = "aarch64")] + if net_pci::is_initialized() { + net_pci::re_enable_irq(); + } } /// Re-register the network softirq handler. 
@@ -232,12 +276,18 @@ pub fn init() { // Auto-detect platform: PCI net = Parallels, e1000 = VMware, MMIO net = QEMU if net_pci::is_initialized() { crate::serial_println!("[net] Using VirtIO net PCI driver (Parallels)"); + let saved = irq_save(); let mut config = NET_CONFIG.lock(); *config = PARALLELS_CONFIG; + drop(config); + irq_restore(saved); } else if e1000::is_initialized() { crate::serial_println!("[net] Using Intel e1000 driver (VMware)"); + let saved = irq_save(); let mut config = NET_CONFIG.lock(); *config = VMWARE_CONFIG; + drop(config); + irq_restore(saved); } if let Some(mac) = get_mac_address() { @@ -262,13 +312,15 @@ fn init_common() { return; } + let saved = irq_save(); let config = NET_CONFIG.lock(); - net_log!("NET: IP address: {}.{}.{}.{}", - config.ip_addr[0], config.ip_addr[1], config.ip_addr[2], config.ip_addr[3] - ); - net_log!("NET: Gateway: {}.{}.{}.{}", - config.gateway[0], config.gateway[1], config.gateway[2], config.gateway[3] - ); + let ip = config.ip_addr; + let gw = config.gateway; + drop(config); + irq_restore(saved); + + net_log!("NET: IP address: {}.{}.{}.{}", ip[0], ip[1], ip[2], ip[3]); + net_log!("NET: Gateway: {}.{}.{}.{}", gw[0], gw[1], gw[2], gw[3]); // Initialize ARP cache arp::init(); @@ -276,8 +328,7 @@ fn init_common() { net_log!("Network stack initialized"); // Send ARP request for gateway to test network connectivity - let gateway = config.gateway; - drop(config); // Release lock before calling arp::request + let gateway = gw; net_log!("NET: Sending ARP request for gateway {}.{}.{}.{}", gateway[0], gateway[1], gateway[2], gateway[3]); if let Err(e) = arp::request(&gateway) { @@ -288,12 +339,17 @@ fn init_common() { // Wait for ARP reply (poll RX a few times to get the gateway MAC) // The reply comes via interrupt, so we just need to give it time to arrive - for _ in 0..100 { + for _i in 0..100 { process_rx(); - // Delay to let packets arrive and interrupts fire + // Delay to let packets arrive and timer-based polling 
process them for _ in 0..1_000_000 { core::hint::spin_loop(); } + // Diagnostic: dump RX queue state on first few iterations + #[cfg(target_arch = "aarch64")] + if _i < 5 || _i % 20 == 0 { + net_pci::dump_rx_state(); + } // Check if we got the ARP reply yet if let Some(gateway_mac) = arp::lookup(&gateway) { net_log!("NET: ARP resolved gateway MAC: {:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", @@ -333,16 +389,24 @@ fn init_common() { // interrupt-driven RX doesn't interfere with the polling. #[cfg(target_arch = "aarch64")] { - if !net_pci::is_initialized() { + if net_pci::is_initialized() { + // Enable MSI-X SPI at GIC now that the used ring is drained. + // During init, timer-based polling handled RX. Now switch to + // interrupt-driven NAPI-style processing. + net_pci::enable_msi_spi(); + } else { net_mmio::enable_net_irq(); } - // PCI net uses polling mode (no GIC IRQ needed — softirq handles packet processing) } } -/// Get the current network configuration +/// Get the current network configuration. +/// IRQ-safe: disables interrupts to prevent deadlock with softirq handler. pub fn config() -> NetConfig { - *NET_CONFIG.lock() + let saved = irq_save(); + let c = *NET_CONFIG.lock(); + irq_restore(saved); + c } /// Process incoming packets (called from interrupt handler or polling loop) @@ -361,8 +425,19 @@ pub fn process_rx() { } /// Process incoming packets (ARM64 - polling or interrupt driven) +/// +/// Protected by RX_PROCESSING atomic to prevent re-entrancy. When MSI-X is +/// active, the softirq handler can preempt the ARP polling loop and try to +/// call process_rx() re-entrantly — the guard skips the nested call. #[cfg(target_arch = "aarch64")] pub fn process_rx() { + // Re-entrancy guard: if we're already inside process_rx (e.g., ARP polling + // loop interrupted by MSI-X → softirq → process_rx), skip this call. 
+ use core::sync::atomic::Ordering; + if RX_PROCESSING.compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed).is_err() { + return; + } + // Try PCI driver first (Parallels), then e1000 (VMware), then MMIO (QEMU) if net_pci::is_initialized() { let mut processed = false; @@ -393,6 +468,12 @@ pub fn process_rx() { net_mmio::recycle_rx_buffers(); } } + + // Do NOT re-enable SPI here — the softirq handler does it after process_rx + // returns, regardless of whether we processed packets or bailed on re-entrancy. + // This avoids re-enabling from multiple code paths. + + RX_PROCESSING.store(false, Ordering::Release); } /// Process a received Ethernet frame diff --git a/kernel/src/process/manager.rs b/kernel/src/process/manager.rs index fb746e3a..59bd3714 100644 --- a/kernel/src/process/manager.rs +++ b/kernel/src/process/manager.rs @@ -948,10 +948,40 @@ impl ProcessManager { self.current_pid = None; } - // TODO: Clean up process resources - // - Unmap memory pages - // - Close file descriptors - // - Reparent children to init + // Free heavy resources immediately rather than waiting for waitpid reap. + // CoW refcounts were already decremented by terminate() -> cleanup_cow_frames(), + // so it's safe to drop the page table now. 
+ process.page_table.take(); + process.stack.take(); + process.pending_old_page_tables.clear(); + + // Clean up window buffers so the compositor stops reading freed pages + #[cfg(target_arch = "aarch64")] + crate::syscall::graphics::cleanup_windows_for_pid(pid.as_u64()); + } + + // Reparent children to init (PID 1) + let init_pid = ProcessId::new(1); + if pid != init_pid { + let children: Vec = self + .processes + .get(&pid) + .map(|p| p.children.clone()) + .unwrap_or_default(); + + if !children.is_empty() { + for &child_pid in &children { + if let Some(child) = self.processes.get_mut(&child_pid) { + child.parent = Some(init_pid); + } + } + if let Some(init) = self.processes.get_mut(&init_pid) { + init.children.extend(children.iter()); + } + if let Some(exiting) = self.processes.get_mut(&pid) { + exiting.children.clear(); + } + } } // Send SIGCHLD to the parent process (if any) diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs index d100f269..af99cf6b 100644 --- a/kernel/src/syscall/graphics.rs +++ b/kernel/src/syscall/graphics.rs @@ -70,6 +70,19 @@ pub fn wake_compositor_if_waiting() { } } +/// Clean up all window buffers owned by a terminated process. +/// Removes entries from the registry and wakes the compositor so it +/// discovers the removal and repaints. +#[cfg(target_arch = "aarch64")] +pub fn cleanup_windows_for_pid(pid: u64) { + let mut reg = WINDOW_REGISTRY.lock(); + if reg.remove_for_pid(pid) { + REGISTRY_GENERATION.fetch_add(1, core::sync::atomic::Ordering::Release); + drop(reg); + wake_compositor_if_waiting(); + } +} + /// Restore TTBR0 to the current process's page tables after blocking. /// /// After a blocking syscall (mark_window_dirty), TTBR0 may point to a different @@ -275,6 +288,21 @@ impl WindowRegistry { }) } + /// Remove all window buffers owned by a given process. + /// Returns true if any buffers were removed. 
+ fn remove_for_pid(&mut self, pid: u64) -> bool { + let mut removed = false; + for slot in &mut self.buffers { + if let Some(ref buf) = slot { + if buf.owner_pid == pid { + *slot = None; + removed = true; + } + } + } + removed + } + fn registered_windows(&self) -> alloc::vec::Vec { let mut result = alloc::vec::Vec::new(); for slot in &self.buffers { @@ -1291,6 +1319,7 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult { }; // Collect window info and waiting thread IDs under lock, then release. + // Also lazy-initialize VirGL textures for windows that don't have them yet. let mut threads_to_wake: [Option; MAX_WINDOW_BUFFERS] = [None; MAX_WINDOW_BUFFERS]; let windows: alloc::vec::Vec = { let mut reg = WINDOW_REGISTRY.lock(); @@ -1301,6 +1330,28 @@ fn handle_composite_windows(desc_ptr: u64) -> SyscallResult { if !buf.registered { continue; } if buf.width == 0 || buf.height == 0 { continue; } + // Lazy VirGL texture init: create per-window GPU texture on first composite + if !buf.virgl_initialized && !buf.page_phys_addrs.is_empty() + && matches!(crate::graphics::compositor_backend(), + crate::graphics::CompositorBackend::VirGL) + { + let slot_idx = (buf.id as usize).saturating_sub(1) % 16; + match crate::drivers::virtio::gpu_pci::init_window_texture( + slot_idx, buf.width, buf.height, &buf.page_phys_addrs, buf.size + ) { + Ok(res_id) => { + buf.virgl_resource_id = res_id; + buf.virgl_initialized = true; + crate::serial_println!("[composite] Window {} got VirGL texture (res={})", + buf.id, res_id); + } + Err(e) => { + crate::serial_println!("[composite] Window {} texture init failed: {}", + buf.id, e); + } + } + } + let dirty = buf.generation > buf.last_uploaded_gen; result.push(WindowCompositeInfo { diff --git a/kernel/src/task/process_task.rs b/kernel/src/task/process_task.rs index b22dbb8e..ecc2378d 100644 --- a/kernel/src/task/process_task.rs +++ b/kernel/src/task/process_task.rs @@ -77,6 +77,7 @@ impl ProcessScheduler { if let Some((pid, process)) = 
manager.find_process_by_thread_mut(thread_id) { let parent_pid = process.parent; let process_name = process.name.clone(); + let children = core::mem::take(&mut process.children); // Mark terminated and extract FDs without closing them process.terminate_minimal(exit_code); @@ -85,6 +86,11 @@ impl ProcessScheduler { process.cleanup_cow_frames(); process.drain_old_page_tables(); + // Free heavy resources immediately (CoW refcounts already decremented) + process.page_table.take(); + process.stack.take(); + process.pending_old_page_tables.clear(); + #[cfg(feature = "btrt")] crate::test_framework::btrt::on_process_exit(pid.as_u64(), exit_code); @@ -101,6 +107,20 @@ impl ProcessScheduler { None }; + // Reparent children to init (PID 1) + if !children.is_empty() { + use crate::process::ProcessId; + let init_pid = ProcessId::new(1); + for &child_pid in &children { + if let Some(child) = manager.get_process_mut(child_pid) { + child.parent = Some(init_pid); + } + } + if let Some(init) = manager.get_process_mut(init_pid) { + init.children.extend(children.iter()); + } + } + Some((pid, process_name, fd_entries, parent_tid)) } else { None @@ -115,6 +135,10 @@ impl ProcessScheduler { // Close FDs outside PM lock (pipe close_write wakes readers, etc.) close_extracted_fds(fd_entries); + // Clean up window buffers so the compositor stops reading freed pages + #[cfg(target_arch = "aarch64")] + crate::syscall::graphics::cleanup_windows_for_pid(pid.as_u64()); + // Wake parent thread if blocked on waitpid or pause() if let Some(parent_tid) = parent_tid { scheduler::with_scheduler(|sched| { diff --git a/scripts/parallels/virgl_multi_texture_test.c b/scripts/parallels/virgl_multi_texture_test.c new file mode 100644 index 00000000..7baae572 --- /dev/null +++ b/scripts/parallels/virgl_multi_texture_test.c @@ -0,0 +1,1387 @@ +/* + * virgl_multi_texture_test.c — Multi-texture VirGL compositing test + * + * Proves that multiple VirGL TEXTURE_2D resources can be: + * 1. 
Created independently + * 2. Rendered to via separate SUBMIT_3D batches (CLEAR to different colors) + * 3. Sampled from in a compositing pass that draws textured quads + * + * The final display shows: + * - Dark gray background + * - RED rectangle on the left (texture A, pixels 100-500 x 100-400) + * - BLUE rectangle on the right (texture B, pixels 600-1000 x 100-400) + * + * Pixel readback verifies the composited result. + * + * Build: gcc -O2 -o virgl_multi_texture_test virgl_multi_texture_test.c -ldrm + * Run: sudo ./virgl_multi_texture_test + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ========================================================================= + * VirtGPU DRM ioctl definitions (from linux/virtgpu_drm.h) + * ========================================================================= */ + +struct drm_virtgpu_resource_create { + uint32_t target; + uint32_t format; + uint32_t bind; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t array_size; + uint32_t last_level; + uint32_t nr_samples; + uint32_t flags; + uint32_t bo_handle; /* output */ + uint32_t res_handle; /* output */ + uint32_t size; /* output */ + uint32_t stride; /* output */ +}; + +struct drm_virtgpu_execbuffer { + uint32_t flags; + uint32_t size; + uint64_t command; + uint64_t bo_handles; + uint32_t num_bo_handles; + int32_t fence_fd; +}; + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 + +struct drm_virtgpu_map { + uint32_t handle; + uint32_t pad; + uint64_t offset; /* output: mmap offset */ +}; + +struct drm_virtgpu_3d_transfer_to_host { + uint32_t bo_handle; + uint32_t pad; + uint64_t offset; + uint32_t level; + uint32_t stride; + uint32_t layer_stride; + struct { + uint32_t x, y, z, w, h, d; + } 
box; +}; + +/* TRANSFER_FROM_HOST uses the same struct layout */ +typedef struct drm_virtgpu_3d_transfer_to_host drm_virtgpu_3d_transfer_from_host; + +struct drm_virtgpu_3d_wait { + uint32_t handle; + uint32_t flags; +}; + +#define DRM_IOCTL_VIRTGPU_MAP \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, \ + struct drm_virtgpu_map) + +#define DRM_IOCTL_VIRTGPU_EXECBUFFER \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER, \ + struct drm_virtgpu_execbuffer) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \ + struct drm_virtgpu_resource_create) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \ + drm_virtgpu_3d_transfer_from_host) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \ + struct drm_virtgpu_3d_transfer_to_host) + +#define DRM_IOCTL_VIRTGPU_WAIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \ + struct drm_virtgpu_3d_wait) + +/* ========================================================================= + * VirGL constants — must match kernel/src/drivers/virtio/virgl.rs exactly + * ========================================================================= */ + +/* Command types */ +#define VIRGL_CCMD_NOP 0 +#define VIRGL_CCMD_CREATE_OBJECT 1 +#define VIRGL_CCMD_BIND_OBJECT 2 +#define VIRGL_CCMD_SET_VIEWPORT_STATE 4 +#define VIRGL_CCMD_SET_FRAMEBUFFER_STATE 5 +#define VIRGL_CCMD_SET_VERTEX_BUFFERS 6 +#define VIRGL_CCMD_CLEAR 7 +#define VIRGL_CCMD_DRAW_VBO 8 +#define VIRGL_CCMD_RESOURCE_INLINE_WRITE 9 +#define VIRGL_CCMD_SET_SAMPLER_VIEWS 10 +#define VIRGL_CCMD_SET_SCISSOR_STATE 15 +#define VIRGL_CCMD_SET_SUB_CTX 28 +#define VIRGL_CCMD_CREATE_SUB_CTX 29 +#define VIRGL_CCMD_BIND_SHADER 31 +#define VIRGL_CCMD_SET_TWEAKS 46 + +/* Object types */ +#define VIRGL_OBJ_BLEND 1 +#define VIRGL_OBJ_RASTERIZER 2 +#define VIRGL_OBJ_DSA 3 +#define VIRGL_OBJ_SHADER 4 +#define 
VIRGL_OBJ_VERTEX_ELEMENTS 5 +#define VIRGL_OBJ_SAMPLER_VIEW 6 +#define VIRGL_OBJ_SAMPLER_STATE 7 +#define VIRGL_OBJ_SURFACE 8 + +/* Pipe constants */ +#define PIPE_BUFFER 0 +#define PIPE_TEXTURE_2D 2 +#define PIPE_PRIM_TRIANGLE_STRIP 5 + +#define PIPE_FORMAT_B8G8R8X8_UNORM 2 +#define PIPE_FORMAT_R32G32B32A32_FLOAT 31 + +#define PIPE_BIND_RENDER_TARGET 0x002 +#define PIPE_BIND_SAMPLER_VIEW 0x008 +#define PIPE_BIND_VERTEX_BUFFER 0x010 +#define PIPE_BIND_SCANOUT 0x40000 +#define PIPE_BIND_SHARED 0x100000 + +#define PIPE_CLEAR_COLOR0 0x04 + +#define PIPE_SHADER_VERTEX 0 +#define PIPE_SHADER_FRAGMENT 1 + +#define PIPE_TEX_FILTER_LINEAR 1 + +/* ========================================================================= + * VirGL command buffer builder + * ========================================================================= */ + +#define CMD_BUF_MAX 8192 + +static uint32_t cmd_buf[CMD_BUF_MAX]; +static int cmd_len; + +static void cmd_reset(void) { cmd_len = 0; } + +static void cmd_push(uint32_t v) +{ + if (cmd_len < CMD_BUF_MAX) + cmd_buf[cmd_len++] = v; + else { + fprintf(stderr, "FATAL: cmd_buf overflow at DWORD %d\n", cmd_len); + exit(1); + } +} + +/* Build VirGL command header: + * bits [7:0] = command opcode + * bits [15:8] = object type (for create/bind commands) + * bits [31:16] = payload length in DWORDs (not including this header) + */ +static uint32_t cmd0(uint32_t cmd, uint32_t obj, uint32_t len) +{ + return cmd | (obj << 8) | (len << 16); +} + +static uint32_t f32_bits(float f) +{ + uint32_t u; + memcpy(&u, &f, 4); + return u; +} + +/* Pack TGSI text into DWORDs (little-endian, null-terminated, zero-padded). + * Returns number of DWORDs pushed. 
*/ +static int push_tgsi_text(const char *text) +{ + int text_len = strlen(text) + 1; /* include null terminator */ + int text_dwords = (text_len + 3) / 4; + for (int i = 0; i < text_dwords; i++) { + uint32_t dw = 0; + for (int b = 0; b < 4; b++) { + int idx = i * 4 + b; + if (idx < text_len) + dw |= ((uint32_t)(unsigned char)text[idx]) << (b * 8); + } + cmd_push(dw); + } + return text_dwords; +} + +/* ------------------------------------------------------------------------- + * VirGL command builders + * ------------------------------------------------------------------------- */ + +static void cmd_create_sub_ctx(uint32_t id) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_SUB_CTX, 0, 1)); + cmd_push(id); +} + +static void cmd_set_sub_ctx(uint32_t id) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_SUB_CTX, 0, 1)); + cmd_push(id); +} + +static void cmd_set_tweaks(uint32_t id, uint32_t value) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_TWEAKS, 0, 2)); + cmd_push(id); + cmd_push(value); +} + +/* Create shader with num_tokens=300 (Mesa default). + * CRITICAL: num_tokens=0 silently corrupts the VirGL context. 
*/ +static void cmd_create_shader(uint32_t handle, uint32_t shader_type, const char *tgsi) +{ + int text_len = strlen(tgsi) + 1; + int text_dwords = (text_len + 3) / 4; + int payload_len = 5 + text_dwords; + + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SHADER, payload_len)); + cmd_push(handle); + cmd_push(shader_type); + cmd_push(text_len); /* bit 31 clear = first/only chunk */ + cmd_push(300); /* NUM_TOKENS = 300 (Mesa default, MUST be nonzero) */ + cmd_push(0); /* num_so_outputs */ + push_tgsi_text(tgsi); +} + +static void cmd_create_blend_simple(uint32_t handle) +{ + /* S0=0x04 (dither), S2[0]=0x78000000 (colormask=0xF<<27) — matches Mesa */ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_BLEND, 11)); + cmd_push(handle); + cmd_push(0x00000004); /* S0: dither enabled */ + cmd_push(0); /* S1: logicop_func */ + cmd_push(0x78000000); /* S2[0]: colormask=0xF<<27, blend disabled */ + cmd_push(0); cmd_push(0); cmd_push(0); /* S2[1..3] */ + cmd_push(0); cmd_push(0); cmd_push(0); /* S2[4..6] */ + cmd_push(0); /* S2[7] */ +} + +static void cmd_create_dsa_disabled(uint32_t handle) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_DSA, 5)); + cmd_push(handle); + cmd_push(0); /* S0: depth/alpha test disabled */ + cmd_push(0); /* S1: front stencil disabled */ + cmd_push(0); /* S2: back stencil disabled */ + cmd_push(0); /* alpha_ref = 0.0f */ +} + +static void cmd_create_rasterizer_default(uint32_t handle) +{ + /* 0x60008082: depth_clip_near | point_quad | front_ccw | half_pixel | bottom_edge */ + uint32_t s0 = (1 << 1) | (1 << 7) | (1 << 15) | (1 << 29) | (1 << 30); + + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_RASTERIZER, 9)); + cmd_push(handle); + cmd_push(s0); /* 0x60008082 */ + cmd_push(f32_bits(1.0f)); /* point_size */ + cmd_push(0); /* sprite_coord_enable */ + cmd_push(0x0000FFFF); /* clip_plane_enable = all */ + cmd_push(f32_bits(1.0f)); /* line_width */ + cmd_push(0); /* offset_units */ + cmd_push(0); /* offset_scale */ + cmd_push(0); /* 
offset_clamp */ +} + +static void cmd_create_vertex_elements(uint32_t handle, int count, + uint32_t offsets[], uint32_t divisors[], + uint32_t vb_indices[], uint32_t formats[]) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_VERTEX_ELEMENTS, + 4 * count + 1)); + cmd_push(handle); + for (int i = 0; i < count; i++) { + cmd_push(offsets[i]); + cmd_push(divisors[i]); + cmd_push(vb_indices[i]); + cmd_push(formats[i]); + } +} + +static void cmd_bind_object(uint32_t handle, uint32_t obj_type) +{ + cmd_push(cmd0(VIRGL_CCMD_BIND_OBJECT, obj_type, 1)); + cmd_push(handle); +} + +static void cmd_bind_shader(uint32_t handle, uint32_t shader_type) +{ + cmd_push(cmd0(VIRGL_CCMD_BIND_SHADER, 0, 2)); + cmd_push(handle); + cmd_push(shader_type); +} + +static void cmd_set_viewport(float width, float height) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_VIEWPORT_STATE, 0, 7)); + cmd_push(0); /* start_slot */ + cmd_push(f32_bits(width / 2.0f)); /* scale_x */ + cmd_push(f32_bits(-height / 2.0f)); /* scale_y (neg for GL Y-up) */ + cmd_push(f32_bits(0.5f)); /* scale_z */ + cmd_push(f32_bits(width / 2.0f)); /* translate_x */ + cmd_push(f32_bits(height / 2.0f)); /* translate_y */ + cmd_push(f32_bits(0.5f)); /* translate_z */ +} + +static void cmd_create_surface(uint32_t handle, uint32_t res_handle, + uint32_t fmt, uint32_t level, uint32_t layers) +{ + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SURFACE, 5)); + cmd_push(handle); + cmd_push(res_handle); + cmd_push(fmt); + cmd_push(level); + cmd_push(layers); /* first_layer | (last_layer << 16) */ +} + +static void cmd_set_framebuffer_state(uint32_t zsurf_handle, + int nr_cbufs, uint32_t cbuf_handles[]) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_FRAMEBUFFER_STATE, 0, nr_cbufs + 2)); + cmd_push(nr_cbufs); + cmd_push(zsurf_handle); + for (int i = 0; i < nr_cbufs; i++) + cmd_push(cbuf_handles[i]); +} + +static void cmd_clear_color(float r, float g, float b, float a) +{ + cmd_push(cmd0(VIRGL_CCMD_CLEAR, 0, 8)); + cmd_push(PIPE_CLEAR_COLOR0); /* buffers = 
0x04 */ + cmd_push(f32_bits(r)); + cmd_push(f32_bits(g)); + cmd_push(f32_bits(b)); + cmd_push(f32_bits(a)); + cmd_push(0x00000000); /* depth f64 low */ + cmd_push(0x3FF00000); /* depth f64 high = 1.0 */ + cmd_push(0); /* stencil */ +} + +/* Create sampler view for a TEXTURE_2D resource. + * CRITICAL: bits [24:31] of the format DWORD must contain PIPE_TEXTURE_2D << 24. + * Without this, the host creates a BUFFER-targeted sampler view and you get BLACK. */ +static void cmd_create_sampler_view(uint32_t handle, uint32_t res_handle, + uint32_t format, uint32_t first_level, + uint32_t last_level, uint32_t swizzle_r, + uint32_t swizzle_g, uint32_t swizzle_b, + uint32_t swizzle_a) +{ + /* Format DWORD encoding: + * bits [5:0] = PIPE_FORMAT + * bits [24:31] = texture target (PIPE_TEXTURE_2D = 2) + * Swizzle DWORD encoding: + * bits [2:0] = swizzle_r + * bits [5:3] = swizzle_g + * bits [8:6] = swizzle_b + * bits [11:9] = swizzle_a + */ + uint32_t format_dw = format | (PIPE_TEXTURE_2D << 24); + uint32_t swizzle_dw = swizzle_r | (swizzle_g << 3) | (swizzle_b << 6) | (swizzle_a << 9); + + cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SAMPLER_VIEW, 6)); + cmd_push(handle); + cmd_push(res_handle); + cmd_push(format_dw); + cmd_push(first_level | (last_level << 8)); /* first_element / first_level + last_element / last_level */ + cmd_push(swizzle_dw); + cmd_push(0); /* buffer_offset (unused for TEXTURE_2D) */ +} + +/* Bind sampler views to a shader stage */ +static void cmd_set_sampler_views(uint32_t shader_type, int count, + uint32_t view_handles[]) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_SAMPLER_VIEWS, 0, count + 2)); + cmd_push(shader_type); + cmd_push(0); /* start_slot */ + for (int i = 0; i < count; i++) + cmd_push(view_handles[i]); +} + +/* Create sampler state (texture filtering) */ +static void cmd_create_sampler_state(uint32_t handle, + uint32_t wrap_s, uint32_t wrap_t, uint32_t wrap_r, + uint32_t min_filter, uint32_t mag_filter, + uint32_t mip_filter) +{ + /* S0 encoding 
(from virglrenderer):
+ * bits [2:0] = wrap_s
+ * bits [5:3] = wrap_t
+ * bits [8:6] = wrap_r
+ * bits [11:9] = min_img_filter
+ * bits [14:12] = min_mip_filter
+ * bits [17:15] = mag_img_filter
+ * bits [20:18] = compare_mode
+ * bits [23:21] = compare_func
+ * bit 24 = seamless_cube_map
+ */
+ uint32_t s0 = wrap_s | (wrap_t << 3) | (wrap_r << 6)
+ | (min_filter << 9) | (mip_filter << 12) | (mag_filter << 15);
+
+ cmd_push(cmd0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJ_SAMPLER_STATE, 5));
+ cmd_push(handle);
+ cmd_push(s0);
+ cmd_push(0); /* lod_bias (float) */
+ cmd_push(0); /* min_lod (float) */
+ cmd_push(f32_bits(1000.0f)); /* max_lod */
+}
+
+/* Bind sampler states */
+static void cmd_bind_sampler_states(uint32_t shader_type, int count,
+ uint32_t state_handles[])
+{
+ /* Dedicated command VIRGL_CCMD_BIND_SAMPLER_STATES (opcode 3) — not a
+ * CREATE/BIND_OBJECT; payload: shader_type, start_slot, then handles. */
+ cmd_push(cmd0(3, 0, count + 2)); /* VIRGL_CCMD_BIND_SAMPLER_STATES = 3 */
+ cmd_push(shader_type);
+ cmd_push(0); /* start_slot */
+ for (int i = 0; i < count; i++)
+ cmd_push(state_handles[i]);
+}
+
+/* RESOURCE_INLINE_WRITE: write data directly into a VirGL resource.
+ * Used for vertex buffer data.
*/ +static void cmd_resource_inline_write(uint32_t res_handle, uint32_t level, + uint32_t usage, uint32_t stride, + uint32_t layer_stride, + uint32_t x, uint32_t y, uint32_t z, + uint32_t w, uint32_t h, uint32_t d, + const void *data, uint32_t data_bytes) +{ + uint32_t data_dwords = (data_bytes + 3) / 4; + cmd_push(cmd0(VIRGL_CCMD_RESOURCE_INLINE_WRITE, 0, 11 + data_dwords)); + cmd_push(res_handle); + cmd_push(level); + cmd_push(usage); + cmd_push(stride); + cmd_push(layer_stride); + cmd_push(x); + cmd_push(y); + cmd_push(z); + cmd_push(w); + cmd_push(h); + cmd_push(d); + /* Copy data as DWORDs */ + const uint8_t *bytes = (const uint8_t *)data; + for (uint32_t i = 0; i < data_dwords; i++) { + uint32_t dw = 0; + for (int b = 0; b < 4; b++) { + uint32_t idx = i * 4 + b; + if (idx < data_bytes) + dw |= ((uint32_t)bytes[idx]) << (b * 8); + } + cmd_push(dw); + } +} + +/* SET_VERTEX_BUFFERS: bind vertex buffers for drawing */ +static void cmd_set_vertex_buffers(int count, uint32_t strides[], + uint32_t offsets[], uint32_t res_handles[]) +{ + cmd_push(cmd0(VIRGL_CCMD_SET_VERTEX_BUFFERS, 0, count * 3)); + for (int i = 0; i < count; i++) { + cmd_push(strides[i]); + cmd_push(offsets[i]); + cmd_push(res_handles[i]); + } +} + +/* DRAW_VBO */ +static void cmd_draw_vbo(uint32_t start, uint32_t count, uint32_t mode, + uint32_t indexed, uint32_t instance_count, + uint32_t min_index, uint32_t max_index) +{ + cmd_push(cmd0(VIRGL_CCMD_DRAW_VBO, 0, 12)); + cmd_push(start); + cmd_push(count); + cmd_push(mode); + cmd_push(indexed); + cmd_push(instance_count); + cmd_push(0); /* index_bias */ + cmd_push(0); /* start_instance */ + cmd_push(0); /* primitive_restart */ + cmd_push(0); /* restart_index */ + cmd_push(min_index); + cmd_push(max_index); + cmd_push(0); /* cso (unused) */ +} + +/* ========================================================================= + * Hex dump + * ========================================================================= */ + +static void hex_dump_dwords(const 
char *label, const uint32_t *data, int count) +{ + printf("[hex-dump] %s (%d DWORDs, %d bytes):\n", label, count, count * 4); + for (int i = 0; i < count; i++) { + printf("[hex-dump] %s +%03d (0x%03X): 0x%08X\n", label, i * 4, i * 4, data[i]); + } + printf("[hex-dump] %s END\n\n", label); +} + +static void hex_dump_resource_create(const char *label, + const struct drm_virtgpu_resource_create *rc) +{ + printf("[hex-dump] %s:\n", label); + printf("[hex-dump] target = 0x%08X (%u)\n", rc->target, rc->target); + printf("[hex-dump] format = 0x%08X (%u)\n", rc->format, rc->format); + printf("[hex-dump] bind = 0x%08X\n", rc->bind); + printf("[hex-dump] width = %u\n", rc->width); + printf("[hex-dump] height = %u\n", rc->height); + printf("[hex-dump] depth = %u\n", rc->depth); + printf("[hex-dump] array_size = %u\n", rc->array_size); + printf("[hex-dump] last_level = %u\n", rc->last_level); + printf("[hex-dump] nr_samples = %u\n", rc->nr_samples); + printf("[hex-dump] flags = 0x%08X\n", rc->flags); + printf("[hex-dump] bo_handle = %u (output)\n", rc->bo_handle); + printf("[hex-dump] res_handle = %u (output)\n", rc->res_handle); + printf("[hex-dump] size = %u (output)\n", rc->size); + printf("[hex-dump] stride = %u (output)\n", rc->stride); + printf("\n"); +} + +/* ========================================================================= + * DRM helpers + * ========================================================================= */ + +static int drm_fd = -1; +static uint32_t conn_id, crtc_id; +static drmModeModeInfo mode; +static drmModeCrtcPtr saved_crtc; + +static int find_drm_device(void) +{ + const char *cards[] = {"/dev/dri/card0", "/dev/dri/card1", NULL}; + + for (int i = 0; cards[i]; i++) { + int fd = open(cards[i], O_RDWR | O_CLOEXEC); + if (fd < 0) + continue; + + if (drmSetMaster(fd) < 0) { + close(fd); + continue; + } + + drmModeResPtr res = drmModeGetResources(fd); + if (!res) { + close(fd); + continue; + } + + /* Find connected connector */ + drmModeConnectorPtr 
conn = NULL; + for (int c = 0; c < res->count_connectors; c++) { + conn = drmModeGetConnector(fd, res->connectors[c]); + if (conn && conn->connection == DRM_MODE_CONNECTED && + conn->count_modes > 0) { + break; + } + if (conn) drmModeFreeConnector(conn); + conn = NULL; + } + + if (!conn) { + drmModeFreeResources(res); + close(fd); + continue; + } + + conn_id = conn->connector_id; + mode = conn->modes[0]; /* preferred mode */ + + /* Find CRTC */ + drmModeEncoderPtr enc = NULL; + if (conn->encoder_id) + enc = drmModeGetEncoder(fd, conn->encoder_id); + if (!enc && res->count_encoders > 0) + enc = drmModeGetEncoder(fd, res->encoders[0]); + + if (enc) { + crtc_id = enc->crtc_id; + if (!crtc_id && res->count_crtcs > 0) + crtc_id = res->crtcs[0]; + drmModeFreeEncoder(enc); + } else if (res->count_crtcs > 0) { + crtc_id = res->crtcs[0]; + } + + saved_crtc = drmModeGetCrtc(fd, crtc_id); + + printf("DRM: %s -- %s %ux%u@%u\n", cards[i], + conn->connector_type_id ? "connected" : "?", + mode.hdisplay, mode.vdisplay, mode.vrefresh); + printf("DRM: connector=%u, crtc=%u\n", conn_id, crtc_id); + + drmModeFreeConnector(conn); + drmModeFreeResources(res); + drm_fd = fd; + return 0; + } + + fprintf(stderr, "No DRM device found\n"); + return -1; +} + +/* ========================================================================= + * VirtGPU resource + execbuffer wrappers + * ========================================================================= */ + +static int virtgpu_resource_create(struct drm_virtgpu_resource_create *rc) +{ + int ret = drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE, rc); + if (ret < 0) { + fprintf(stderr, "RESOURCE_CREATE failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +static int virtgpu_execbuffer(uint32_t *cmds, int dword_count, + uint32_t *bo_handles, int num_bos) +{ + struct drm_virtgpu_execbuffer eb; + memset(&eb, 0, sizeof(eb)); + eb.size = dword_count * 4; + eb.command = (uint64_t)(uintptr_t)cmds; + if (num_bos > 0) { + eb.bo_handles 
= (uint64_t)(uintptr_t)bo_handles; + eb.num_bo_handles = num_bos; + } + eb.fence_fd = -1; + + int ret = drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &eb); + if (ret < 0) { + fprintf(stderr, "EXECBUFFER failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +static int virtgpu_wait(uint32_t bo_handle) +{ + struct drm_virtgpu_3d_wait wait; + memset(&wait, 0, sizeof(wait)); + wait.handle = bo_handle; + wait.flags = 0; + return drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_WAIT, &wait); +} + +static int virtgpu_transfer_from_host(uint32_t bo_handle, uint32_t stride, + uint32_t width, uint32_t height) +{ + drm_virtgpu_3d_transfer_from_host xfer; + memset(&xfer, 0, sizeof(xfer)); + xfer.bo_handle = bo_handle; + xfer.stride = stride; + xfer.box.w = width; + xfer.box.h = height; + xfer.box.d = 1; + return drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST, &xfer); +} + +static int virtgpu_transfer_to_host(uint32_t bo_handle, uint32_t stride, + uint32_t width, uint32_t height) +{ + struct drm_virtgpu_3d_transfer_to_host xfer; + memset(&xfer, 0, sizeof(xfer)); + xfer.bo_handle = bo_handle; + xfer.stride = stride; + xfer.box.w = width; + xfer.box.h = height; + xfer.box.d = 1; + return drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST, &xfer); +} + +/* ========================================================================= + * Texture dimensions and quad positions + * ========================================================================= */ + +#define TEX_W 400 +#define TEX_H 300 + +/* Quad A: pixels (100,100) to (500,400) — shows texture A (RED) */ +#define QUAD_A_X0 100 +#define QUAD_A_Y0 100 +#define QUAD_A_X1 500 +#define QUAD_A_Y1 400 + +/* Quad B: pixels (600,100) to (1000,400) — shows texture B (BLUE) */ +#define QUAD_B_X0 600 +#define QUAD_B_Y0 100 +#define QUAD_B_X1 1000 +#define QUAD_B_Y1 400 + +/* Pixel sample points for verification */ +#define SAMPLE_RED_X 300 /* center of quad A */ +#define SAMPLE_RED_Y 250 +#define SAMPLE_BLUE_X 800 /* center of 
quad B */ +#define SAMPLE_BLUE_Y 250 +#define SAMPLE_GRAY_X 50 /* background area */ +#define SAMPLE_GRAY_Y 50 + +/* ========================================================================= + * VirGL object handle allocation + * + * CRITICAL: VirGL object handles must be globally unique across ALL types. + * virglrenderer uses a single hash table per sub-context. + * + * We use separate ranges to avoid collisions: + * Surfaces: 1-10 + * Blend: 11 + * DSA: 12 + * Rasterizer: 13 + * VS: 14 + * FS (color): 15 (for clear batches — unused in composite) + * FS (texture): 16 + * Vertex elements: 17 + * Sampler view A: 18 + * Sampler view B: 19 + * Sampler state: 20 + * VB resource: created via DRM as resource 4 + * ========================================================================= */ + +#define HANDLE_SURFACE_A 1 /* surface for texture A (render-to) */ +#define HANDLE_SURFACE_B 2 /* surface for texture B (render-to) */ +#define HANDLE_SURFACE_DISP 3 /* surface for display resource (composite target) */ +#define HANDLE_BLEND 11 +#define HANDLE_DSA 12 +#define HANDLE_RASTERIZER 13 +#define HANDLE_VS 14 +#define HANDLE_FS_TEXTURE 16 +#define HANDLE_VE 17 +#define HANDLE_SAMPLER_VIEW_A 18 +#define HANDLE_SAMPLER_VIEW_B 19 +#define HANDLE_SAMPLER_STATE 20 + +/* ========================================================================= + * Vertex data helpers + * ========================================================================= */ + +/* Convert pixel coordinates to NDC (-1 to +1). + * Note: Y is flipped (OpenGL convention: bottom = -1, top = +1). 
+ * ndc_x = (pixel_x / screen_w) * 2.0 - 1.0 + * ndc_y = 1.0 - (pixel_y / screen_h) * 2.0 + */ +typedef struct { + float pos[4]; /* x, y, z, w */ + float tex[4]; /* s, t, 0, 1 */ +} vertex_t; + +static void make_quad_vertices(vertex_t verts[4], + float px0, float py0, float px1, float py1, + float screen_w, float screen_h) +{ + float x0 = (px0 / screen_w) * 2.0f - 1.0f; + float x1 = (px1 / screen_w) * 2.0f - 1.0f; + float y0 = 1.0f - (py0 / screen_h) * 2.0f; /* top (higher Y in pixels = lower in NDC) */ + float y1 = 1.0f - (py1 / screen_h) * 2.0f; /* bottom */ + + /* TRIANGLE_STRIP order: top-left, top-right, bottom-left, bottom-right */ + /* Vertex 0: top-left */ + verts[0] = (vertex_t){{ x0, y0, 0.0f, 1.0f }, { 0.0f, 0.0f, 0.0f, 1.0f }}; + /* Vertex 1: top-right */ + verts[1] = (vertex_t){{ x1, y0, 0.0f, 1.0f }, { 1.0f, 0.0f, 0.0f, 1.0f }}; + /* Vertex 2: bottom-left */ + verts[2] = (vertex_t){{ x0, y1, 0.0f, 1.0f }, { 0.0f, 1.0f, 0.0f, 1.0f }}; + /* Vertex 3: bottom-right */ + verts[3] = (vertex_t){{ x1, y1, 0.0f, 1.0f }, { 1.0f, 1.0f, 0.0f, 1.0f }}; +} + +/* ========================================================================= + * main + * ========================================================================= */ + +int main(void) +{ + printf("=== VirGL Multi-Texture Compositing Test ===\n\n"); + + /* Step 1: Find DRM device */ + if (find_drm_device() < 0) + return 1; + + uint32_t width = mode.hdisplay; + uint32_t height = mode.vdisplay; + printf("Resolution: %ux%u\n\n", width, height); + + /* ===================================================================== + * Step 2: Create resources + * ===================================================================== */ + + /* Resource 1: Display surface (composited output) — 1920x1200, SCANOUT */ + struct drm_virtgpu_resource_create rc_disp; + memset(&rc_disp, 0, sizeof(rc_disp)); + rc_disp.target = PIPE_TEXTURE_2D; + rc_disp.format = PIPE_FORMAT_B8G8R8X8_UNORM; + rc_disp.bind = PIPE_BIND_RENDER_TARGET | 
PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED; + rc_disp.width = width; + rc_disp.height = height; + rc_disp.depth = 1; + rc_disp.array_size = 1; + + printf("=== Creating display resource (res 1: %ux%u) ===\n", width, height); + hex_dump_resource_create("RESOURCE_CREATE display", &rc_disp); + if (virtgpu_resource_create(&rc_disp) < 0) return 1; + printf("Display resource: bo=%u res=%u stride=%u size=%u\n\n", + rc_disp.bo_handle, rc_disp.res_handle, rc_disp.stride, rc_disp.size); + + /* Resource 2: Texture A (RED window) — 400x300, no SCANOUT */ + struct drm_virtgpu_resource_create rc_texA; + memset(&rc_texA, 0, sizeof(rc_texA)); + rc_texA.target = PIPE_TEXTURE_2D; + rc_texA.format = PIPE_FORMAT_B8G8R8X8_UNORM; + rc_texA.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; + rc_texA.width = TEX_W; + rc_texA.height = TEX_H; + rc_texA.depth = 1; + rc_texA.array_size = 1; + + printf("=== Creating texture A (res 2: %ux%u) ===\n", TEX_W, TEX_H); + hex_dump_resource_create("RESOURCE_CREATE texA", &rc_texA); + if (virtgpu_resource_create(&rc_texA) < 0) return 1; + printf("Texture A: bo=%u res=%u stride=%u size=%u\n\n", + rc_texA.bo_handle, rc_texA.res_handle, rc_texA.stride, rc_texA.size); + + /* Resource 3: Texture B (BLUE window) — 400x300, no SCANOUT */ + struct drm_virtgpu_resource_create rc_texB; + memset(&rc_texB, 0, sizeof(rc_texB)); + rc_texB.target = PIPE_TEXTURE_2D; + rc_texB.format = PIPE_FORMAT_B8G8R8X8_UNORM; + rc_texB.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; + rc_texB.width = TEX_W; + rc_texB.height = TEX_H; + rc_texB.depth = 1; + rc_texB.array_size = 1; + + printf("=== Creating texture B (res 3: %ux%u) ===\n", TEX_W, TEX_H); + hex_dump_resource_create("RESOURCE_CREATE texB", &rc_texB); + if (virtgpu_resource_create(&rc_texB) < 0) return 1; + printf("Texture B: bo=%u res=%u stride=%u size=%u\n\n", + rc_texB.bo_handle, rc_texB.res_handle, rc_texB.stride, rc_texB.size); + + /* Resource 4: Vertex buffer (PIPE_BUFFER, 
VERTEX_BUFFER bind) */ + struct drm_virtgpu_resource_create rc_vb; + memset(&rc_vb, 0, sizeof(rc_vb)); + rc_vb.target = PIPE_BUFFER; + rc_vb.format = PIPE_FORMAT_R32G32B32A32_FLOAT; /* doesn't matter for buffers, but Mesa uses this */ + rc_vb.bind = PIPE_BIND_VERTEX_BUFFER; + rc_vb.width = 4096; /* size in bytes (width for PIPE_BUFFER) */ + rc_vb.height = 1; + rc_vb.depth = 1; + rc_vb.array_size = 1; + + printf("=== Creating vertex buffer resource (res 4: buffer, 4096 bytes) ===\n"); + hex_dump_resource_create("RESOURCE_CREATE VB", &rc_vb); + if (virtgpu_resource_create(&rc_vb) < 0) return 1; + printf("VB resource: bo=%u res=%u\n\n", + rc_vb.bo_handle, rc_vb.res_handle); + + /* Collect all BO handles for EXECBUFFER */ + uint32_t all_bos[4] = { + rc_disp.bo_handle, + rc_texA.bo_handle, + rc_texB.bo_handle, + rc_vb.bo_handle + }; + + /* ===================================================================== + * Step 2b: Prime all TEXTURE_2D resources with TRANSFER_TO_HOST + * + * CRITICAL: Parallels requires an initial TRANSFER_TO_HOST_3D to + * establish the host-side buffer before any VirGL rendering will + * produce visible results. Without this "priming" step, SUBMIT_3D + * rendering targets a non-existent host buffer and produces black. + * ===================================================================== */ + printf("=== Priming resources with TRANSFER_TO_HOST ===\n"); + { + uint32_t disp_stride = rc_disp.stride; + if (disp_stride == 0) disp_stride = width * 4; + int r1 = virtgpu_transfer_to_host(rc_disp.bo_handle, disp_stride, width, height); + printf(" Prime display (res %u, bo %u): %s\n", rc_disp.res_handle, rc_disp.bo_handle, + r1 < 0 ? "FAILED" : "OK"); + + uint32_t tex_stride = rc_texA.stride; + if (tex_stride == 0) tex_stride = TEX_W * 4; + int r2 = virtgpu_transfer_to_host(rc_texA.bo_handle, tex_stride, TEX_W, TEX_H); + printf(" Prime texA (res %u, bo %u): %s\n", rc_texA.res_handle, rc_texA.bo_handle, + r2 < 0 ? 
"FAILED" : "OK"); + + int r3 = virtgpu_transfer_to_host(rc_texB.bo_handle, tex_stride, TEX_W, TEX_H); + printf(" Prime texB (res %u, bo %u): %s\n", rc_texB.res_handle, rc_texB.bo_handle, + r3 < 0 ? "FAILED" : "OK"); + } + printf("\n"); + + /* ===================================================================== + * Step 3: Render to Texture A (RED) + * + * Each SUBMIT_3D batch must start with create_sub_ctx(1) + set_sub_ctx(1). + * Objects do NOT survive create_sub_ctx — must recreate everything. + * ===================================================================== */ + + printf("=== Batch 1: Render RED to Texture A ===\n"); + cmd_reset(); + + cmd_create_sub_ctx(1); + cmd_set_sub_ctx(1); + cmd_set_tweaks(1, 1); + cmd_set_tweaks(2, TEX_W); + + /* Create surface for texture A's resource, set as framebuffer, clear RED */ + cmd_create_surface(HANDLE_SURFACE_A, rc_texA.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, 0, 0); + { + uint32_t cbufs[] = { HANDLE_SURFACE_A }; + cmd_set_framebuffer_state(0, 1, cbufs); + } + cmd_clear_color(1.0f, 0.0f, 0.0f, 1.0f); /* RED */ + + hex_dump_dwords("BATCH_1_CLEAR_RED", cmd_buf, cmd_len); + + if (virtgpu_execbuffer(cmd_buf, cmd_len, all_bos, 4) < 0) return 1; + virtgpu_wait(rc_texA.bo_handle); + printf("Batch 1 (RED clear to texA): OK\n\n"); + + /* ===================================================================== + * Step 4: Render to Texture B (BLUE) + * ===================================================================== */ + + printf("=== Batch 2: Render BLUE to Texture B ===\n"); + cmd_reset(); + + cmd_create_sub_ctx(1); + cmd_set_sub_ctx(1); + cmd_set_tweaks(1, 1); + cmd_set_tweaks(2, TEX_W); + + /* Create surface for texture B's resource, set as framebuffer, clear BLUE */ + cmd_create_surface(HANDLE_SURFACE_B, rc_texB.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, 0, 0); + { + uint32_t cbufs[] = { HANDLE_SURFACE_B }; + cmd_set_framebuffer_state(0, 1, cbufs); + } + cmd_clear_color(0.0f, 0.0f, 1.0f, 1.0f); /* BLUE */ + + 
hex_dump_dwords("BATCH_2_CLEAR_BLUE", cmd_buf, cmd_len); + + if (virtgpu_execbuffer(cmd_buf, cmd_len, all_bos, 4) < 0) return 1; + virtgpu_wait(rc_texB.bo_handle); + printf("Batch 2 (BLUE clear to texB): OK\n\n"); + + /* ===================================================================== + * Step 5: Composite both textures onto display resource + * + * This is the key batch that proves multi-texture sampling works: + * 1. Clear display to dark gray + * 2. Draw textured quad sampling from texture A at left position + * 3. Switch sampler view to texture B, draw quad at right position + * ===================================================================== */ + + printf("=== Batch 3: Composite both textures onto display ===\n"); + cmd_reset(); + + /* --- Sub-context setup --- */ + cmd_create_sub_ctx(1); + cmd_set_sub_ctx(1); + cmd_set_tweaks(1, 1); + cmd_set_tweaks(2, width); + + /* --- Create display surface and set as framebuffer --- */ + cmd_create_surface(HANDLE_SURFACE_DISP, rc_disp.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, 0, 0); + { + uint32_t cbufs[] = { HANDLE_SURFACE_DISP }; + cmd_set_framebuffer_state(0, 1, cbufs); + } + + /* --- Clear display to dark gray background (0.2, 0.2, 0.2) --- */ + cmd_clear_color(0.2f, 0.2f, 0.2f, 1.0f); + + /* --- Create pipeline state objects --- */ + cmd_create_blend_simple(HANDLE_BLEND); + cmd_bind_object(HANDLE_BLEND, VIRGL_OBJ_BLEND); + + cmd_create_dsa_disabled(HANDLE_DSA); + cmd_bind_object(HANDLE_DSA, VIRGL_OBJ_DSA); + + cmd_create_rasterizer_default(HANDLE_RASTERIZER); + cmd_bind_object(HANDLE_RASTERIZER, VIRGL_OBJ_RASTERIZER); + + /* --- Create and bind shaders --- */ + /* Vertex shader: passthrough position + texcoord */ + const char *vs_text = + "VERT\n" + "DCL IN[0]\n" + "DCL IN[1]\n" + "DCL OUT[0], POSITION\n" + "DCL OUT[1], GENERIC[0]\n" + "MOV OUT[0], IN[0]\n" + "MOV OUT[1], IN[1]\n" + "END\n"; + + /* Fragment shader: sample texture and output */ + const char *fs_text = + "FRAG\n" + "PROPERTY 
FS_COLOR0_WRITES_ALL_CBUFS 1\n" + "DCL IN[0], GENERIC[0], PERSPECTIVE\n" + "DCL OUT[0], COLOR\n" + "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" + "TEX OUT[0], IN[0], SAMP[0], 2D\n" + "END\n"; + + cmd_create_shader(HANDLE_VS, PIPE_SHADER_VERTEX, vs_text); + cmd_bind_shader(HANDLE_VS, PIPE_SHADER_VERTEX); + + cmd_create_shader(HANDLE_FS_TEXTURE, PIPE_SHADER_FRAGMENT, fs_text); + cmd_bind_shader(HANDLE_FS_TEXTURE, PIPE_SHADER_FRAGMENT); + + /* --- Create vertex elements (2 attributes: position + texcoord) --- + * Each vertex has 8 floats: 4 for position, 4 for texcoord. + * Attribute 0: offset=0, format=R32G32B32A32_FLOAT (position) + * Attribute 1: offset=16, format=R32G32B32A32_FLOAT (texcoord) + */ + { + uint32_t offsets[] = { 0, 16 }; + uint32_t divisors[] = { 0, 0 }; + uint32_t vb_indices[] = { 0, 0 }; + uint32_t formats[] = { PIPE_FORMAT_R32G32B32A32_FLOAT, + PIPE_FORMAT_R32G32B32A32_FLOAT }; + cmd_create_vertex_elements(HANDLE_VE, 2, offsets, divisors, vb_indices, formats); + } + cmd_bind_object(HANDLE_VE, VIRGL_OBJ_VERTEX_ELEMENTS); + + /* --- Set viewport to full display --- */ + cmd_set_viewport((float)width, (float)height); + + /* --- Create sampler state (LINEAR filtering) --- */ + /* wrap modes: CLAMP_TO_EDGE = 2 */ + cmd_create_sampler_state(HANDLE_SAMPLER_STATE, 2, 2, 2, + PIPE_TEX_FILTER_LINEAR, PIPE_TEX_FILTER_LINEAR, 0); + { + uint32_t states[] = { HANDLE_SAMPLER_STATE }; + cmd_bind_sampler_states(PIPE_SHADER_FRAGMENT, 1, states); + } + + /* --- Bind vertex buffer resource --- */ + { + uint32_t strides[] = { sizeof(vertex_t) }; /* 32 bytes per vertex */ + uint32_t offsets[] = { 0 }; + uint32_t res_handles[] = { rc_vb.res_handle }; + cmd_set_vertex_buffers(1, strides, offsets, res_handles); + } + + /* ---- Draw Quad A (texture A = RED) at left position ---- */ + + /* Create sampler view for texture A. 
+ * Swizzle: identity (R=0, G=1, B=2, A=3) */ + cmd_create_sampler_view(HANDLE_SAMPLER_VIEW_A, rc_texA.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, + 0, 0, /* first_level, last_level */ + 0, 1, 2, 3); /* RGBA identity swizzle */ + { + uint32_t views[] = { HANDLE_SAMPLER_VIEW_A }; + cmd_set_sampler_views(PIPE_SHADER_FRAGMENT, 1, views); + } + + /* Upload vertex data for quad A via RESOURCE_INLINE_WRITE */ + { + vertex_t verts[4]; + make_quad_vertices(verts, + (float)QUAD_A_X0, (float)QUAD_A_Y0, + (float)QUAD_A_X1, (float)QUAD_A_Y1, + (float)width, (float)height); + + printf("Quad A vertices (NDC):\n"); + for (int i = 0; i < 4; i++) { + printf(" v%d: pos=(%.4f, %.4f, %.4f, %.4f) tex=(%.4f, %.4f, %.4f, %.4f)\n", + i, verts[i].pos[0], verts[i].pos[1], verts[i].pos[2], verts[i].pos[3], + verts[i].tex[0], verts[i].tex[1], verts[i].tex[2], verts[i].tex[3]); + } + + /* Write quad A vertices at offset 0 in the VB resource */ + cmd_resource_inline_write(rc_vb.res_handle, 0, 0, 0, 0, + 0, 0, 0, /* x, y, z */ + sizeof(verts), 1, 1, /* w, h, d (bytes for buffer) */ + verts, sizeof(verts)); + } + + /* Draw quad A: 4 vertices, TRIANGLE_STRIP */ + cmd_draw_vbo(0, 4, PIPE_PRIM_TRIANGLE_STRIP, 0, 1, 0, 3); + + /* ---- Draw Quad B (texture B = BLUE) at right position ---- */ + + /* Create sampler view for texture B */ + cmd_create_sampler_view(HANDLE_SAMPLER_VIEW_B, rc_texB.res_handle, + PIPE_FORMAT_B8G8R8X8_UNORM, + 0, 0, + 0, 1, 2, 3); + { + uint32_t views[] = { HANDLE_SAMPLER_VIEW_B }; + cmd_set_sampler_views(PIPE_SHADER_FRAGMENT, 1, views); + } + + /* Upload vertex data for quad B via RESOURCE_INLINE_WRITE */ + { + vertex_t verts[4]; + make_quad_vertices(verts, + (float)QUAD_B_X0, (float)QUAD_B_Y0, + (float)QUAD_B_X1, (float)QUAD_B_Y1, + (float)width, (float)height); + + printf("Quad B vertices (NDC):\n"); + for (int i = 0; i < 4; i++) { + printf(" v%d: pos=(%.4f, %.4f, %.4f, %.4f) tex=(%.4f, %.4f, %.4f, %.4f)\n", + i, verts[i].pos[0], verts[i].pos[1], verts[i].pos[2], 
verts[i].pos[3], + verts[i].tex[0], verts[i].tex[1], verts[i].tex[2], verts[i].tex[3]); + } + + /* Write quad B vertices at offset 128 to avoid overwriting quad A + * (4 vertices * 32 bytes = 128 bytes for quad A) */ + cmd_resource_inline_write(rc_vb.res_handle, 0, 0, 0, 0, + 128, 0, 0, /* x=128 (byte offset), y, z */ + sizeof(verts), 1, 1, /* w, h, d */ + verts, sizeof(verts)); + } + + /* Re-bind vertex buffer with offset 128 for quad B */ + { + uint32_t strides[] = { sizeof(vertex_t) }; + uint32_t offsets[] = { 128 }; + uint32_t res_handles[] = { rc_vb.res_handle }; + cmd_set_vertex_buffers(1, strides, offsets, res_handles); + } + + /* Draw quad B: 4 vertices, TRIANGLE_STRIP */ + cmd_draw_vbo(0, 4, PIPE_PRIM_TRIANGLE_STRIP, 0, 1, 0, 3); + + hex_dump_dwords("BATCH_3_COMPOSITE", cmd_buf, cmd_len); + + if (virtgpu_execbuffer(cmd_buf, cmd_len, all_bos, 4) < 0) return 1; + virtgpu_wait(rc_disp.bo_handle); + printf("Batch 3 (composite both textures): OK\n\n"); + + /* ===================================================================== + * Step 6: Display via DRM KMS + * ===================================================================== */ + + printf("=== Displaying composited result ===\n"); + + /* TRANSFER_FROM_HOST to pull GPU-rendered content into guest backing for DRM display */ + uint32_t disp_stride = rc_disp.stride; + if (disp_stride == 0) disp_stride = width * 4; + + if (virtgpu_transfer_from_host(rc_disp.bo_handle, disp_stride, width, height) < 0) + printf("TRANSFER_FROM_HOST (display readback): failed\n"); + else + printf("TRANSFER_FROM_HOST (display readback): OK\n"); + virtgpu_wait(rc_disp.bo_handle); + + uint32_t fb_id = 0; + int ret = drmModeAddFB(drm_fd, width, height, 24, 32, + disp_stride, rc_disp.bo_handle, &fb_id); + if (ret < 0) { + fprintf(stderr, "drmModeAddFB failed: %s\n", strerror(errno)); + return 1; + } + printf("AddFB: fb_id=%u\n", fb_id); + + ret = drmModeSetCrtc(drm_fd, crtc_id, fb_id, 0, 0, &conn_id, 1, &mode); + if (ret < 0) { + 
fprintf(stderr, "drmModeSetCrtc failed: %s\n", strerror(errno)); + drmModeRmFB(drm_fd, fb_id); + return 1; + } + printf("SetCrtc: OK -- display should show gray background + RED left + BLUE right\n\n"); + + /* Mark dirty to trigger display update */ + { + drmModeClip clip = { 0, 0, (uint16_t)width, (uint16_t)height }; + drmModeDirtyFB(drm_fd, fb_id, &clip, 1); + } + + /* ===================================================================== + * Step 7: Readback + pixel verification + * ===================================================================== */ + + printf("=== Pixel readback verification ===\n"); + + /* TRANSFER_FROM_HOST to get rendered pixels into guest backing */ + if (virtgpu_transfer_from_host(rc_disp.bo_handle, disp_stride, width, height) < 0) { + printf("TRANSFER_FROM_HOST: FAILED\n"); + } else { + printf("TRANSFER_FROM_HOST: OK\n"); + } + virtgpu_wait(rc_disp.bo_handle); + + /* MAP the display resource */ + struct drm_virtgpu_map vmap; + memset(&vmap, 0, sizeof(vmap)); + vmap.handle = rc_disp.bo_handle; + uint32_t *pixels = NULL; + uint32_t map_size = disp_stride * height; + + if (drmIoctl(drm_fd, DRM_IOCTL_VIRTGPU_MAP, &vmap) < 0) { + printf("VIRTGPU_MAP: FAILED -- %s\n", strerror(errno)); + } else { + pixels = mmap(NULL, map_size, PROT_READ | PROT_WRITE, + MAP_SHARED, drm_fd, vmap.offset); + if (pixels == MAP_FAILED) { + printf("mmap: FAILED -- %s\n", strerror(errno)); + pixels = NULL; + } else { + printf("mmap: OK (%u bytes at %p)\n", map_size, (void *)pixels); + } + } + + int pass_count = 0; + int fail_count = 0; + + if (pixels) { + uint32_t stride_px = disp_stride / 4; + + /* Sample pixel at center of quad A — should be RED. + * B8G8R8X8_UNORM byte order: B, G, R, X in memory. 
+ * RED = B=0x00, G=0x00, R=0xFF, X=0xFF => LE u32 = 0xFF0000FF + * Or X might be 0x00 => 0x000000FF + * Actually in B8G8R8X8: byte[0]=B, byte[1]=G, byte[2]=R, byte[3]=X + * As LE uint32: (X << 24) | (R << 16) | (G << 8) | B + * RED: B=0, G=0, R=0xFF => 0x??FF0000 where ?? depends on X channel */ + uint32_t px_red = pixels[SAMPLE_RED_Y * stride_px + SAMPLE_RED_X]; + uint32_t px_blue = pixels[SAMPLE_BLUE_Y * stride_px + SAMPLE_BLUE_X]; + uint32_t px_gray = pixels[SAMPLE_GRAY_Y * stride_px + SAMPLE_GRAY_X]; + + printf("\nPixel samples (B8G8R8X8_UNORM as LE uint32):\n"); + printf(" (%d,%d) = 0x%08X (expect RED: R channel high, B/G low)\n", + SAMPLE_RED_X, SAMPLE_RED_Y, px_red); + printf(" (%d,%d) = 0x%08X (expect BLUE: B channel high, R/G low)\n", + SAMPLE_BLUE_X, SAMPLE_BLUE_Y, px_blue); + printf(" (%d,%d) = 0x%08X (expect GRAY: R=G=B ~0x33)\n", + SAMPLE_GRAY_X, SAMPLE_GRAY_Y, px_gray); + + /* Extract channels from B8G8R8X8_UNORM (LE): + * B = byte 0 = bits [7:0] + * G = byte 1 = bits [15:8] + * R = byte 2 = bits [23:16] + * X = byte 3 = bits [31:24] + */ + #define GET_B(px) ((px) & 0xFF) + #define GET_G(px) (((px) >> 8) & 0xFF) + #define GET_R(px) (((px) >> 16) & 0xFF) + + /* Check RED pixel: R should be high (>= 0xC0), B and G should be low (<= 0x40) */ + uint8_t r_r = GET_R(px_red), r_g = GET_G(px_red), r_b = GET_B(px_red); + printf("\n RED check: R=%u G=%u B=%u ", r_r, r_g, r_b); + if (r_r >= 0xC0 && r_g <= 0x40 && r_b <= 0x40) { + printf("PASS\n"); + pass_count++; + } else { + printf("FAIL\n"); + fail_count++; + } + + /* Check BLUE pixel: B should be high, R and G should be low */ + uint8_t b_r = GET_R(px_blue), b_g = GET_G(px_blue), b_b = GET_B(px_blue); + printf(" BLUE check: R=%u G=%u B=%u ", b_r, b_g, b_b); + if (b_b >= 0xC0 && b_r <= 0x40 && b_g <= 0x40) { + printf("PASS\n"); + pass_count++; + } else { + printf("FAIL\n"); + fail_count++; + } + + /* Check GRAY pixel: R, G, B should all be similar and in ~0x20-0x40 range + * 0.2 * 255 = 51 = 0x33 */ + uint8_t 
g_r = GET_R(px_gray), g_g = GET_G(px_gray), g_b = GET_B(px_gray); + printf(" GRAY check: R=%u G=%u B=%u ", g_r, g_g, g_b); + if (g_r >= 0x20 && g_r <= 0x50 && + g_g >= 0x20 && g_g <= 0x50 && + g_b >= 0x20 && g_b <= 0x50 && + abs((int)g_r - (int)g_g) < 0x10 && + abs((int)g_r - (int)g_b) < 0x10) { + printf("PASS\n"); + pass_count++; + } else { + printf("FAIL\n"); + fail_count++; + } + + /* Print additional diagnostic pixels */ + printf("\nAdditional pixel samples:\n"); + /* Top-left of quad A */ + printf(" (%d,%d) = 0x%08X (quad A top-left)\n", + QUAD_A_X0 + 5, QUAD_A_Y0 + 5, + pixels[(QUAD_A_Y0 + 5) * stride_px + QUAD_A_X0 + 5]); + /* Top-left of quad B */ + printf(" (%d,%d) = 0x%08X (quad B top-left)\n", + QUAD_B_X0 + 5, QUAD_B_Y0 + 5, + pixels[(QUAD_B_Y0 + 5) * stride_px + QUAD_B_X0 + 5]); + /* Between the quads (should be gray) */ + printf(" (550,250) = 0x%08X (between quads, expect gray)\n", + pixels[250 * stride_px + 550]); + /* Bottom-right corner (should be gray) */ + printf(" (%u,%u) = 0x%08X (bottom-right corner)\n", + width - 5, height - 5, + pixels[(height - 5) * stride_px + width - 5]); + + munmap(pixels, map_size); + } else { + printf("Cannot verify pixels -- MAP failed\n"); + fail_count = 3; + } + + /* ===================================================================== + * Final verdict + * ===================================================================== */ + + printf("\n========================================\n"); + if (fail_count == 0 && pass_count == 3) { + printf("MULTI-TEXTURE TEST: PASS (%d/3 checks passed)\n", pass_count); + } else { + printf("MULTI-TEXTURE TEST: FAIL (%d passed, %d failed)\n", pass_count, fail_count); + } + printf("========================================\n\n"); + + /* Hold display for 5 seconds */ + printf("Holding display for 5 seconds...\n"); + sleep(5); + + /* Cleanup */ + if (saved_crtc) { + drmModeSetCrtc(drm_fd, saved_crtc->crtc_id, saved_crtc->buffer_id, + saved_crtc->x, saved_crtc->y, &conn_id, 1, + 
&saved_crtc->mode); + drmModeFreeCrtc(saved_crtc); + } + drmModeRmFB(drm_fd, fb_id); + close(drm_fd); + + printf("Done.\n"); + return (fail_count == 0) ? 0 : 1; +} diff --git a/userspace/programs/src/bcheck.rs b/userspace/programs/src/bcheck.rs index 208a3781..33e42a79 100644 --- a/userspace/programs/src/bcheck.rs +++ b/userspace/programs/src/bcheck.rs @@ -432,22 +432,29 @@ fn main() { let total_h = content_height(&tests); let max_scroll = (total_h - visible_h).max(0); let mut scroll_offset: i32 = 0; + let sleep_ts = libbreenix::types::Timespec { tv_sec: 0, tv_nsec: 50_000_000 }; // 50ms loop { + let mut need_redraw = false; for event in win.poll_events() { match event { Event::KeyPress { keycode, .. } => { match keycode { - 0x52 => scroll_offset = (scroll_offset - ROW_H).max(0), // Up - 0x51 => scroll_offset = (scroll_offset + ROW_H).min(max_scroll), // Down + 0x52 => { scroll_offset = (scroll_offset - ROW_H).max(0); need_redraw = true; } + 0x51 => { scroll_offset = (scroll_offset + ROW_H).min(max_scroll); need_redraw = true; } _ => {} } } + Event::CloseRequested => std::process::exit(0), _ => {} } } - render(win.framebuf(), &tests, scroll_offset); - let _ = win.present(); + if need_redraw { + render(win.framebuf(), &tests, scroll_offset); + let _ = win.present(); + } else { + let _ = time::nanosleep(&sleep_ts); + } } } diff --git a/userspace/programs/src/bwm.rs b/userspace/programs/src/bwm.rs index 4a1e686a..f3b6628c 100644 --- a/userspace/programs/src/bwm.rs +++ b/userspace/programs/src/bwm.rs @@ -188,10 +188,14 @@ struct Window { /// Stable ordering for appbar (assigned at discovery time, never changes) creation_order: u32, /// Direct-mapped pointer to client window's pixel buffer (read-only, MAP_SHARED) + /// Stored for future per-window direct blit (currently compositor uses bulk composite). 
+ #[allow(dead_code)] mapped_ptr: *const u32, /// Client window buffer width (from map_window_buffer) + #[allow(dead_code)] mapped_w: u32, /// Client window buffer height (from map_window_buffer) + #[allow(dead_code)] mapped_h: u32, } @@ -247,11 +251,6 @@ impl Window { } } - -fn rects_overlap(a: (i32, i32, i32, i32), b: (i32, i32, i32, i32)) -> bool { - a.0 < b.2 && a.2 > b.0 && a.1 < b.3 && a.3 > b.1 -} - // ─── Drawing Helpers ───────────────────────────────────────────────────────── fn fill_rect(fb: &mut FrameBuf, x: i32, y: i32, w: usize, h: usize, color: Color) { @@ -624,6 +623,11 @@ fn discover_windows(windows: &mut Vec, screen_w: usize, screen_h: usize, } }; + // Tell kernel where the client content goes on screen (for GPU compositing) + let content_x = cascade_x + BORDER_WIDTH as i32; + let content_y = cascade_y + TITLE_BAR_HEIGHT as i32 + BORDER_WIDTH as i32; + let _ = graphics::set_window_position(info.buffer_id, content_x, content_y); + let order = *next_order; *next_order += 1; windows.push(Window { @@ -640,144 +644,15 @@ fn discover_windows(windows: &mut Vec, screen_w: usize, screen_h: usize, removed || added } -// ─── Client Pixel Blitting ────────────────────────────────────────────────── - -/// Core pixel blit — direct u32 writes to compositor buffer for speed. -/// Bypasses FrameBuf::put_pixel which does per-pixel bounds checking + color conversion. 
-fn blit_pixels_to_fb(fb: &mut FrameBuf, win: &Window, src: &[u32], w: usize, h: usize) { - let cx = win.content_x(); - let cy = win.content_y(); - let cw = win.content_width(); - let ch = win.content_height(); - let pw = w.min(cw); - let ph = h.min(ch); - let fb_w = fb.width; - let fb_h = fb.height; - // Get raw u32 pointer to compositor buffer - let fb_ptr = fb.raw_ptr() as *mut u32; - for row in 0..ph { - let py = (cy + row as i32) as usize; - if py >= fb_h { continue; } - let dst_row_start = py * fb_w; - let src_row_start = row * w; - let x_start = cx.max(0) as usize; - let x_end = ((cx + pw as i32) as usize).min(fb_w); - let src_offset = if cx < 0 { (-cx) as usize } else { 0 }; - if x_start >= x_end { continue; } - let count = x_end - x_start; - let si = src_row_start + src_offset; - if si + count > src.len() { continue; } - unsafe { - core::ptr::copy_nonoverlapping( - src.as_ptr().add(si), - fb_ptr.add(dst_row_start + x_start), - count, - ); - } - } -} - -/// Check if a window has new pixels and blit from mapped memory to compositor. -/// Skips pixels covered by higher-z windows (occluders) so no z-repair is needed. -/// Returns true if new data was available. -fn blit_client_pixels(fb: &mut FrameBuf, win: &Window, - occluders: &[(i32, i32, i32, i32)]) -> bool { - if win.mapped_ptr.is_null() || win.mapped_w == 0 || win.mapped_h == 0 { - return false; - } - let dirty = graphics::check_window_dirty(win.window_id).unwrap_or(false); - if !dirty { return false; } - - if occluders.is_empty() { - blit_mapped_pixels(fb, win); - return true; - } - - // Occluded blit: for each row, skip pixels covered by higher windows. 
- let w = win.mapped_w as usize; - let h = win.mapped_h as usize; - let src = unsafe { core::slice::from_raw_parts(win.mapped_ptr, w * h) }; - - let cx = win.content_x(); - let cy = win.content_y(); - let cw = win.content_width().min(w); - let ch = win.content_height().min(h); - let fb_w = fb.width; - let fb_h = fb.height; - let fb_ptr = fb.raw_ptr() as *mut u32; - - for row in 0..ch { - let py = cy + row as i32; - if py < 0 || py >= fb_h as i32 { continue; } - let row_x_start = cx.max(0) as usize; - let row_x_end = ((cx + cw as i32) as usize).min(fb_w); - if row_x_start >= row_x_end { continue; } - - // Build visible spans by subtracting occluder columns from the full row - let mut spans = [(0usize, 0usize); 8]; - let mut n_spans = 1; - spans[0] = (row_x_start, row_x_end); - - for &(ox0, oy0, ox1, oy1) in occluders { - if py < oy0 || py >= oy1 { continue; } - let os = ox0.max(0) as usize; - let oe = ox1.max(0) as usize; - let mut new_spans = [(0usize, 0usize); 8]; - let mut nc = 0; - for k in 0..n_spans { - let (sx, ex) = spans[k]; - if sx >= ex { continue; } - if oe <= sx || os >= ex { - if nc < 8 { new_spans[nc] = (sx, ex); nc += 1; } - } else { - if sx < os && nc < 8 { new_spans[nc] = (sx, os); nc += 1; } - if ex > oe && nc < 8 { new_spans[nc] = (oe, ex); nc += 1; } - } - } - spans = new_spans; - n_spans = nc; - } - - let src_row = row * w; - let src_col_base = if cx < 0 { (-cx) as usize } else { 0 }; - for k in 0..n_spans { - let (sx, ex) = spans[k]; - if sx >= ex { continue; } - let count = ex - sx; - let si = src_row + src_col_base + (sx - row_x_start); - if si + count > w * h { continue; } - unsafe { - core::ptr::copy_nonoverlapping( - src.as_ptr().add(si), - fb_ptr.add(py as usize * fb_w + sx), - count, - ); - } - } - } - true -} - -/// Blit a window's pixels from its mapped memory to the compositor buffer. 
-fn blit_mapped_pixels(fb: &mut FrameBuf, win: &Window) { - if win.mapped_ptr.is_null() { return; } - let w = win.mapped_w as usize; - let h = win.mapped_h as usize; - let pixel_count = w * h; - let src = unsafe { core::slice::from_raw_parts(win.mapped_ptr, pixel_count) }; - blit_pixels_to_fb(fb, win, src, w, h); -} - /// Redraw all windows in z-order (index 0 = bottom), plus taskbar and app bar. -/// Reads directly from mapped memory (zero-copy from client window pages). +/// Window frames and decorations go into the compositor buffer; GPU compositing +/// handles client content via per-window textured quads. fn redraw_all_windows(fb: &mut FrameBuf, windows: &[Window], focused_win: usize, clock_text: &[u8]) { draw_taskbar(fb, clock_text); for i in 0..windows.len() { if windows[i].minimized { continue; } draw_window_frame(fb, &windows[i], i == focused_win); - if windows[i].window_id != 0 { - blit_mapped_pixels(fb, &windows[i]); - } + // GPU compositing handles client content — don't blit here } draw_appbar(fb, windows, focused_win); } @@ -835,6 +710,7 @@ fn compose_partial_redraw( sbuf[start..end].copy_from_slice(&bg[start..end]); } // 2. 
Redraw UI elements that intersect dirty region + // GPU compositing handles client content — only draw frames/decorations if dy0 < TASKBAR_HEIGHT { draw_taskbar(sfb, clock); } @@ -845,9 +721,6 @@ fn compose_partial_redraw( && (wy1 as usize) > dy0 && (wy0 as usize) < dy1 { draw_window_frame(sfb, &windows[i], i == focused); - if windows[i].window_id != 0 { - blit_mapped_pixels(sfb, &windows[i]); - } } } if dy1 > screen_h - APPBAR_HEIGHT { @@ -861,6 +734,7 @@ fn compose_partial_redraw( } } else { // Non-shadow path: restore bg region, redraw affected windows + // GPU compositing handles client content — only draw frames/decorations for row in dy0..dy1 { let start = row * screen_w + dx0; let end = row * screen_w + dx1; @@ -876,9 +750,6 @@ fn compose_partial_redraw( && (wy1 as usize) > dy0 && (wy0 as usize) < dy1 { draw_window_frame(fb, &windows[i], i == focused); - if windows[i].window_id != 0 { - blit_mapped_pixels(fb, &windows[i]); - } } } if dy1 > screen_h - APPBAR_HEIGHT { @@ -1008,17 +879,6 @@ fn main() { let mut read_buf = [0u8; 512]; let mut poll_fds = [io::PollFd { fd: 0, events: io::poll_events::POLLIN as i16, revents: 0 }]; - // Performance tracing - let mut perf_frame: u64 = 0; - let mut perf_total_ns: u64 = 0; - let mut perf_composites: u64 = 0; - let mut perf_waits: u64 = 0; - - fn mono_ns() -> u64 { - let ts = libbreenix::time::now_monotonic().unwrap_or_default(); - (ts.tv_sec as u64) * 1_000_000_000 + (ts.tv_nsec as u64) - } - // Registry generation tracking for compositor_wait let mut registry_gen: u32 = 0; @@ -1036,9 +896,6 @@ fn main() { // 16ms timeout ensures keyboard input via stdin is checked at least ~60Hz. let (ready, new_reg_gen) = graphics::compositor_wait(16, registry_gen).unwrap_or((0, registry_gen)); registry_gen = new_reg_gen; - perf_waits += 1; - - let t0 = mono_ns(); // ── 1. 
Discover new/removed client windows (only when registry changed) ── if ready & graphics::COMPOSITOR_READY_REGISTRY != 0 { @@ -1121,6 +978,12 @@ fn main() { let (ox0, oy0, ox1, oy1) = windows[win_idx].bounds(); windows[win_idx].x = new_x; windows[win_idx].y = new_y; + // Update kernel window position for GPU compositing + if windows[win_idx].window_id != 0 { + let cx = windows[win_idx].content_x(); + let cy = windows[win_idx].content_y(); + let _ = graphics::set_window_position(windows[win_idx].window_id, cx, cy); + } // Dirty region = union of old and new bounds let (nx0, ny0, nx1, ny1) = windows[win_idx].bounds(); let dr_x0 = ox0.min(nx0).max(0) as usize; @@ -1271,33 +1134,21 @@ fn main() { } } - // ── 5. Blit dirty client window pixels (occluded by higher-z windows) ── + // ── 5. GPU compositing handles window content — just check which are dirty ── // Skip entirely if compositor_wait didn't report dirty content if ready & graphics::COMPOSITOR_READY_DIRTY != 0 { - for i in 0..windows.len().min(16) { - if windows[i].window_id != 0 && !windows[i].minimized { - let mut occ = [(0i32, 0i32, 0i32, 0i32); 16]; - let mut n_occ = 0; - let ib = windows[i].bounds(); - for j in (i + 1)..windows.len().min(16) { - if !windows[j].minimized { - let jb = windows[j].bounds(); - if rects_overlap(ib, jb) && n_occ < 16 { - occ[n_occ] = jb; - n_occ += 1; - } + for i in 0..windows.len().min(16) { + if windows[i].window_id != 0 && !windows[i].minimized { + if graphics::check_window_dirty(windows[i].window_id).unwrap_or(false) { + content_dirty = true; + let (bx0, by0, bx1, by1) = windows[i].bounds(); + dirty_x0 = dirty_x0.min(bx0); + dirty_y0 = dirty_y0.min(by0); + dirty_x1 = dirty_x1.max(bx1); + dirty_y1 = dirty_y1.max(by1); } } - if blit_client_pixels(&mut fb, &windows[i], &occ[..n_occ]) { - content_dirty = true; - let (bx0, by0, bx1, by1) = ib; - dirty_x0 = dirty_x0.min(bx0); - dirty_y0 = dirty_y0.min(by0); - dirty_x1 = dirty_x1.max(bx1); - dirty_y1 = dirty_y1.max(by1); - } } - } } 
// end if DIRTY // ── 5b. Update clock (once per second) ── @@ -1328,7 +1179,6 @@ fn main() { ); full_redraw = false; content_dirty = false; - perf_composites += 1; } else if content_dirty { let sw = screen_w as i32; let sh = screen_h as i32; @@ -1341,7 +1191,6 @@ fn main() { 2, dx, dy, dw, dh, ); content_dirty = false; - perf_composites += 1; } else if mouse_moved_this_frame { // Mouse-only update: no content changed, but kernel draws cursor let _ = graphics::virgl_composite_windows_rect( @@ -1351,19 +1200,5 @@ fn main() { } // No sleep — compositor_wait handles blocking - let t_end = mono_ns(); - - perf_total_ns += t_end.saturating_sub(t0); - perf_frame += 1; - - if perf_frame % 500 == 0 { - let avg_us = perf_total_ns / 1000 / 500; - print!("[bwm-perf] iter={} composites={} waits={} avg_work={}us\n", - perf_frame, perf_composites, perf_waits, avg_us, - ); - perf_total_ns = 0; - perf_composites = 0; - perf_waits = 0; - } } } diff --git a/userspace/programs/src/init.rs b/userspace/programs/src/init.rs index c1616b21..da5d66ce 100644 --- a/userspace/programs/src/init.rs +++ b/userspace/programs/src/init.rs @@ -14,10 +14,10 @@ //! BWM is a pure compositor: it no longer spawns terminals internally. Instead, //! bterm and blog are standalone Breengel GUI apps that register windows with BWM. //! -//! Main loop reaps terminated children with waitpid(WNOHANG) and respawns +//! Main loop blocks on waitpid() until a child exits, then respawns //! crashed services with backoff to prevent tight respawn loops. -use libbreenix::process::{fork, exec, execv, waitpid, getpid, yield_now, ForkResult, WNOHANG}; +use libbreenix::process::{fork, exec, execv, waitpid, getpid, yield_now, ForkResult}; const TELNETD_PATH: &[u8] = b"/sbin/telnetd\0"; const BLOGD_PATH: &[u8] = b"/sbin/blogd\0"; @@ -188,57 +188,61 @@ fn main() { print!("[init] BUSYBOX TEST: cat /etc/passwd\n"); test_busybox_cat(); - // Main loop: reap zombies, respawn crashed services. 
+ // Main loop: block on waitpid until a child exits, then respawn if needed. let mut status: i32 = 0; loop { - match waitpid(-1, &mut status as *mut i32, WNOHANG) { - Ok(reaped_pid) => { - let reaped = reaped_pid.raw() as i64; - if reaped > 0 { - if reaped == bwm_pid { - print!("[init] BWM exited (status {})\n", status); - bwm_pid = try_respawn(BWM_PATH, "bwm", &mut bwm_failures); - if bwm_pid == -1 { - print!("[init] BWM failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == bterm_pid { - print!("[init] bterm exited (status {})\n", status); - bterm_pid = try_respawn(BTERM_PATH, "bterm", &mut bterm_failures); - if bterm_pid == -1 { - print!("[init] bterm failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == blog_pid { - print!("[init] blog exited (status {})\n", status); - blog_pid = try_respawn(BLOG_PATH, "blog", &mut blog_failures); - if blog_pid == -1 { - print!("[init] blog failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == bounce_pid { - print!("[init] bounce exited (status {})\n", status); - bounce_pid = try_respawn(BOUNCE_PATH, "bounce", &mut bounce_failures); - if bounce_pid == -1 { - print!("[init] bounce failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == bcheck_pid { - print!("[init] bcheck exited (status {})\n", status); - // Don't respawn — bcheck runs once then displays results - bcheck_pid = -1; - } else if reaped == blogd_pid { - print!("[init] blogd exited (status {})\n", status); - blogd_pid = try_respawn(BLOGD_PATH, "blogd", &mut blogd_failures); - if blogd_pid == -1 { - print!("[init] blogd failed {} times, giving up\n", MAX_RESPAWN_FAILURES); - } - } else if reaped == telnetd_pid { - telnetd_pid = try_respawn(TELNETD_PATH, "telnetd", &mut telnetd_failures); - if telnetd_pid == -1 && telnetd_failures >= MAX_RESPAWN_FAILURES { - print!("[init] telnetd unavailable, continuing without it\n"); - } - } - } + let reaped = match waitpid(-1, &mut 
status as *mut i32, 0) { + Ok(pid) => pid.raw() as i64, + Err(_) => { + // ECHILD — no children at all. Sleep to avoid spinning. + let ts = libbreenix::types::Timespec { tv_sec: 1, tv_nsec: 0 }; + let _ = libbreenix::time::nanosleep(&ts); + continue; + } + }; + + if reaped <= 0 { + continue; + } + + if reaped == bwm_pid { + print!("[init] BWM exited (status {})\n", status); + bwm_pid = try_respawn(BWM_PATH, "bwm", &mut bwm_failures); + if bwm_pid == -1 { + print!("[init] BWM failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == bterm_pid { + print!("[init] bterm exited (status {})\n", status); + bterm_pid = try_respawn(BTERM_PATH, "bterm", &mut bterm_failures); + if bterm_pid == -1 { + print!("[init] bterm failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == blog_pid { + print!("[init] blog exited (status {})\n", status); + blog_pid = try_respawn(BLOG_PATH, "blog", &mut blog_failures); + if blog_pid == -1 { + print!("[init] blog failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == bounce_pid { + print!("[init] bounce exited (status {})\n", status); + bounce_pid = try_respawn(BOUNCE_PATH, "bounce", &mut bounce_failures); + if bounce_pid == -1 { + print!("[init] bounce failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == bcheck_pid { + print!("[init] bcheck exited (status {})\n", status); + bcheck_pid = -1; + } else if reaped == blogd_pid { + print!("[init] blogd exited (status {})\n", status); + blogd_pid = try_respawn(BLOGD_PATH, "blogd", &mut blogd_failures); + if blogd_pid == -1 { + print!("[init] blogd failed {} times, giving up\n", MAX_RESPAWN_FAILURES); + } + } else if reaped == telnetd_pid { + telnetd_pid = try_respawn(TELNETD_PATH, "telnetd", &mut telnetd_failures); + if telnetd_pid == -1 && telnetd_failures >= MAX_RESPAWN_FAILURES { + print!("[init] telnetd unavailable, continuing without it\n"); } - Err(_) => {} } - let _ = yield_now(); } }