diff --git a/src/converter/mod.rs b/src/converter/mod.rs index 1133922..a8de10c 100644 --- a/src/converter/mod.rs +++ b/src/converter/mod.rs @@ -541,6 +541,12 @@ impl ColorConverter { /// /// # Returns /// Returns `Ok(())` on success. The target_image is transitioned to VIDEO_ENCODE_SRC_KHR. + /// Convert an RGB source image to YUV, writing to the target image. + /// + /// Submits the command buffer and waits synchronously on a fence before + /// returning. The caller is responsible for any further sync between + /// convert and downstream consumers (e.g. an encoder reading the target + /// image). pub fn convert( &mut self, src_image: vk::Image, @@ -858,7 +864,8 @@ impl ColorConverter { .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; } - // Submit and wait. + // Submit and wait synchronously on the fence — no semaphore overlap + // with the encoder; the caller is responsible for any further sync. unsafe { device .reset_fences(&[self.fence]) diff --git a/src/encoder/av1/api.rs b/src/encoder/av1/api.rs index 224a989..5d74365 100644 --- a/src/encoder/av1/api.rs +++ b/src/encoder/av1/api.rs @@ -12,7 +12,7 @@ impl AV1Encoder { /// This image can be used as a target for `ColorConverter::convert` to avoid /// an intermediate copy. pub fn input_image(&self) -> vk::Image { - self.input_image + self.slots[self.current_slot].input_image } /// Encode a frame from a GPU image. @@ -26,22 +26,60 @@ impl AV1Encoder { /// The encoder will panic at creation time if B-frames are enabled (b_frame_count > 0), /// as B-frame encoding is not yet supported. 
pub fn encode(&mut self, src_image: vk::Image) -> Result> { + let prev_packet = self.drain_current_slot()?; + let gop_position = self.gop.get_next_frame(); let display_order = self.input_frame_num; self.input_frame_num += 1; debug!( - "AV1 encode: frame {} from GPU image, type={:?}", - display_order, gop_position.frame_type + "AV1 encode: frame {} type={:?}, slot={}", + display_order, gop_position.frame_type, self.current_slot ); - // Upload from GPU image. self.upload_from_image(src_image)?; + self.encode_current_frame(&gop_position, display_order)?; - // Encode immediately. - let packet = self.encode_current_frame(&gop_position, display_order)?; + self.current_slot = (self.current_slot + 1) % self.slots.len(); + Ok(prev_packet.into_iter().collect()) + } - Ok(vec![packet]) + fn drain_current_slot(&mut self) -> Result> { + if !self.slots[self.current_slot].in_flight { + return Ok(None); + } + let bitstream = unsafe { + crate::encoder::resources::wait_and_read_bitstream( + self.context.device(), + self.slots[self.current_slot].encode_fence, + self.slots[self.current_slot].query_pool, + self.slots[self.current_slot].bitstream_buffer_ptr, + )? + }; + self.slots[self.current_slot].in_flight = false; + let meta = self.slots[self.current_slot] + .pending_metadata + .take() + .ok_or_else(|| { + PixelForgeError::CommandBuffer( + "Drained slot has bitstream but no metadata; encoder state corrupted" + .to_string(), + ) + })?; + // AV1 always prefixes a Temporal Delimiter OBU; key frames also need + // the sequence header captured at submit time. + let mut data = vec![0x12, 0x00]; + if let Some(header) = meta.header { + data.extend_from_slice(&header); + } + data.extend_from_slice(&bitstream); + Ok(Some(EncodedPacket { + data, + frame_type: meta.frame_type, + is_key_frame: meta.is_key_frame, + pts: meta.pts, + dts: meta.dts, + })) } /// Internal method to encode the current frame already uploaded to input_image. 
@@ -49,7 +87,7 @@ impl AV1Encoder { &mut self, gop_position: &GopPosition, display_order: u64, - ) -> Result { + ) -> Result<()> { let is_key_frame = gop_position.frame_type.is_idr() || gop_position.frame_type == GopFrameType::I; let is_reference = gop_position.is_reference; @@ -75,38 +113,41 @@ impl AV1Encoder { } } - let mut encoded_data = Vec::new(); - - // AV1 Temporal Delimiter OBU: type=2, has_size=1, size=0. - // Required as the first OBU in each temporal unit for conformant bitstreams. - // This enables ffmpeg's AV1 demuxer to detect frame boundaries in raw OBU streams. - encoded_data.extend_from_slice(&[0x12, 0x00]); - - // For key frames, prepend the AV1 Sequence Header OBU. - // This is required for AV1 decoders to initialize (equivalent to H.265 VPS/SPS/PPS). - if is_key_frame { + // For key frames, capture the AV1 Sequence Header OBU to be prepended + // at drain time. (The Temporal Delimiter prefix is added in + // drain_current_slot for every frame.) + let header = if is_key_frame { if self.header_data.is_none() { - let header = self.get_av1_sequence_header()?; + let h = self.get_av1_sequence_header()?; debug!( "AV1 sequence header ({} bytes): {:02X?}", - header.len(), - &header[..std::cmp::min(32, header.len())] + h.len(), + &h[..std::cmp::min(32, h.len())] ); - self.header_data = Some(header); + self.header_data = Some(h); } - if let Some(ref header) = self.header_data { - encoded_data.extend_from_slice(header); - } - } + self.header_data.clone() + } else { + None + }; - encoded_data.extend_from_slice(&self.encode_frame_internal(gop_position, is_key_frame)?); + // Submit the encode (no wait, no readback). Marks the slot in_flight. + self.encode_frame_internal(gop_position, is_key_frame)?; - // Save the order_hint used during encoding BEFORE incrementing. 
let encoded_order_hint = self.order_hint; + let dts = self.encode_frame_num; self.encode_frame_num += 1; self.frame_num += 1; self.order_hint = (self.order_hint + 1) & 0xFF; // 8-bit order hint + self.slots[self.current_slot].pending_metadata = Some(super::SlotPacketMetadata { + frame_type, + is_key_frame, + pts: display_order, + dts, + header, + }); + // Only KEY frames are stored as references. P frames all reference the KEY frame // and don't update any reference buffer, avoiding P→P which produces corrupt output // on NVIDIA AV1 encoders. @@ -131,19 +172,25 @@ impl AV1Encoder { // P frames reuse the same scratch DPB slot (current_dpb_slot stays unchanged // between P frames since it's always different from the KEY frame's slot). - Ok(EncodedPacket { - data: encoded_data, - frame_type, - is_key_frame, - pts: display_order, - dts: self.encode_frame_num - 1, - }) + Ok(()) } - /// Flush the encoder and get any remaining packets. + /// Flush the encoder and drain any remaining in-flight slots. pub fn flush(&mut self) -> Result> { - // No buffered frames in the current implementation. - Ok(Vec::new()) + let mut out = Vec::new(); + for offset in 0..self.slots.len() { + let idx = (self.current_slot + offset) % self.slots.len(); + if !self.slots[idx].in_flight { + continue; + } + let saved_current = self.current_slot; + self.current_slot = idx; + if let Some(packet) = self.drain_current_slot()? { + out.push(packet); + } + self.current_slot = saved_current; + } + Ok(out) } /// Request that the next frame be an IDR/key frame. @@ -214,17 +261,16 @@ impl AV1Encoder { /// containing the updated color configuration. The next encoded frame will /// be a key frame with the new sequence header prepended. pub fn set_color_description(&mut self, desc: ColorDescription) -> Result<()> { - // Wait for any in-flight encode to complete before modifying session params. - // Do NOT reset the fence here — submit_encode_and_read_bitstream() resets it - // before queue_submit. 
Leaving the fence signaled allows consecutive - // set_color_description() calls without deadlock. + // Wait for ALL slot fences before modifying session params. Do NOT reset + // here; submit_encode_only resets each fence on submit. + let fences: Vec = self.slots.iter().map(|s| s.encode_fence).collect(); unsafe { self.context .device() - .wait_for_fences(&[self.encode_fence], true, u64::MAX) + .wait_for_fences(&fences, true, u64::MAX) .map_err(|e| { PixelForgeError::Synchronization(format!( - "Failed to wait for encode fence: {:?}", + "Failed to wait for encode fences: {:?}", e )) })?; diff --git a/src/encoder/av1/encode.rs b/src/encoder/av1/encode.rs index 9ac5fa1..c409228 100644 --- a/src/encoder/av1/encode.rs +++ b/src/encoder/av1/encode.rs @@ -3,18 +3,21 @@ use super::AV1Encoder; use crate::encoder::gop::GopPosition; use crate::encoder::resources::{ prepare_encode_command_buffer, record_dpb_barriers, record_post_encode_dpb_barrier, - submit_encode_and_read_bitstream, + submit_encode_only, }; use crate::error::{PixelForgeError, Result}; use ash::vk; use tracing::debug; impl AV1Encoder { + /// Records and submits the encode commands for a single frame to the + /// current slot. Does NOT wait for completion — see encoder::h265 for the + /// pipelining contract. pub(super) fn encode_frame_internal( &mut self, _gop_position: &GopPosition, is_key_frame: bool, - ) -> Result> { + ) -> Result<()> { // All frames need a setup reference slot (DPB write) per Vulkan spec when maxDpbSlots > 0. 
let is_reference = true; @@ -52,8 +55,8 @@ impl AV1Encoder { unsafe { prepare_encode_command_buffer( self.context.device(), - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, )?; } @@ -62,7 +65,7 @@ impl AV1Encoder { unsafe { record_dpb_barriers( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, false, // AV1 does not use layered DPB self.current_dpb_slot, @@ -418,8 +421,10 @@ impl AV1Encoder { }; unsafe { - self.video_queue_fn - .cmd_begin_video_coding(self.encode_command_buffer, &begin_coding_info); + self.video_queue_fn.cmd_begin_video_coding( + self.slots[self.current_slot].encode_command_buffer, + &begin_coding_info, + ); } // Reset video coding state for the first frame. @@ -440,8 +445,10 @@ impl AV1Encoder { (&quality_level_info as *const vk::VideoEncodeQualityLevelInfoKHR).cast(); unsafe { - self.video_queue_fn - .cmd_control_video_coding(self.encode_command_buffer, &control_info); + self.video_queue_fn.cmd_control_video_coding( + self.slots[self.current_slot].encode_command_buffer, + &control_info, + ); } } @@ -450,13 +457,13 @@ impl AV1Encoder { .coded_offset(vk::Offset2D { x: 0, y: 0 }) .coded_extent(frame_extent) .base_array_layer(0) - .image_view_binding(self.input_image_view); + .image_view_binding(self.slots[self.current_slot].input_image_view); let mut encode_info = vk::VideoEncodeInfoKHR::default() .src_picture_resource(src_picture_resource) - .dst_buffer(self.bitstream_buffer) + .dst_buffer(self.slots[self.current_slot].bitstream_buffer) .dst_buffer_offset(0) - .dst_buffer_range(self.bitstream_buffer_size as u64); + .dst_buffer_range(self.slots[self.current_slot].bitstream_buffer_size as u64); if is_reference { encode_info = encode_info.setup_reference_slot(&setup_reference_slot); @@ -471,30 +478,34 @@ impl AV1Encoder { // Begin query to capture encode feedback (bitstream size, 
status). unsafe { self.context.device().cmd_begin_query( - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, 0, vk::QueryControlFlags::empty(), ); } unsafe { - self.video_encode_fn - .cmd_encode_video(self.encode_command_buffer, &encode_info); + self.video_encode_fn.cmd_encode_video( + self.slots[self.current_slot].encode_command_buffer, + &encode_info, + ); } // End query. unsafe { - self.context - .device() - .cmd_end_query(self.encode_command_buffer, self.query_pool, 0); + self.context.device().cmd_end_query( + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, + 0, + ); } // Add DPB synchronization barrier after encoding. unsafe { record_post_encode_dpb_barrier( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, false, // AV1 does not use layered DPB self.current_dpb_slot, @@ -504,15 +515,17 @@ impl AV1Encoder { // End video coding. let end_coding_info = vk::VideoEndCodingInfoKHR::default(); unsafe { - self.video_queue_fn - .cmd_end_video_coding(self.encode_command_buffer, &end_coding_info); + self.video_queue_fn.cmd_end_video_coding( + self.slots[self.current_slot].encode_command_buffer, + &end_coding_info, + ); } // End command buffer. 
unsafe { self.context .device() - .end_command_buffer(self.encode_command_buffer) + .end_command_buffer(self.slots[self.current_slot].encode_command_buffer) } .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; @@ -531,22 +544,24 @@ impl AV1Encoder { let gpu_start = std::time::Instant::now(); - let encoded_data = unsafe { - submit_encode_and_read_bitstream( + unsafe { + submit_encode_only( self.context.device(), - self.encode_command_buffer, - self.encode_fence, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].encode_fence, encode_queue, - self.query_pool, - self.bitstream_buffer_ptr, - )? - }; + None, + )?; + } - debug!("GPU encode took {:?}", gpu_start.elapsed()); + debug!("Submitted encode (no wait): {:?}", gpu_start.elapsed()); // Mark current DPB slot as active. self.dpb_slot_active[self.current_dpb_slot as usize] = true; - Ok(encoded_data) + // Mark slot as in-flight; bitstream is drained on next encode() call. + self.slots[self.current_slot].in_flight = true; + + Ok(()) } } diff --git a/src/encoder/av1/init.rs b/src/encoder/av1/init.rs index 1a59b30..58c94f5 100644 --- a/src/encoder/av1/init.rs +++ b/src/encoder/av1/init.rs @@ -2,9 +2,10 @@ use super::{AV1Encoder, MIN_BITSTREAM_BUFFER_SIZE, SUPERBLOCK_SIZE}; use crate::encoder::gop::GopStructure; use crate::encoder::resources::{ - allocate_session_memory, clear_input_image, create_bitstream_buffer, create_command_resources, - create_dpb_images, create_image, get_video_format, make_codec_name, map_bitstream_buffer, - query_supported_video_formats, ClearImageParams, + allocate_session_memory, clear_input_image, clear_rgb_input_image, create_bitstream_buffer, + create_command_resources, create_dpb_images, create_image, get_video_format, make_codec_name, + map_bitstream_buffer, query_supported_video_formats, rgb_conversion_model, + rgb_conversion_range, rgb_input_format, ClearImageParams, }; use crate::encoder::{ColorDescription, PixelFormat}; use 
crate::error::{PixelForgeError, Result}; @@ -46,6 +47,15 @@ impl AV1Encoder { let video_encode_fn = ash::khr::video_encode_queue::Device::load(context.instance(), context.device()); + if config.use_rgb_input && !context.supports_rgb_direct_encode() { + return Err(PixelForgeError::NoSuitableDevice( + "EncodeConfig::use_rgb_input requires VK_VALVE_video_encode_rgb_conversion, \ + which this device does not support." + .to_string(), + )); + } + let use_rgb_input = config.use_rgb_input; + // Get chroma subsampling from pixel format. let chroma_subsampling: vk::VideoChromaSubsamplingFlagsKHR = config.pixel_format.into(); let luma_bit_depth: vk::VideoComponentBitDepthFlagsKHR = config.bit_depth.into(); @@ -62,8 +72,18 @@ impl AV1Encoder { // Preferred input format based on pixel format and bit depth. let preferred_src_format = get_video_format(config.pixel_format, config.bit_depth); - // Create AV1 encode profile. + // Create AV1 encode profile. When RGB-direct is enabled we chain + // VkVideoEncodeProfileRgbConversionInfoVALVE inside av1_profile_info + // so all downstream uses (capability query, session create, image + // creation, query pool) see a profile that matches. 
+ let mut rgb_conv_profile_info = vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut av1_profile_info = vk::VideoEncodeAV1ProfileInfoKHR::default().std_profile(profile); + if use_rgb_input { + av1_profile_info.p_next = (&mut rgb_conv_profile_info + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_info = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_AV1) @@ -182,7 +202,17 @@ impl AV1Encoder { info!("Supported SRC formats: {:?}", supported_src_formats); info!("Supported DPB formats: {:?}", supported_dpb_formats); - let picture_format = if supported_src_formats.contains(&preferred_src_format) { + let picture_format = if use_rgb_input { + let rgb_fmt = rgb_input_format(config.bit_depth); + if !supported_src_formats.contains(&rgb_fmt) { + return Err(PixelForgeError::NoSuitableDevice(format!( + "RGB-direct encode requested but driver does not advertise {:?} as a \ + VIDEO_ENCODE_SRC_KHR format for this AV1 profile. 
Supported: {:?}", + rgb_fmt, supported_src_formats + ))); + } + rgb_fmt + } else if supported_src_formats.contains(&preferred_src_format) { preferred_src_format } else { return Err(PixelForgeError::NoSuitableDevice(format!( @@ -191,11 +221,19 @@ impl AV1Encoder { ))); }; - let reference_picture_format = supported_dpb_formats - .iter() - .copied() - .find(|f| *f == picture_format) - .unwrap_or(supported_dpb_formats[0]); + let reference_picture_format = if use_rgb_input { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == preferred_src_format) + .unwrap_or(supported_dpb_formats[0]) + } else { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == picture_format) + .unwrap_or(supported_dpb_formats[0]) + }; debug!( "Selected formats: picture={:?}, reference={:?}", @@ -244,7 +282,18 @@ impl AV1Encoder { max_active_reference_pictures_supported ); - let session_create_info = vk::VideoSessionCreateInfoKHR::default() + let color_desc = config + .color_description + .unwrap_or(ColorDescription::bt709()); + + let mut session_rgb_conv_info = + vk::VideoEncodeSessionRgbConversionCreateInfoVALVE::default() + .rgb_model(rgb_conversion_model(&color_desc)) + .rgb_range(rgb_conversion_range(&color_desc)) + .x_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::COSITED_EVEN) + .y_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::MIDPOINT); + + let mut session_create_info = vk::VideoSessionCreateInfoKHR::default() .queue_family_index(encode_queue_family) .video_profile(&profile_info) .picture_format(picture_format) @@ -256,6 +305,11 @@ impl AV1Encoder { .max_dpb_slots(requested_dpb_slots as u32) .max_active_reference_pictures(target_active_refs as u32) .std_header_version(&std_header_version); + if use_rgb_input { + session_create_info.p_next = (&mut session_rgb_conv_info + as *mut vk::VideoEncodeSessionRgbConversionCreateInfoVALVE) + .cast(); + } let mut session = vk::VideoSessionKHR::null(); let result = unsafe { @@ -276,22 +330,7 @@ impl AV1Encoder { 
// Allocate session memory. let session_memory = allocate_session_memory(&context, session, &video_queue_fn)?; - let color_desc = config - .color_description - .unwrap_or(ColorDescription::bt709()); - - // Create input image. - let (input_image, input_image_memory, input_image_view) = create_image( - &context, - aligned_width, - aligned_height, - picture_format, - false, // is_dpb - &profile_info, - )?; - let input_image_layout = vk::ImageLayout::UNDEFINED; - - // Create DPB images. + // Create DPB images (shared across slots). let (dpb_images, dpb_image_memories, dpb_image_views) = create_dpb_images( &context, aligned_width, @@ -301,60 +340,122 @@ impl AV1Encoder { &profile_info, false, )?; - // Create bitstream buffer. + let bitstream_buffer_size = MIN_BITSTREAM_BUFFER_SIZE.max(width as usize * height as usize); - let (bitstream_buffer, bitstream_buffer_memory) = - create_bitstream_buffer(&context, bitstream_buffer_size, &profile_info)?; - // Map bitstream buffer persistently. - let bitstream_buffer_ptr = - map_bitstream_buffer(&context, bitstream_buffer_memory, bitstream_buffer_size)?; - // Create command resources. + + // Shared command pool / upload resources. let upload_queue_family = context.transfer_queue_family(); let cmd_resources = create_command_resources(&context, encode_queue_family, upload_queue_family)?; let command_pool = cmd_resources.command_pool; let upload_command_buffer = cmd_resources.upload_command_buffer; let upload_fence = cmd_resources.upload_fence; - let encode_command_buffer = cmd_resources.encode_command_buffer; - let encode_fence = cmd_resources.encode_fence; - // Clear the input image so padding between user dimensions and the - // aligned coded extent is zero-initialized. 
- clear_input_image( - &context, - &ClearImageParams { - command_buffer: upload_command_buffer, - fence: upload_fence, - queue: context.transfer_queue(), - image: input_image, - width: aligned_width, - height: aligned_height, - pixel_format: config.pixel_format, - bit_depth: config.bit_depth, - }, - )?; - // Create query pool for bitstream size queries. - // Need 1 query to capture bitstream offset and size. - // Need to provide profile info and feedback flags in pNext chain. - let mut query_feedback_info = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() - .encode_feedback_flags( - vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET - | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, - ); - query_feedback_info.p_next = (&profile_info as *const vk::VideoProfileInfoKHR).cast(); - - let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() - .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) - .query_count(1); - query_pool_create_info.p_next = - (&query_feedback_info as *const vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR).cast(); - - let query_pool = unsafe { - context - .device() - .create_query_pool(&query_pool_create_info, None) - .map_err(|e| PixelForgeError::QueryPool(e.to_string()))? + + // Allocate ENCODE_PIPELINE_DEPTH-1 additional encode command buffers. + let extra_buffers_needed = super::ENCODE_PIPELINE_DEPTH.saturating_sub(1) as u32; + let extra_encode_buffers: Vec = if extra_buffers_needed > 0 { + let alloc_info = vk::CommandBufferAllocateInfo::default() + .command_pool(command_pool) + .level(vk::CommandBufferLevel::PRIMARY) + .command_buffer_count(extra_buffers_needed); + unsafe { context.device().allocate_command_buffers(&alloc_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + } else { + Vec::new() }; + // Build per-slot resources. 
+ let mut slots: Vec = Vec::with_capacity(super::ENCODE_PIPELINE_DEPTH); + for slot_idx in 0..super::ENCODE_PIPELINE_DEPTH { + let (input_image, input_image_memory, input_image_view) = create_image( + &context, + aligned_width, + aligned_height, + picture_format, + false, + &profile_info, + )?; + + let (bitstream_buffer, bitstream_buffer_memory) = + create_bitstream_buffer(&context, bitstream_buffer_size, &profile_info)?; + let bitstream_buffer_ptr = + map_bitstream_buffer(&context, bitstream_buffer_memory, bitstream_buffer_size)?; + + if use_rgb_input { + clear_rgb_input_image( + &context, + upload_command_buffer, + upload_fence, + context.transfer_queue(), + input_image, + )?; + } else { + clear_input_image( + &context, + &ClearImageParams { + command_buffer: upload_command_buffer, + fence: upload_fence, + queue: context.transfer_queue(), + image: input_image, + width: aligned_width, + height: aligned_height, + pixel_format: config.pixel_format, + bit_depth: config.bit_depth, + }, + )?; + } + + let encode_command_buffer = if slot_idx == 0 { + cmd_resources.encode_command_buffer + } else { + extra_encode_buffers[slot_idx - 1] + }; + + let encode_fence = if slot_idx == 0 { + cmd_resources.encode_fence + } else { + let signaled = vk::FenceCreateInfo::default().flags(vk::FenceCreateFlags::SIGNALED); + unsafe { context.device().create_fence(&signaled, None) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + }; + + // Per-slot single-query pool. 
+ let mut query_feedback_info = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() + .encode_feedback_flags( + vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET + | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, + ); + query_feedback_info.p_next = (&profile_info as *const vk::VideoProfileInfoKHR).cast(); + let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() + .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) + .query_count(1); + query_pool_create_info.p_next = (&query_feedback_info + as *const vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) + .cast(); + let query_pool = unsafe { + context + .device() + .create_query_pool(&query_pool_create_info, None) + .map_err(|e| PixelForgeError::QueryPool(e.to_string()))? + }; + + slots.push(super::EncodeSlot { + input_image, + input_image_memory, + input_image_view, + input_image_layout: vk::ImageLayout::UNDEFINED, + bitstream_buffer, + bitstream_buffer_memory, + bitstream_buffer_size, + bitstream_buffer_ptr, + encode_command_buffer, + encode_fence, + query_pool, + in_flight: false, + pending_metadata: None, + }); + } + // Initialize GOP structure. 
let gop = GopStructure::new(config.gop_size, config.b_frame_count, config.gop_size); @@ -371,26 +472,17 @@ impl AV1Encoder { encode_frame_num: 0, frame_num: 0, order_hint: 0, - input_image, - input_image_memory, - input_image_view, - input_image_layout, + slots, + current_slot: 0, dpb_images, dpb_image_memories, dpb_image_views, dpb_slot_count: requested_dpb_slots, dpb_slot_active: vec![false; requested_dpb_slots], - bitstream_buffer, - bitstream_buffer_memory, - bitstream_buffer_size, - bitstream_buffer_ptr, command_pool, upload_command_pool: cmd_resources.upload_command_pool, upload_command_buffer, upload_fence, - encode_command_buffer, - encode_fence, - query_pool, header_data: None, current_dpb_slot: 0, references: Vec::new(), diff --git a/src/encoder/av1/mod.rs b/src/encoder/av1/mod.rs index 2edcde4..1bd0fea 100644 --- a/src/encoder/av1/mod.rs +++ b/src/encoder/av1/mod.rs @@ -23,6 +23,41 @@ const MIN_BITSTREAM_BUFFER_SIZE: usize = 2 * 1024 * 1024; /// AV1 superblock size in pixels (64x64, matching use_128x128_superblock=0 in the sequence header). pub const SUPERBLOCK_SIZE: u32 = 64; +/// Number of in-flight encode slots. Depth=2 lets frame N+1 begin encoding +/// while frame N is still on the encode hardware. +pub(crate) const ENCODE_PIPELINE_DEPTH: usize = 2; + +/// One slot's worth of per-frame encode resources. Mirrors encoder::h265::EncodeSlot. 
+pub(crate) struct EncodeSlot { + pub input_image: vk::Image, + pub input_image_memory: vk::DeviceMemory, + pub input_image_view: vk::ImageView, + pub input_image_layout: vk::ImageLayout, + + pub bitstream_buffer: vk::Buffer, + pub bitstream_buffer_memory: vk::DeviceMemory, + pub bitstream_buffer_size: usize, + pub bitstream_buffer_ptr: *mut u8, + + pub encode_command_buffer: vk::CommandBuffer, + pub encode_fence: vk::Fence, + pub query_pool: vk::QueryPool, + + pub in_flight: bool, + pub pending_metadata: Option, +} + +/// Metadata stashed at submit-time, returned with the bitstream when this +/// slot's encode is drained. +pub(crate) struct SlotPacketMetadata { + pub frame_type: crate::encoder::FrameType, + pub is_key_frame: bool, + pub pts: u64, + pub dts: u64, + /// AV1 sequence header OBU to prepend (Some only for IDR/key frames). + pub header: Option>, +} + #[derive(Clone, Copy, Debug)] pub(crate) struct ReferenceInfo { pub dpb_slot: u8, @@ -49,12 +84,10 @@ pub struct AV1Encoder { frame_num: u32, order_hint: u32, - // Resources - input_image: vk::Image, - input_image_memory: vk::DeviceMemory, - input_image_view: vk::ImageView, - /// Current Vulkan image layout of `input_image` (tracked to avoid UB when transitioning). - input_image_layout: vk::ImageLayout, + /// Per-frame encode slots. See encoder::h265 for invariants. + pub(crate) slots: Vec, + pub(crate) current_slot: usize, + /// DPB images for reference frames. dpb_images: Vec, dpb_image_memories: Vec, @@ -63,21 +96,14 @@ pub struct AV1Encoder { dpb_slot_count: usize, /// Whether each DPB slot has been activated (written to at least once). dpb_slot_active: Vec, - bitstream_buffer: vk::Buffer, - bitstream_buffer_memory: vk::DeviceMemory, - /// Size of the allocated bitstream buffer in bytes. - bitstream_buffer_size: usize, - /// Persistently mapped pointer to the bitstream buffer (avoids per-frame map/unmap). - bitstream_buffer_ptr: *mut u8, - - // Command resources. 
+ + // Command pool (encode command buffers per slot allocated from this pool). command_pool: vk::CommandPool, upload_command_pool: vk::CommandPool, upload_command_buffer: vk::CommandBuffer, upload_fence: vk::Fence, - encode_command_buffer: vk::CommandBuffer, - encode_fence: vk::Fence, - query_pool: vk::QueryPool, + + // NOTE(review): stray comment — no semaphore field follows in this struct; remove it or add the field it describes. // Cached AV1 sequence header OBU (retrieved from session parameters). header_data: Option>, @@ -97,7 +123,8 @@ impl AV1Encoder { /// encoder's configured pixel format and dimensions, and should be in /// GENERAL layout. fn upload_from_image(&mut self, src_image: vk::Image) -> Result<()> { - if src_image == self.input_image { + let slot = &mut self.slots[self.current_slot]; + if src_image == slot.input_image { debug!("Source image is the encoder's input image, skipping upload copy"); return Ok(()); } @@ -106,18 +133,17 @@ upload_command_buffer: self.upload_command_buffer, upload_fence: self.upload_fence, src_image, - dst_image: self.input_image, + dst_image: slot.input_image, width: self.config.dimensions.width, height: self.config.dimensions.height, pixel_format: self.config.pixel_format, - input_image_layout: self.input_image_layout, + input_image_layout: slot.input_image_layout, upload_queue: self.context.transfer_queue(), }; upload_image_to_input(&self.context, ¶ms)?; - // Update tracked layout. 
- self.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; + slot.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; Ok(()) } @@ -130,48 +156,33 @@ unsafe impl Send for AV1Encoder {} impl Drop for AV1Encoder { fn drop(&mut self) { unsafe { - let _ = self.context.device().device_wait_idle(); - self.context - .device() - .destroy_query_pool(self.query_pool, None); - self.context.device().destroy_fence(self.upload_fence, None); - self.context.device().destroy_fence(self.encode_fence, None); - self.context - .device() - .destroy_command_pool(self.command_pool, None); + let device = self.context.device(); + let _ = device.device_wait_idle(); + + for slot in &mut self.slots { + if !slot.bitstream_buffer_ptr.is_null() { + device.unmap_memory(slot.bitstream_buffer_memory); + slot.bitstream_buffer_ptr = std::ptr::null_mut(); + } + device.destroy_query_pool(slot.query_pool, None); + device.destroy_fence(slot.encode_fence, None); + device.destroy_buffer(slot.bitstream_buffer, None); + device.free_memory(slot.bitstream_buffer_memory, None); + device.destroy_image_view(slot.input_image_view, None); + device.destroy_image(slot.input_image, None); + device.free_memory(slot.input_image_memory, None); + } + + device.destroy_fence(self.upload_fence, None); + device.destroy_command_pool(self.command_pool, None); if self.upload_command_pool != self.command_pool { - self.context - .device() - .destroy_command_pool(self.upload_command_pool, None); + device.destroy_command_pool(self.upload_command_pool, None); } - self.context - .device() - .destroy_buffer(self.bitstream_buffer, None); - // Unmap the persistently mapped bitstream buffer before freeing memory. 
- self.context - .device() - .unmap_memory(self.bitstream_buffer_memory); - self.context - .device() - .free_memory(self.bitstream_buffer_memory, None); - self.context - .device() - .destroy_image_view(self.input_image_view, None); - self.context.device().destroy_image(self.input_image, None); - self.context - .device() - .free_memory(self.input_image_memory, None); for i in 0..self.dpb_images.len() { - self.context - .device() - .destroy_image_view(self.dpb_image_views[i], None); - self.context - .device() - .destroy_image(self.dpb_images[i], None); - self.context - .device() - .free_memory(self.dpb_image_memories[i], None); + device.destroy_image_view(self.dpb_image_views[i], None); + device.destroy_image(self.dpb_images[i], None); + device.free_memory(self.dpb_image_memories[i], None); } if self.session_params != vk::VideoSessionParametersKHR::null() { @@ -181,7 +192,7 @@ impl Drop for AV1Encoder { self.video_queue_fn .destroy_video_session(self.session, None); for mem in &self.session_memory { - self.context.device().free_memory(*mem, None); + device.free_memory(*mem, None); } } } diff --git a/src/encoder/h264/api.rs b/src/encoder/h264/api.rs index f41a080..71e6299 100644 --- a/src/encoder/h264/api.rs +++ b/src/encoder/h264/api.rs @@ -14,36 +14,71 @@ impl H264Encoder { /// This image can be used as a target for `ColorConverter::convert` to avoid /// an intermediate copy. pub fn input_image(&self) -> vk::Image { - self.input_image + self.slots[self.current_slot].input_image } - /// Encode a frame from a GPU image. + /// Encode a frame from a GPU image (depth-2 pipelined). /// - /// This accepts a source NV12 image on the GPU and encodes it directly without. - /// any CPU-side data copies. The source image must be in NV12 format with the - /// same dimensions as the encoder configuration, and should be in GENERAL layout. 
+ /// Submits the frame to the encode queue without waiting, drains the + /// previous in-flight frame from the slot we are about to overwrite, + /// and returns *that* drained frame's packet. The first call returns + /// an empty Vec (pipeline still filling); subsequent calls return one + /// packet per call. Use `flush()` to drain remaining slots at end of stream. /// /// # Panics /// - /// The encoder will panic at creation time if B-frames are enabled (b_frame_count > 0), - /// as B-frame encoding is not yet supported. + /// The encoder will panic at creation time if B-frames are enabled + /// (b_frame_count > 0), as B-frame encoding is not yet supported. pub fn encode(&mut self, src_image: vk::Image) -> Result> { + let prev_packet = self.drain_current_slot()?; + let gop_position = self.gop.get_next_frame(); let display_order = self.input_frame_num; self.input_frame_num += 1; debug!( - "Encoding frame {} from GPU image: type={:?}, poc={}", - display_order, gop_position.frame_type, gop_position.pic_order_cnt + "Encoding frame {} from GPU image: type={:?}, poc={}, slot={}", + display_order, gop_position.frame_type, gop_position.pic_order_cnt, self.current_slot ); - // Upload from GPU image. self.upload_from_image(src_image)?; + self.encode_current_frame(&gop_position, display_order)?; - // Encode immediately. - let packet = self.encode_current_frame(&gop_position, display_order)?; + self.current_slot = (self.current_slot + 1) % self.slots.len(); + Ok(prev_packet.into_iter().collect()) + } - Ok(vec![packet]) + fn drain_current_slot(&mut self) -> Result> { + if !self.slots[self.current_slot].in_flight { + return Ok(None); + } + let bitstream = unsafe { + crate::encoder::resources::wait_and_read_bitstream( + self.context.device(), + self.slots[self.current_slot].encode_fence, + self.slots[self.current_slot].query_pool, + self.slots[self.current_slot].bitstream_buffer_ptr, + )? 
+ }; + self.slots[self.current_slot].in_flight = false; + let meta = self.slots[self.current_slot] + .pending_metadata + .take() + .ok_or_else(|| { + PixelForgeError::CommandBuffer( + "Drained slot has bitstream but no metadata; encoder state corrupted" + .to_string(), + ) + })?; + let mut data = meta.header.unwrap_or_default(); + data.extend_from_slice(&bitstream); + Ok(Some(EncodedPacket { + data, + frame_type: meta.frame_type, + is_key_frame: meta.is_key_frame, + pts: meta.pts, + dts: meta.dts, + })) } /// Internal method to encode the current frame already uploaded to input_image. @@ -51,7 +86,7 @@ impl H264Encoder { &mut self, gop_position: &GopPosition, display_order: u64, - ) -> Result { + ) -> Result<()> { let is_idr = gop_position.frame_type.is_idr(); let is_reference = gop_position.is_reference; let is_b_frame = gop_position.frame_type == GopFrameType::B; @@ -86,24 +121,33 @@ impl H264Encoder { let pic_order_cnt = gop_position.pic_order_cnt; let frame_num = self.frame_num_syntax; - let mut encoded_data = Vec::new(); - if is_idr { - encoded_data.extend_from_slice(&self.get_h264_header()?); + // For IDR frames, capture SPS/PPS header to be prepended at drain time. + let header = if is_idr { + let h = self.get_h264_header()?; self.sps_written = true; - } + Some(h) + } else { + None + }; - encoded_data.extend_from_slice(&self.encode_frame_internal( - gop_position, - frame_num, - pic_order_cnt, - is_idr, - )?); + // Submit the encode (no wait, no readback). Marks the slot in_flight. + self.encode_frame_internal(gop_position, frame_num, pic_order_cnt, is_idr)?; + let dts = self.encode_frame_num; self.encode_frame_num += 1; if is_reference && !is_b_frame { self.frame_num_syntax = (self.frame_num_syntax + 1) % 256; } + // Stash metadata so drain_current_slot() can build the packet later. 
+ self.slots[self.current_slot].pending_metadata = Some(super::SlotPacketMetadata { + frame_type, + is_key_frame: is_idr, + pts: display_order, + dts, + header, + }); + if is_reference { let pic_type = if is_idr { PictureType::Idr @@ -146,19 +190,25 @@ impl H264Encoder { } } - Ok(EncodedPacket { - data: encoded_data, - frame_type, - is_key_frame: is_idr, - pts: display_order, - dts: self.encode_frame_num - 1, - }) + Ok(()) } - /// Flush the encoder and get any remaining packets. + /// Flush the encoder and drain any remaining in-flight slots. pub fn flush(&mut self) -> Result> { - // No buffered frames in the current implementation. - Ok(Vec::new()) + let mut out = Vec::new(); + for offset in 0..self.slots.len() { + let idx = (self.current_slot + offset) % self.slots.len(); + if !self.slots[idx].in_flight { + continue; + } + let saved_current = self.current_slot; + self.current_slot = idx; + if let Some(packet) = self.drain_current_slot()? { + out.push(packet); + } + self.current_slot = saved_current; + } + Ok(out) } /// Request that the next frame be an IDR frame. @@ -244,17 +294,17 @@ impl H264Encoder { /// updated VUI color primaries, transfer characteristics, and matrix coefficients. /// The next encoded frame will be an IDR with the new SPS/PPS prepended. pub fn set_color_description(&mut self, desc: ColorDescription) -> Result<()> { - // Wait for any in-flight encode to complete before modifying session params. - // Do NOT reset the fence here — submit_encode_and_read_bitstream() resets it - // before queue_submit. Leaving the fence signaled allows consecutive - // set_color_description() calls without deadlock. + // Wait for ALL slot fences before modifying session params. Do NOT reset + // here; submit_encode_only resets each fence on submit so leaving them + // signaled lets consecutive set_color_description() calls work safely. 
+ let fences: Vec = self.slots.iter().map(|s| s.encode_fence).collect(); unsafe { self.context .device() - .wait_for_fences(&[self.encode_fence], true, u64::MAX) + .wait_for_fences(&fences, true, u64::MAX) .map_err(|e| { PixelForgeError::Synchronization(format!( - "Failed to wait for encode fence: {:?}", + "Failed to wait for encode fences: {:?}", e )) })?; diff --git a/src/encoder/h264/encode.rs b/src/encoder/h264/encode.rs index dd0d85f..fc5e153 100644 --- a/src/encoder/h264/encode.rs +++ b/src/encoder/h264/encode.rs @@ -2,7 +2,7 @@ use super::H264Encoder; use crate::encoder::gop::{GopFrameType, GopPosition}; use crate::encoder::resources::{ - prepare_encode_command_buffer, record_dpb_barriers, submit_encode_and_read_bitstream, + prepare_encode_command_buffer, record_dpb_barriers, submit_encode_only, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::error::{PixelForgeError, Result}; @@ -10,13 +10,18 @@ use ash::vk; use tracing::debug; impl H264Encoder { + /// Records and submits the encode commands for a single frame to the + /// current slot. Does NOT wait for completion or read the bitstream — + /// the caller drains the slot's prior in-flight encode before calling + /// this, and the slot is marked in_flight so a later call can drain the + /// submission made here. 
pub(super) fn encode_frame_internal( &mut self, gop_position: &GopPosition, frame_num: u32, pic_order_cnt: i32, is_idr: bool, - ) -> Result> { + ) -> Result<()> { let is_b_frame = gop_position.frame_type == GopFrameType::B; let is_reference = gop_position.is_reference; @@ -55,8 +60,8 @@ impl H264Encoder { unsafe { prepare_encode_command_buffer( self.context.device(), - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, )?; } @@ -65,7 +70,7 @@ impl H264Encoder { unsafe { record_dpb_barriers( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, self.use_layered_dpb, self.current_dpb_slot, @@ -263,7 +268,7 @@ impl H264Encoder { height: self.aligned_height, }) .base_array_layer(0) - .image_view_binding(self.input_image_view); + .image_view_binding(self.slots[self.current_slot].input_image_view); // Set up DPB slot for reconstructed picture (setup slot) let setup_picture_resource = vk::VideoPictureResourceInfoKHR::default() @@ -431,7 +436,7 @@ impl H264Encoder { } let mut encode_info = vk::VideoEncodeInfoKHR::default() - .dst_buffer(self.bitstream_buffer) + .dst_buffer(self.slots[self.current_slot].bitstream_buffer) .dst_buffer_offset(0) .dst_buffer_range(MIN_BITSTREAM_BUFFER_SIZE as vk::DeviceSize) .src_picture_resource(src_picture_resource) @@ -537,7 +542,7 @@ impl H264Encoder { unsafe { (self.video_queue_fn.fp().cmd_begin_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &begin_info, ); } @@ -561,7 +566,7 @@ impl H264Encoder { unsafe { (self.video_queue_fn.fp().cmd_control_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &control_info, ); } @@ -570,8 +575,8 @@ impl H264Encoder { // Begin query. 
unsafe { self.context.device().cmd_begin_query( - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, 0, vk::QueryControlFlags::empty(), ); @@ -580,23 +585,25 @@ impl H264Encoder { // Encode unsafe { (self.video_encode_fn.fp().cmd_encode_video_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &encode_info, ); } // End query. unsafe { - self.context - .device() - .cmd_end_query(self.encode_command_buffer, self.query_pool, 0); + self.context.device().cmd_end_query( + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, + 0, + ); } // End video coding. let end_info = vk::VideoEndCodingInfoKHR::default(); unsafe { (self.video_queue_fn.fp().cmd_end_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &end_info, ); } @@ -605,7 +612,7 @@ impl H264Encoder { unsafe { self.context .device() - .end_command_buffer(self.encode_command_buffer) + .end_command_buffer(self.slots[self.current_slot].encode_command_buffer) } .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; @@ -614,20 +621,23 @@ impl H264Encoder { PixelForgeError::NoSuitableDevice("No video encode queue available".to_string()) })?; - let encoded_data = unsafe { - submit_encode_and_read_bitstream( + unsafe { + submit_encode_only( self.context.device(), - self.encode_command_buffer, - self.encode_fence, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].encode_fence, encode_queue, - self.query_pool, - self.bitstream_buffer_ptr, - )? - }; + None, + )?; + } // Mark DPB slot as active. self.dpb_slot_active[self.current_dpb_slot as usize] = true; - Ok(encoded_data) + // Mark slot as in-flight; bitstream is drained on the next encode() + // call that targets this slot. 
+ self.slots[self.current_slot].in_flight = true; + + Ok(()) } } diff --git a/src/encoder/h264/init.rs b/src/encoder/h264/init.rs index 1ccb373..82314a3 100644 --- a/src/encoder/h264/init.rs +++ b/src/encoder/h264/init.rs @@ -3,9 +3,10 @@ use super::{H264Encoder, MB_SIZE}; use crate::encoder::dpb::{DecodedPictureBuffer, DecodedPictureBufferTrait, DpbConfig}; use crate::encoder::gop::GopStructure; use crate::encoder::resources::{ - align_up, allocate_session_memory, clear_input_image, create_bitstream_buffer, - create_command_resources, create_dpb_images, create_image, get_video_format, lcm, - map_bitstream_buffer, query_supported_video_formats, ClearImageParams, + align_up, allocate_session_memory, clear_input_image, clear_rgb_input_image, + create_bitstream_buffer, create_command_resources, create_dpb_images, create_image, + get_video_format, lcm, map_bitstream_buffer, query_supported_video_formats, + rgb_conversion_model, rgb_conversion_range, rgb_input_format, ClearImageParams, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::encoder::ColorDescription; @@ -42,6 +43,15 @@ impl H264Encoder { let video_encode_fn = ash::khr::video_encode_queue::Device::load(context.instance(), context.device()); + if config.use_rgb_input && !context.supports_rgb_direct_encode() { + return Err(PixelForgeError::NoSuitableDevice( + "EncodeConfig::use_rgb_input requires VK_VALVE_video_encode_rgb_conversion, \ + which this device does not support." + .to_string(), + )); + } + let use_rgb_input = config.use_rgb_input; + // Get chroma subsampling from pixel format via `From` impl. let chroma_subsampling: vk::VideoChromaSubsamplingFlagsKHR = config.pixel_format.into(); @@ -62,9 +72,19 @@ impl H264Encoder { // Note: the DPB format may differ and must be queried separately. let preferred_src_format = get_video_format(config.pixel_format, config.bit_depth); - // Create H.264 encode profile. + // Create H.264 encode profile. 
When RGB-direct is enabled we chain + // VkVideoEncodeProfileRgbConversionInfoVALVE on every profile we + // build (capability query, image creation, query pool) — profiles + // must match across all of those. + let mut rgb_conv_profile_info = vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h264_profile_info = vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h264_profile_info.p_next = (&mut rgb_conv_profile_info + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_info = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) @@ -249,8 +269,21 @@ impl H264Encoder { } info!("Supported DPB formats: {:?}", supported_dpb_formats); - // For input uploads, we currently require the preferred 2-plane formats. - let picture_format = if supported_src_formats.contains(&preferred_src_format) { + // For input uploads, we currently require the preferred 2-plane + // formats — unless RGB-direct is enabled, in which case the SRC + // image must be one of the RGB formats VCN5 accepts and the DPB + // stays YUV. + let picture_format = if use_rgb_input { + let rgb_fmt = rgb_input_format(config.bit_depth); + if !supported_src_formats.contains(&rgb_fmt) { + return Err(PixelForgeError::NoSuitableDevice(format!( + "RGB-direct encode requested but driver does not advertise {:?} as a \ + VIDEO_ENCODE_SRC_KHR format for this profile. Supported: {:?}", + rgb_fmt, supported_src_formats + ))); + } + rgb_fmt + } else if supported_src_formats.contains(&preferred_src_format) { preferred_src_format } else { return Err(PixelForgeError::NoSuitableDevice(format!( @@ -259,12 +292,22 @@ impl H264Encoder { ))); }; - // DPB format can differ from the input format; prefer matching when possible. 
- let reference_picture_format = supported_dpb_formats - .iter() - .copied() - .find(|f| *f == picture_format) - .unwrap_or(supported_dpb_formats[0]); + // DPB format can differ from the input format; in RGB-direct mode + // DPB stays YUV (matching the encoder's internal pixel_format/ + // bit_depth), otherwise prefer matching the picture_format. + let reference_picture_format = if use_rgb_input { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == preferred_src_format) + .unwrap_or(supported_dpb_formats[0]) + } else { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == picture_format) + .unwrap_or(supported_dpb_formats[0]) + }; debug!( "Selected Vulkan Video formats: picture_format={:?}, reference_picture_format={:?} (preferred_src={:?})", @@ -333,7 +376,18 @@ impl H264Encoder { PixelForgeError::NoSuitableDevice("No video encode queue family available".to_string()) })?; - let session_create_info = vk::VideoSessionCreateInfoKHR::default() + let color_desc = config + .color_description + .unwrap_or(ColorDescription::bt709()); + + let mut session_rgb_conv_info = + vk::VideoEncodeSessionRgbConversionCreateInfoVALVE::default() + .rgb_model(rgb_conversion_model(&color_desc)) + .rgb_range(rgb_conversion_range(&color_desc)) + .x_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::COSITED_EVEN) + .y_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::MIDPOINT); + + let mut session_create_info = vk::VideoSessionCreateInfoKHR::default() .queue_family_index(encode_queue_family) .flags(vk::VideoSessionCreateFlagsKHR::empty()) .video_profile(&profile_info) @@ -346,6 +400,11 @@ impl H264Encoder { .max_dpb_slots(dpb_slot_count as u32) .max_active_reference_pictures(max_active_reference_pictures as u32) .std_header_version(&std_header_version); + if use_rgb_input { + session_create_info.p_next = (&mut session_rgb_conv_info + as *mut vk::VideoEncodeSessionRgbConversionCreateInfoVALVE) + .cast(); + } let mut session = vk::VideoSessionKHR::null(); let 
result = unsafe { @@ -403,13 +462,17 @@ impl H264Encoder { ))); } - let color_desc = config - .color_description - .unwrap_or(ColorDescription::bt709()); - - // Create profile info for images/buffers. + // Create profile info for images/buffers (shared across slots). + let mut rgb_conv_profile_for_resources = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h264_profile_for_resources = vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h264_profile_for_resources.p_next = (&mut rgb_conv_profile_for_resources + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_for_resources = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) .chroma_subsampling(chroma_subsampling) @@ -418,16 +481,6 @@ impl H264Encoder { profile_for_resources.p_next = (&mut h264_profile_for_resources as *mut vk::VideoEncodeH264ProfileInfoKHR).cast(); - // Create input image. - let (input_image, input_image_memory, input_image_view) = create_image( - &context, - aligned_width, - aligned_height, - picture_format, - false, - &profile_for_resources, - )?; - // Determine DPB mode: use layered DPB when the driver does not advertise // support for separate reference images (required for AMD RADV). let supports_separate_dpb = capabilities @@ -438,7 +491,7 @@ impl H264Encoder { info!("Using layered DPB (driver does not support separate reference images)"); } - // Create DPB images. + // Create DPB images (shared across slots). let (dpb_images, dpb_image_memories, dpb_image_views) = create_dpb_images( &context, aligned_width, @@ -449,77 +502,140 @@ impl H264Encoder { use_layered_dpb, )?; - // Create bitstream buffer. 
- let (bitstream_buffer, bitstream_buffer_memory) = - create_bitstream_buffer(&context, MIN_BITSTREAM_BUFFER_SIZE, &profile_for_resources)?; - - // Persistently map the bitstream buffer to avoid per-frame map/unmap overhead. - let bitstream_buffer_ptr = - map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; - - // Create command pool, buffers, and fences. - // Use the transfer queue family for upload commands when the encode queue - // doesn't support transfer operations (AMD RADV). + // Create command pool and shared upload resources. Encode command + // buffers (one per slot) are allocated below from `command_pool`. let upload_queue_family = context.transfer_queue_family(); let cmd_resources = create_command_resources(&context, encode_queue_family, upload_queue_family)?; let command_pool = cmd_resources.command_pool; let upload_command_pool = cmd_resources.upload_command_pool; let upload_command_buffer = cmd_resources.upload_command_buffer; - let encode_command_buffer = cmd_resources.encode_command_buffer; let upload_fence = cmd_resources.upload_fence; - let encode_fence = cmd_resources.encode_fence; - // Clear the input image so padding between user dimensions and the - // aligned coded extent is zero-initialized. - clear_input_image( - &context, - &ClearImageParams { - command_buffer: upload_command_buffer, - fence: upload_fence, - queue: context.transfer_queue(), - image: input_image, - width: aligned_width, - height: aligned_height, - pixel_format: config.pixel_format, - bit_depth: config.bit_depth, - }, - )?; - - // Create query pool. 
- let mut h264_profile_info_query = - vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); - - let mut profile_info_query = vk::VideoProfileInfoKHR::default() - .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) - .chroma_subsampling(chroma_subsampling) - .luma_bit_depth(luma_bit_depth) - .chroma_bit_depth(chroma_bit_depth); - profile_info_query.p_next = - (&mut h264_profile_info_query as *mut vk::VideoEncodeH264ProfileInfoKHR).cast(); - - let mut encode_feedback_create = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() - .encode_feedback_flags( - vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET - | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, - ); - - encode_feedback_create.p_next = - (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + // Allocate ENCODE_PIPELINE_DEPTH-1 additional encode command buffers. + let extra_buffers_needed = super::ENCODE_PIPELINE_DEPTH.saturating_sub(1) as u32; + let extra_encode_buffers: Vec = if extra_buffers_needed > 0 { + let alloc_info = vk::CommandBufferAllocateInfo::default() + .command_pool(command_pool) + .level(vk::CommandBufferLevel::PRIMARY) + .command_buffer_count(extra_buffers_needed); + unsafe { context.device().allocate_command_buffers(&alloc_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + } else { + Vec::new() + }; - let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() - .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) - .query_count(1); - query_pool_create_info.p_next = (&mut encode_feedback_create - as *mut vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) - .cast(); + // Build per-slot resources. 
+ let mut slots: Vec = Vec::with_capacity(super::ENCODE_PIPELINE_DEPTH); + for slot_idx in 0..super::ENCODE_PIPELINE_DEPTH { + let (input_image, input_image_memory, input_image_view) = create_image( + &context, + aligned_width, + aligned_height, + picture_format, + false, + &profile_for_resources, + )?; + + let (bitstream_buffer, bitstream_buffer_memory) = create_bitstream_buffer( + &context, + MIN_BITSTREAM_BUFFER_SIZE, + &profile_for_resources, + )?; + let bitstream_buffer_ptr = + map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; + + if use_rgb_input { + clear_rgb_input_image( + &context, + upload_command_buffer, + upload_fence, + context.transfer_queue(), + input_image, + )?; + } else { + clear_input_image( + &context, + &ClearImageParams { + command_buffer: upload_command_buffer, + fence: upload_fence, + queue: context.transfer_queue(), + image: input_image, + width: aligned_width, + height: aligned_height, + pixel_format: config.pixel_format, + bit_depth: config.bit_depth, + }, + )?; + } - let query_pool = unsafe { - context - .device() - .create_query_pool(&query_pool_create_info, None) + let encode_command_buffer = if slot_idx == 0 { + cmd_resources.encode_command_buffer + } else { + extra_encode_buffers[slot_idx - 1] + }; + + let encode_fence = if slot_idx == 0 { + cmd_resources.encode_fence + } else { + let signaled = vk::FenceCreateInfo::default().flags(vk::FenceCreateFlags::SIGNALED); + unsafe { context.device().create_fence(&signaled, None) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + }; + + // Per-slot single-query pool. 
+ let mut rgb_conv_profile_query = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); + let mut h264_profile_info_query = + vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h264_profile_info_query.p_next = (&mut rgb_conv_profile_query + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } + let mut profile_info_query = vk::VideoProfileInfoKHR::default() + .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) + .chroma_subsampling(chroma_subsampling) + .luma_bit_depth(luma_bit_depth) + .chroma_bit_depth(chroma_bit_depth); + profile_info_query.p_next = + (&mut h264_profile_info_query as *mut vk::VideoEncodeH264ProfileInfoKHR).cast(); + let mut encode_feedback_create = + vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default().encode_feedback_flags( + vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET + | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, + ); + encode_feedback_create.p_next = + (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() + .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) + .query_count(1); + query_pool_create_info.p_next = (&mut encode_feedback_create + as *mut vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) + .cast(); + let query_pool = unsafe { + context + .device() + .create_query_pool(&query_pool_create_info, None) + } + .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; + + slots.push(super::EncodeSlot { + input_image, + input_image_memory, + input_image_view, + input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + bitstream_buffer, + bitstream_buffer_memory, + bitstream_buffer_ptr, + encode_command_buffer, + encode_fence, + query_pool, + in_flight: false, + pending_metadata: None, + }); } - .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; // Create DPB and GOP structure. 
// The DPB size should match the actual number of allocated DPB slots. @@ -565,10 +681,8 @@ impl H264Encoder { encode_frame_num: 0, frame_num_syntax: 0, idr_pic_id: 0, - input_image, - input_image_memory, - input_image_view, - input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + slots, + current_slot: 0, dpb_images, dpb_image_memories, dpb_image_views, @@ -578,16 +692,10 @@ impl H264Encoder { current_dpb_slot: 0, l0_references: Vec::new(), active_reference_count: max_active_reference_pictures as u32, - bitstream_buffer, - bitstream_buffer_memory, - bitstream_buffer_ptr, command_pool, upload_command_pool, upload_command_buffer, upload_fence, - encode_command_buffer, - encode_fence, - query_pool, sps_written: false, // has_reference: false, // removed // reference_frame_num: 0, // removed diff --git a/src/encoder/h264/mod.rs b/src/encoder/h264/mod.rs index 3d339d3..4be9a15 100644 --- a/src/encoder/h264/mod.rs +++ b/src/encoder/h264/mod.rs @@ -10,9 +10,7 @@ mod session_params; use ash::vk; use tracing::debug; -use crate::encoder::resources::{ - destroy_encoder_resources, upload_image_to_input, EncoderResources, UploadParams, -}; +use crate::encoder::resources::{upload_image_to_input, UploadParams}; use crate::error::Result; use crate::encoder::dpb::DecodedPictureBuffer; @@ -23,6 +21,42 @@ use crate::vulkan::VideoContext; /// H.264 macroblock size in pixels. pub const MB_SIZE: u32 = 16; +/// Number of in-flight encode slots. Depth=2 lets frame N+1 begin encoding +/// while frame N is still on the encode hardware, so the per-frame budget +/// becomes 2 × frame_interval (16.6ms at 120fps) instead of 1 ×. +pub(crate) const ENCODE_PIPELINE_DEPTH: usize = 2; + +/// One slot's worth of per-frame encode resources. Mirrors the H.265 design +/// (see encoder::h265::EncodeSlot). 
+pub(crate) struct EncodeSlot { + pub input_image: vk::Image, + pub input_image_memory: vk::DeviceMemory, + pub input_image_view: vk::ImageView, + pub input_image_layout: vk::ImageLayout, + + pub bitstream_buffer: vk::Buffer, + pub bitstream_buffer_memory: vk::DeviceMemory, + pub bitstream_buffer_ptr: *mut u8, + + pub encode_command_buffer: vk::CommandBuffer, + pub encode_fence: vk::Fence, + pub query_pool: vk::QueryPool, + + pub in_flight: bool, + pub pending_metadata: Option, +} + +/// Metadata stashed at submit-time, returned with the bitstream when this +/// slot's encode is drained on a later encode() call. +pub(crate) struct SlotPacketMetadata { + pub frame_type: crate::encoder::FrameType, + pub is_key_frame: bool, + pub pts: u64, + pub dts: u64, + /// SPS/PPS header to prepend (Some only on first IDR). + pub header: Option>, +} + #[derive(Clone, Copy, Debug)] pub(crate) struct ReferenceInfo { pub dpb_slot: u8, @@ -55,12 +89,10 @@ pub struct H264Encoder { frame_num_syntax: u32, idr_pic_id: u32, - // Resources - input_image: vk::Image, - input_image_memory: vk::DeviceMemory, - input_image_view: vk::ImageView, - /// Current Vulkan image layout of `input_image` (tracked to avoid UB when transitioning). - input_image_layout: vk::ImageLayout, + /// Per-frame encode slots. See encoder::h265 for invariants. + pub(crate) slots: Vec, + pub(crate) current_slot: usize, + /// DPB images (up to MAX_DPB_SLOTS for B-frame and long-term reference support). dpb_images: Vec, dpb_image_memories: Vec, @@ -71,19 +103,12 @@ pub struct H264Encoder { use_layered_dpb: bool, /// Tracks which DPB slots have been activated (used at least once). dpb_slot_active: Vec, - bitstream_buffer: vk::Buffer, - bitstream_buffer_memory: vk::DeviceMemory, - /// Persistently mapped pointer to the bitstream buffer (avoids per-frame map/unmap). - bitstream_buffer_ptr: *mut u8, - // Command resources. + // Command pool (encode command buffers per slot allocated from this pool). 
command_pool: vk::CommandPool, upload_command_pool: vk::CommandPool, upload_command_buffer: vk::CommandBuffer, upload_fence: vk::Fence, - encode_command_buffer: vk::CommandBuffer, - encode_fence: vk::Fence, - query_pool: vk::QueryPool, // SPS/PPS written flag. sps_written: bool, @@ -117,7 +142,8 @@ impl H264Encoder { /// with the same dimensions as the encoder configuration. The source image /// should be in GENERAL layout. fn upload_from_image(&mut self, src_image: vk::Image) -> Result<()> { - if src_image == self.input_image { + let slot = &mut self.slots[self.current_slot]; + if src_image == slot.input_image { debug!("Source image is the encoder's input image, skipping upload copy"); return Ok(()); } @@ -126,18 +152,17 @@ impl H264Encoder { upload_command_buffer: self.upload_command_buffer, upload_fence: self.upload_fence, src_image, - dst_image: self.input_image, + dst_image: slot.input_image, width: self.config.dimensions.width, height: self.config.dimensions.height, pixel_format: self.config.pixel_format, - input_image_layout: self.input_image_layout, + input_image_layout: slot.input_image_layout, upload_queue: self.context.transfer_queue(), }; upload_image_to_input(&self.context, ¶ms)?; - // Update tracked layout. - self.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; + slot.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; Ok(()) } @@ -150,37 +175,61 @@ unsafe impl Send for H264Encoder {} impl Drop for H264Encoder { fn drop(&mut self) { unsafe { - // Wait on the queues used by the encoder rather than stalling - // the entire device. 
- let _ = self - .context - .device() - .queue_wait_idle(self.context.transfer_queue()); + let device = self.context.device(); + let _ = device.queue_wait_idle(self.context.transfer_queue()); if let Some(q) = self.context.video_encode_queue() { - let _ = self.context.device().queue_wait_idle(q); + let _ = device.queue_wait_idle(q); + } + + for slot in &mut self.slots { + if !slot.bitstream_buffer_ptr.is_null() { + device.unmap_memory(slot.bitstream_buffer_memory); + slot.bitstream_buffer_ptr = std::ptr::null_mut(); + } + device.destroy_query_pool(slot.query_pool, None); + device.destroy_fence(slot.encode_fence, None); + device.destroy_buffer(slot.bitstream_buffer, None); + device.free_memory(slot.bitstream_buffer_memory, None); + device.destroy_image_view(slot.input_image_view, None); + device.destroy_image(slot.input_image, None); + device.free_memory(slot.input_image_memory, None); + } + + device.destroy_fence(self.upload_fence, None); + device.destroy_command_pool(self.command_pool, None); + if self.upload_command_pool != self.command_pool { + device.destroy_command_pool(self.upload_command_pool, None); } - destroy_encoder_resources( - self.context.device(), - &self.video_queue_fn, - &EncoderResources { - query_pool: self.query_pool, - upload_fence: self.upload_fence, - encode_fence: self.encode_fence, - command_pool: self.command_pool, - upload_command_pool: self.upload_command_pool, - bitstream_buffer: self.bitstream_buffer, - bitstream_buffer_memory: self.bitstream_buffer_memory, - input_image: self.input_image, - input_image_memory: self.input_image_memory, - input_image_view: self.input_image_view, - dpb_images: &self.dpb_images, - dpb_image_memories: &self.dpb_image_memories, - dpb_image_views: &self.dpb_image_views, - session: self.session, - session_params: self.session_params, - session_memory: &self.session_memory, - }, + + for view in &self.dpb_image_views { + device.destroy_image_view(*view, None); + } + for image in &self.dpb_images { + 
device.destroy_image(*image, None); + } + for memory in &self.dpb_image_memories { + device.free_memory(*memory, None); + } + + if self.session_params != vk::VideoSessionParametersKHR::null() { + (self + .video_queue_fn + .fp() + .destroy_video_session_parameters_khr)( + device.handle(), + self.session_params, + std::ptr::null(), + ); + } + (self.video_queue_fn.fp().destroy_video_session_khr)( + device.handle(), + self.session, + std::ptr::null(), ); + + for memory in &self.session_memory { + device.free_memory(*memory, None); + } } } } diff --git a/src/encoder/h265/api.rs b/src/encoder/h265/api.rs index e52b210..3a2c947 100644 --- a/src/encoder/h265/api.rs +++ b/src/encoder/h265/api.rs @@ -14,36 +14,93 @@ impl H265Encoder { /// This image can be used as a target for `ColorConverter::convert` to avoid /// an intermediate copy. pub fn input_image(&self) -> vk::Image { - self.input_image + self.slots[self.current_slot].input_image } /// Encode a frame from a GPU image. /// - /// This accepts a source NV12 image on the GPU and encodes it directly without. - /// any CPU-side data copies. The source image must be in NV12 format with the - /// same dimensions as the encoder configuration, and should be in GENERAL layout. + /// Pipelined: this call submits frame N to the encode queue without waiting, + /// drains the previous in-flight frame from the slot we are about to overwrite, + /// and returns *that* drained frame's `EncodedPacket`. The first call returns + /// an empty Vec (the pipeline is still filling); subsequent calls return one + /// packet per call. Use `flush()` to drain remaining slots at end of stream. + /// + /// The source image must be in NV12 format with the same dimensions as the + /// encoder configuration, and should be in GENERAL layout. /// /// # Panics /// - /// The encoder will panic at creation time if B-frames are enabled (b_frame_count > 0), - /// as B-frame encoding is not yet supported. 
+ /// The encoder will panic at creation time if B-frames are enabled + /// (b_frame_count > 0), as B-frame encoding is not yet supported. pub fn encode(&mut self, src_image: vk::Image) -> Result> { + // Step 1: Drain the slot we're about to overwrite. Its previous encode + // submission must complete before we can re-record its command buffer + // *and* before the converter can write to its input image. Reading the + // bitstream here means the input_image is fully released by the encode + // hardware once we return. + let prev_packet = self.drain_current_slot()?; + let gop_position = self.gop.get_next_frame(); let display_order = self.input_frame_num; self.input_frame_num += 1; debug!( - "Encoding frame {} from GPU image: type={:?}, poc={}", - display_order, gop_position.frame_type, gop_position.pic_order_cnt + "Encoding frame {} from GPU image: type={:?}, poc={}, slot={}", + display_order, gop_position.frame_type, gop_position.pic_order_cnt, self.current_slot ); - // Upload from GPU image. + // Upload from GPU image (no-op when src_image is already the slot's input). self.upload_from_image(src_image)?; - // Encode immediately. - let packet = self.encode_current_frame(&gop_position, display_order)?; + // Step 3: Submit the new encode (no wait) and stash its metadata in the + // slot so it can be returned when this slot is drained next time around. + self.encode_current_frame(&gop_position, display_order)?; - Ok(vec![packet]) + // Step 4: Advance to the next slot for the upcoming frame. + self.current_slot = (self.current_slot + 1) % self.slots.len(); + + // Step 5: Return the packet drained at step 1. Empty Vec until the + // pipeline has filled (first ENCODE_PIPELINE_DEPTH-1 calls). + Ok(prev_packet.into_iter().collect()) + } + + /// Wait for the current slot's previously submitted encode (if any) to + /// finish, read its bitstream, and combine it with the metadata stashed at + /// submit-time into a complete EncodedPacket. 
Returns None if the slot has + /// no in-flight work (initial pipeline-fill phase or after a flush). + fn drain_current_slot(&mut self) -> Result> { + if !self.slots[self.current_slot].in_flight { + return Ok(None); + } + let bitstream = unsafe { + crate::encoder::resources::wait_and_read_bitstream( + self.context.device(), + self.slots[self.current_slot].encode_fence, + self.slots[self.current_slot].query_pool, + self.slots[self.current_slot].bitstream_buffer_ptr, + )? + }; + self.slots[self.current_slot].in_flight = false; + let meta = self.slots[self.current_slot] + .pending_metadata + .take() + .ok_or_else(|| { + PixelForgeError::CommandBuffer( + "Drained slot has bitstream but no metadata; encoder state corrupted" + .to_string(), + ) + })?; + + let mut data = meta.header.unwrap_or_default(); + data.extend_from_slice(&bitstream); + + Ok(Some(EncodedPacket { + data, + frame_type: meta.frame_type, + is_key_frame: meta.is_key_frame, + pts: meta.pts, + dts: meta.dts, + })) } /// Internal method to encode the current frame already uploaded to input_image. @@ -51,7 +108,7 @@ impl H265Encoder { &mut self, gop_position: &GopPosition, display_order: u64, - ) -> Result { + ) -> Result<()> { let is_idr = gop_position.frame_type.is_idr(); let is_reference = gop_position.is_reference; let is_b_frame = gop_position.frame_type == GopFrameType::B; @@ -101,13 +158,11 @@ impl H265Encoder { let pic_order_cnt = gop_position.pic_order_cnt; - let mut encoded_data = Vec::new(); - - // For IDR frames, prepend VPS/SPS/PPS header. - if is_idr { + // For IDR frames, capture VPS/SPS/PPS header to be prepended to the + // bitstream when this slot's encode is drained later. + let header = if is_idr { if self.header_data.is_none() { let header = self.get_h265_header()?; - // Debug: print first few bytes of header. 
debug!( "H.265 header ({} bytes): {:02X?}", header.len(), @@ -115,22 +170,27 @@ impl H265Encoder { ); self.header_data = Some(header); } - if let Some(ref header) = self.header_data { - encoded_data.extend_from_slice(header); - } - } + self.header_data.clone() + } else { + None + }; - let slice_data = self.encode_frame_internal(gop_position, pic_order_cnt, is_idr)?; - // Debug: print first few bytes of slice data. - debug!( - "H.265 slice ({} bytes): {:02X?}", - slice_data.len(), - &slice_data[..std::cmp::min(16, slice_data.len())] - ); - encoded_data.extend_from_slice(&slice_data); + // Submit the encode (no wait, no readback). Marks the slot in_flight. + self.encode_frame_internal(gop_position, pic_order_cnt, is_idr)?; + let dts = self.encode_frame_num; self.encode_frame_num += 1; + // Stash the metadata so drain_current_slot() can build the + // EncodedPacket once the GPU finishes this submission. + self.slots[self.current_slot].pending_metadata = Some(super::SlotPacketMetadata { + frame_type, + is_key_frame: is_idr, + pts: display_order, + dts, + header, + }); + if is_reference { let dpb_pic_type = if is_idr { PictureType::Idr @@ -180,19 +240,34 @@ impl H265Encoder { } } - Ok(EncodedPacket { - data: encoded_data, - frame_type, - is_key_frame: is_idr, - pts: display_order, - dts: self.encode_frame_num - 1, - }) + Ok(()) } - /// Flush the encoder and get any remaining packets. + /// Flush the encoder and drain any remaining in-flight slots. + /// + /// Returns one EncodedPacket per still-in-flight slot, in submission + /// order (so the resulting Vec preserves the encoded sequence). After + /// flush the encoder has no in-flight work. pub fn flush(&mut self) -> Result> { - // No buffered frames in the current implementation. - Ok(Vec::new()) + let mut out = Vec::new(); + // Drain in submission order: starting from current_slot (the slot we + // would *next* overwrite — the oldest one in flight) and advancing + // through the ring. 
Slots with no in_flight are skipped. + for offset in 0..self.slots.len() { + let idx = (self.current_slot + offset) % self.slots.len(); + if !self.slots[idx].in_flight { + continue; + } + // Drain idx's bitstream the same way drain_current_slot does, but + // from an arbitrary slot index. + let saved_current = self.current_slot; + self.current_slot = idx; + if let Some(packet) = self.drain_current_slot()? { + out.push(packet); + } + self.current_slot = saved_current; + } + Ok(out) } /// Request that the next frame be an IDR frame. @@ -278,17 +353,18 @@ impl H265Encoder { /// updated VUI color primaries, transfer characteristics, and matrix coefficients. /// The next encoded frame will be an IDR with the new VPS/SPS/PPS prepended. pub fn set_color_description(&mut self, desc: ColorDescription) -> Result<()> { - // Wait for any in-flight encode to complete before modifying session params. - // Do NOT reset the fence here — submit_encode_and_read_bitstream() resets it - // before queue_submit. Leaving the fence signaled allows consecutive - // set_color_description() calls without deadlock. + // Wait for all in-flight encodes (across every slot) to complete before + // modifying session params. Do NOT reset fences here — submit_encode_only + // resets them before queue_submit, and leaving them signaled allows + // consecutive set_color_description() calls without deadlock. 
+ let fences: Vec = self.slots.iter().map(|s| s.encode_fence).collect(); unsafe { self.context .device() - .wait_for_fences(&[self.encode_fence], true, u64::MAX) + .wait_for_fences(&fences, true, u64::MAX) .map_err(|e| { PixelForgeError::Synchronization(format!( - "Failed to wait for encode fence: {:?}", + "Failed to wait for encode fences: {:?}", e )) })?; diff --git a/src/encoder/h265/encode.rs b/src/encoder/h265/encode.rs index b7a8ce0..5f138e5 100644 --- a/src/encoder/h265/encode.rs +++ b/src/encoder/h265/encode.rs @@ -7,32 +7,30 @@ use super::H265Encoder; use crate::encoder::gop::{GopFrameType, GopPosition}; use crate::encoder::resources::{ prepare_encode_command_buffer, record_dpb_barriers, record_post_encode_dpb_barrier, - submit_encode_and_read_bitstream, MIN_BITSTREAM_BUFFER_SIZE, + submit_encode_only, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::error::{PixelForgeError, Result}; use ash::vk; use tracing::debug; impl H265Encoder { - /// Encode a frame that has already been uploaded to the input image. - /// - /// This function: - /// 1. Records the video encode command buffer - /// 2. Sets up reference picture information - /// 3. Executes the encode operation - /// 4. Returns the encoded bitstream data + /// Records and submits the encode commands for a single frame to the + /// current slot. Does NOT wait for completion or read the bitstream — + /// the caller drains the slot's prior in-flight encode before calling + /// this, and the slot is marked in_flight so a later call can drain the + /// submission made here. pub(super) fn encode_frame_internal( &mut self, gop_position: &GopPosition, pic_order_cnt: i32, is_idr: bool, - ) -> Result> { + ) -> Result<()> { // Prepare command buffer for recording. 
unsafe { prepare_encode_command_buffer( self.context.device(), - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, )?; } @@ -41,7 +39,7 @@ impl H265Encoder { unsafe { record_dpb_barriers( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, self.use_layered_dpb, self.current_dpb_slot, @@ -326,7 +324,7 @@ impl H265Encoder { height: self.aligned_height, }) .base_array_layer(0) - .image_view_binding(self.input_image_view); + .image_view_binding(self.slots[self.current_slot].input_image_view); // Set up setup picture resource (reconstructed picture) let setup_picture_resource = vk::VideoPictureResourceInfoKHR::default() @@ -570,7 +568,7 @@ impl H265Encoder { unsafe { (self.video_queue_fn.fp().cmd_begin_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &begin_coding_info, ); } @@ -594,7 +592,7 @@ impl H265Encoder { unsafe { (self.video_queue_fn.fp().cmd_control_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &control_info, ); } @@ -606,7 +604,7 @@ impl H265Encoder { .src_picture_resource(src_picture_resource) .setup_reference_slot(&setup_slot_info) .reference_slots(&reference_slots) - .dst_buffer(self.bitstream_buffer) + .dst_buffer(self.slots[self.current_slot].bitstream_buffer) .dst_buffer_offset(0) .dst_buffer_range(MIN_BITSTREAM_BUFFER_SIZE as u64); encode_info.p_next = @@ -614,27 +612,29 @@ impl H265Encoder { unsafe { self.context.device().cmd_begin_query( - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, 0, vk::QueryControlFlags::empty(), ); (self.video_encode_fn.fp().cmd_encode_video_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &encode_info, ); - 
self.context - .device() - .cmd_end_query(self.encode_command_buffer, self.query_pool, 0); + self.context.device().cmd_end_query( + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, + 0, + ); } // Add DPB synchronization barrier after encoding. unsafe { record_post_encode_dpb_barrier( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, self.use_layered_dpb, self.current_dpb_slot, @@ -645,7 +645,7 @@ impl H265Encoder { let end_coding_info = vk::VideoEndCodingInfoKHR::default(); unsafe { (self.video_queue_fn.fp().cmd_end_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &end_coding_info, ); } @@ -654,7 +654,7 @@ impl H265Encoder { unsafe { self.context .device() - .end_command_buffer(self.encode_command_buffer) + .end_command_buffer(self.slots[self.current_slot].encode_command_buffer) } .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; @@ -673,22 +673,25 @@ impl H265Encoder { let gpu_start = std::time::Instant::now(); - let encoded_data = unsafe { - submit_encode_and_read_bitstream( + unsafe { + submit_encode_only( self.context.device(), - self.encode_command_buffer, - self.encode_fence, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].encode_fence, encode_queue, - self.query_pool, - self.bitstream_buffer_ptr, - )? - }; + None, + )?; + } - debug!("GPU encode took {:?}", gpu_start.elapsed()); + debug!("Submitted encode (no wait): {:?}", gpu_start.elapsed()); // Mark DPB slot as active. self.dpb_slot_active[self.current_dpb_slot as usize] = true; - Ok(encoded_data) + // Mark the slot as in flight; the bitstream is drained at the start + // of the next encode() call that targets this slot. 
+ self.slots[self.current_slot].in_flight = true; + + Ok(()) } } diff --git a/src/encoder/h265/init.rs b/src/encoder/h265/init.rs index 12e3041..07f53b3 100644 --- a/src/encoder/h265/init.rs +++ b/src/encoder/h265/init.rs @@ -3,9 +3,10 @@ use super::H265Encoder; use crate::encoder::dpb::{DecodedPictureBuffer, DecodedPictureBufferTrait, DpbConfig}; use crate::encoder::gop::GopStructure; use crate::encoder::resources::{ - align_up, allocate_session_memory, clear_input_image, create_bitstream_buffer, - create_command_resources, create_dpb_images, create_image, get_video_format, lcm, - make_codec_name, map_bitstream_buffer, query_supported_video_formats, ClearImageParams, + align_up, allocate_session_memory, clear_input_image, clear_rgb_input_image, + create_bitstream_buffer, create_command_resources, create_dpb_images, create_image, + get_video_format, lcm, make_codec_name, map_bitstream_buffer, query_supported_video_formats, + rgb_conversion_model, rgb_conversion_range, rgb_input_format, ClearImageParams, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::encoder::{BitDepth, ColorDescription, PixelFormat}; @@ -41,6 +42,15 @@ impl H265Encoder { let video_encode_fn = ash::khr::video_encode_queue::Device::load(context.instance(), context.device()); + if config.use_rgb_input && !context.supports_rgb_direct_encode() { + return Err(PixelForgeError::NoSuitableDevice( + "EncodeConfig::use_rgb_input requires VK_VALVE_video_encode_rgb_conversion, \ + which this device does not support." + .to_string(), + )); + } + let use_rgb_input = config.use_rgb_input; + // Get chroma subsampling from pixel format via `From` impl let chroma_subsampling: vk::VideoChromaSubsamplingFlagsKHR = config.pixel_format.into(); @@ -74,9 +84,19 @@ impl H265Encoder { } }; - // Create H.265 encode profile + // Create H.265 encode profile. 
When RGB-direct is enabled we chain + // VkVideoEncodeProfileRgbConversionInfoVALVE on every profile we + // build (capability query, image creation, query pool) — profiles + // must match across all of those. + let mut rgb_conv_profile_info = vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h265_profile_info = vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h265_profile_info.p_next = (&mut rgb_conv_profile_info + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_info = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) @@ -189,7 +209,17 @@ impl H265Encoder { } info!("Supported DPB formats: {:?}", supported_dpb_formats); - let picture_format = if supported_src_formats.contains(&video_format) { + let picture_format = if use_rgb_input { + let rgb_fmt = rgb_input_format(config.bit_depth); + if !supported_src_formats.contains(&rgb_fmt) { + return Err(PixelForgeError::NoSuitableDevice(format!( + "RGB-direct encode requested but driver does not advertise {:?} as a \ + VIDEO_ENCODE_SRC_KHR format for this profile. 
Supported: {:?}", + rgb_fmt, supported_src_formats + ))); + } + rgb_fmt + } else if supported_src_formats.contains(&video_format) { video_format } else { return Err(PixelForgeError::NoSuitableDevice(format!( @@ -198,11 +228,19 @@ impl H265Encoder { ))); }; - let reference_picture_format = supported_dpb_formats - .iter() - .copied() - .find(|f| *f == picture_format) - .unwrap_or(supported_dpb_formats[0]); + let reference_picture_format = if use_rgb_input { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == video_format) + .unwrap_or(supported_dpb_formats[0]) + } else { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == picture_format) + .unwrap_or(supported_dpb_formats[0]) + }; debug!( "Selected Vulkan Video formats: picture_format={:?}, reference_picture_format={:?}", @@ -264,7 +302,18 @@ impl H265Encoder { PixelForgeError::NoSuitableDevice("No video encode queue family available".to_string()) })?; - let session_create_info = vk::VideoSessionCreateInfoKHR::default() + let color_desc = config + .color_description + .unwrap_or(ColorDescription::bt709()); + + let mut session_rgb_conv_info = + vk::VideoEncodeSessionRgbConversionCreateInfoVALVE::default() + .rgb_model(rgb_conversion_model(&color_desc)) + .rgb_range(rgb_conversion_range(&color_desc)) + .x_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::COSITED_EVEN) + .y_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::MIDPOINT); + + let mut session_create_info = vk::VideoSessionCreateInfoKHR::default() .queue_family_index(encode_queue_family) .flags(vk::VideoSessionCreateFlagsKHR::empty()) .video_profile(&profile_info) @@ -277,6 +326,11 @@ impl H265Encoder { .max_dpb_slots(dpb_slot_count as u32) .max_active_reference_pictures(max_active_reference_pictures as u32) .std_header_version(&std_header_version); + if use_rgb_input { + session_create_info.p_next = (&mut session_rgb_conv_info + as *mut vk::VideoEncodeSessionRgbConversionCreateInfoVALVE) + .cast(); + } let mut session = 
vk::VideoSessionKHR::null(); let result = unsafe { @@ -297,14 +351,17 @@ impl H265Encoder { // Query and allocate session memory. let session_memory = allocate_session_memory(&context, session, &video_queue_fn)?; - // Build VPS/SPS/PPS and session parameters via shared helper. - let color_desc = config - .color_description - .unwrap_or(ColorDescription::bt709()); - - // Create profile info for images/buffers + // Create profile info for images/buffers (shared across slots). + let mut rgb_conv_profile_for_resources = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h265_profile_for_resources = vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h265_profile_for_resources.p_next = (&mut rgb_conv_profile_for_resources + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_for_resources = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) .chroma_subsampling(chroma_subsampling) @@ -313,16 +370,6 @@ impl H265Encoder { profile_for_resources.p_next = (&mut h265_profile_for_resources as *mut vk::VideoEncodeH265ProfileInfoKHR).cast(); - // Create input image - let (input_image, input_image_memory, input_image_view) = create_image( - &context, - aligned_width, - aligned_height, - picture_format, - false, - &profile_for_resources, - )?; - // Determine DPB mode: use layered DPB when the driver does not advertise // support for separate reference images (required for AMD RADV). let supports_separate_dpb = capabilities @@ -333,7 +380,8 @@ impl H265Encoder { info!("Using layered DPB (driver does not support separate reference images)"); } - // Create DPB images. + // Create DPB images (shared across all slots — references for the + // entire encode session, not per-frame). 
let (dpb_images, dpb_image_memories, dpb_image_views) = create_dpb_images( &context, aligned_width, @@ -344,77 +392,153 @@ impl H265Encoder { use_layered_dpb, )?; - // Create bitstream buffer. - let (bitstream_buffer, bitstream_buffer_memory) = - create_bitstream_buffer(&context, MIN_BITSTREAM_BUFFER_SIZE, &profile_for_resources)?; - - // Persistently map the bitstream buffer to avoid per-frame map/unmap overhead. - let bitstream_buffer_ptr = - map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; - - // Create command pool, buffers, and fences. - // Use the transfer queue family for upload commands when the encode queue - // doesn't support transfer operations (AMD RADV). + // Create command pool and shared upload resources. The encode command + // buffers (one per slot) are allocated below from `command_pool`. let upload_queue_family = context.transfer_queue_family(); let cmd_resources = create_command_resources(&context, encode_queue_family, upload_queue_family)?; let command_pool = cmd_resources.command_pool; let upload_command_pool = cmd_resources.upload_command_pool; let upload_command_buffer = cmd_resources.upload_command_buffer; - let encode_command_buffer = cmd_resources.encode_command_buffer; let upload_fence = cmd_resources.upload_fence; - let encode_fence = cmd_resources.encode_fence; - - // Clear the input image so padding between user dimensions and the - // aligned coded extent is zero-initialized. 
- clear_input_image( - &context, - &ClearImageParams { - command_buffer: upload_command_buffer, - fence: upload_fence, - queue: context.transfer_queue(), - image: input_image, - width: aligned_width, - height: aligned_height, - pixel_format: config.pixel_format, - bit_depth: config.bit_depth, - }, - )?; - - // Create query pool - let mut h265_profile_info_query = - vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); - - let mut profile_info_query = vk::VideoProfileInfoKHR::default() - .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) - .chroma_subsampling(chroma_subsampling) - .luma_bit_depth(bit_depth_flags) - .chroma_bit_depth(bit_depth_flags); - profile_info_query.p_next = - (&mut h265_profile_info_query as *mut vk::VideoEncodeH265ProfileInfoKHR).cast(); - - let mut encode_feedback_create = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() - .encode_feedback_flags( - vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET - | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, - ); - - encode_feedback_create.p_next = - (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + // The first slot reuses cmd_resources.encode_command_buffer and encode_fence; + // additional slots get fresh buffers/fences below. (Re-using avoids + // changing create_command_resources, which is also used by H264/AV1.) + + // Allocate ENCODE_PIPELINE_DEPTH-1 additional encode command buffers + // from the same pool. + let extra_buffers_needed = super::ENCODE_PIPELINE_DEPTH.saturating_sub(1) as u32; + let extra_encode_buffers: Vec = if extra_buffers_needed > 0 { + let alloc_info = vk::CommandBufferAllocateInfo::default() + .command_pool(command_pool) + .level(vk::CommandBufferLevel::PRIMARY) + .command_buffer_count(extra_buffers_needed); + unsafe { context.device().allocate_command_buffers(&alloc_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? 
+ } else { + Vec::new() + }; - let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() - .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) - .query_count(1); - query_pool_create_info.p_next = (&mut encode_feedback_create - as *mut vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) - .cast(); + // Build per-slot resources. + let mut slots: Vec = Vec::with_capacity(super::ENCODE_PIPELINE_DEPTH); + for slot_idx in 0..super::ENCODE_PIPELINE_DEPTH { + // Input image for this slot. + let (input_image, input_image_memory, input_image_view) = create_image( + &context, + aligned_width, + aligned_height, + picture_format, + false, + &profile_for_resources, + )?; + + // Bitstream buffer for this slot. + let (bitstream_buffer, bitstream_buffer_memory) = create_bitstream_buffer( + &context, + MIN_BITSTREAM_BUFFER_SIZE, + &profile_for_resources, + )?; + let bitstream_buffer_ptr = + map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; + + // Clear the input image so padding between user dimensions and the + // aligned coded extent is zero-initialized. RGB-direct uses the + // single-plane COLOR-aspect path; YUV uses a multi-plane buffer copy. + if use_rgb_input { + clear_rgb_input_image( + &context, + upload_command_buffer, + upload_fence, + context.transfer_queue(), + input_image, + )?; + } else { + clear_input_image( + &context, + &ClearImageParams { + command_buffer: upload_command_buffer, + fence: upload_fence, + queue: context.transfer_queue(), + image: input_image, + width: aligned_width, + height: aligned_height, + pixel_format: config.pixel_format, + bit_depth: config.bit_depth, + }, + )?; + } - let query_pool = unsafe { - context - .device() - .create_query_pool(&query_pool_create_info, None) + // Encode command buffer: slot 0 reuses the one create_command_resources + // already allocated; slots 1..N pull from the extras vec. 
+ let encode_command_buffer = if slot_idx == 0 { + cmd_resources.encode_command_buffer + } else { + extra_encode_buffers[slot_idx - 1] + }; + + // Encode fence: slot 0 reuses the one create_command_resources already + // created (signaled); additional slots get fresh signaled fences. + let encode_fence = if slot_idx == 0 { + cmd_resources.encode_fence + } else { + let signaled = vk::FenceCreateInfo::default().flags(vk::FenceCreateFlags::SIGNALED); + unsafe { context.device().create_fence(&signaled, None) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + }; + + // Per-slot single-query pool (one feedback query per encode submit). + let mut rgb_conv_profile_query = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); + let mut h265_profile_info_query = + vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h265_profile_info_query.p_next = (&mut rgb_conv_profile_query + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } + let mut profile_info_query = vk::VideoProfileInfoKHR::default() + .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) + .chroma_subsampling(chroma_subsampling) + .luma_bit_depth(bit_depth_flags) + .chroma_bit_depth(bit_depth_flags); + profile_info_query.p_next = + (&mut h265_profile_info_query as *mut vk::VideoEncodeH265ProfileInfoKHR).cast(); + let mut encode_feedback_create = + vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default().encode_feedback_flags( + vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET + | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, + ); + encode_feedback_create.p_next = + (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() + .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) + .query_count(1); + query_pool_create_info.p_next = (&mut encode_feedback_create + as *mut 
vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) + .cast(); + let query_pool = unsafe { + context + .device() + .create_query_pool(&query_pool_create_info, None) + } + .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; + + slots.push(super::EncodeSlot { + input_image, + input_image_memory, + input_image_view, + input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + bitstream_buffer, + bitstream_buffer_memory, + bitstream_buffer_ptr, + encode_command_buffer, + encode_fence, + query_pool, + in_flight: false, + pending_metadata: None, + }); } - .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; // Create DPB and GOP structure let mut dpb = DecodedPictureBuffer::new(); @@ -459,25 +583,17 @@ impl H265Encoder { session_memory, input_frame_num: 0, encode_frame_num: 0, - input_image, - input_image_memory, - input_image_view, - input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + slots, + current_slot: 0, dpb_images, dpb_image_memories, dpb_image_views, dpb_slot_count, use_layered_dpb, - bitstream_buffer, - bitstream_buffer_memory, - bitstream_buffer_ptr, command_pool, upload_command_pool, upload_command_buffer, upload_fence, - encode_command_buffer, - encode_fence, - query_pool, header_data: None, has_backward_reference: false, backward_reference_poc: 0, diff --git a/src/encoder/h265/mod.rs b/src/encoder/h265/mod.rs index 522f262..36a8e98 100644 --- a/src/encoder/h265/mod.rs +++ b/src/encoder/h265/mod.rs @@ -12,9 +12,7 @@ use tracing::debug; use crate::encoder::dpb::DecodedPictureBuffer; use crate::encoder::gop::GopStructure; -use crate::encoder::resources::{ - destroy_encoder_resources, upload_image_to_input, EncoderResources, UploadParams, -}; +use crate::encoder::resources::{upload_image_to_input, UploadParams}; use crate::encoder::EncodeConfig; use crate::error::Result; use crate::vulkan::VideoContext; @@ -28,6 +26,59 @@ pub(crate) struct ReferenceInfo { pub poc: i32, } +/// Number of in-flight encode slots. 
Depth=2 lets frame N+1 begin encoding +/// while frame N is still on the encode hardware, so the per-frame budget +/// becomes 2 × frame_interval (16.6ms at 120fps) instead of 1 ×. +pub(crate) const ENCODE_PIPELINE_DEPTH: usize = 2; + +/// One slot's worth of per-frame encode resources. All fields here are +/// duplicated `ENCODE_PIPELINE_DEPTH` times so multiple frames can be +/// in-flight concurrently. See the comment on `slots` below for the rotation +/// invariants. +pub(crate) struct EncodeSlot { + /// Image the converter writes into (and the encoder reads from) for + /// this slot's frame. + pub input_image: vk::Image, + pub input_image_memory: vk::DeviceMemory, + pub input_image_view: vk::ImageView, + /// Tracked layout of `input_image` for safe transitions between frames. + pub input_image_layout: vk::ImageLayout, + + /// Bitstream destination buffer for this slot's encode. + pub bitstream_buffer: vk::Buffer, + pub bitstream_buffer_memory: vk::DeviceMemory, + /// Persistently-mapped pointer (avoids per-frame map/unmap). + pub bitstream_buffer_ptr: *mut u8, + + /// Command buffer recorded fresh each time this slot is used. + pub encode_command_buffer: vk::CommandBuffer, + /// Signaled when the encode for this slot finishes on the GPU. + pub encode_fence: vk::Fence, + /// Single-query pool — one feedback query per encode submission. + pub query_pool: vk::QueryPool, + + /// `true` after we've submitted to this slot but not yet drained it. + /// Used to decide whether `input_image()` must wait before returning. + pub in_flight: bool, + + /// Metadata captured at submission time. The drained bitstream is wrapped + /// in an `EncodedPacket` using this metadata after the next encode() call + /// targeting the same slot waits on its fence. + pub pending_metadata: Option, +} + +/// Frame metadata stashed alongside an in-flight encode submission. 
When the
+/// submission is drained on a later `encode()` call, we reconstruct the
+/// `EncodedPacket` using these fields plus the freshly-read bitstream.
+pub(crate) struct SlotPacketMetadata {
+ pub frame_type: crate::encoder::FrameType,
+ pub is_key_frame: bool,
+ pub pts: u64,
+ pub dts: u64,
+ /// VPS/SPS/PPS header bytes (Some only for IDR frames).
+ pub header: Option<Vec<u8>>,
+}
+
 /// H.265 encoder.
 pub struct H265Encoder {
 context: VideoContext,
@@ -51,12 +102,15 @@ pub struct H265Encoder {
 input_frame_num: u64,
 encode_frame_num: u64,
- // Resources
- input_image: vk::Image,
- input_image_memory: vk::DeviceMemory,
- input_image_view: vk::ImageView,
- /// Current Vulkan image layout of `input_image` (tracked to avoid UB when transitioning).
- input_image_layout: vk::ImageLayout,
+ /// Per-frame slots. Index `current_slot` is the slot we'll use for the
+ /// *next* encode submission (and whose `input_image` `input_image()`
+ /// returns). When `encode()` runs, it drains that slot's previous
+ /// in-flight work (if any), records new commands into it, submits, then
+ /// advances `current_slot` for the next frame. With depth=2 the encoder
+ /// can keep two frames in flight at once.
+ pub(crate) slots: Vec<EncodeSlot>,
+ pub(crate) current_slot: usize,
+
 /// DPB images.
 dpb_images: Vec<vk::Image>,
 dpb_image_memories: Vec<vk::DeviceMemory>,
 dpb_image_views: Vec<vk::ImageView>,
@@ -65,19 +119,12 @@
 dpb_slot_count: usize,
 /// Whether the DPB uses a single layered image (true) or separate images (false).
 use_layered_dpb: bool,
- bitstream_buffer: vk::Buffer,
- bitstream_buffer_memory: vk::DeviceMemory,
- /// Persistently mapped pointer to the bitstream buffer (avoids per-frame map/unmap).
- bitstream_buffer_ptr: *mut u8,
- // Command resources.
+ // Command pool (encode + upload command buffers allocated from these).
command_pool: vk::CommandPool,
 upload_command_pool: vk::CommandPool,
 upload_command_buffer: vk::CommandBuffer,
 upload_fence: vk::Fence,
- encode_command_buffer: vk::CommandBuffer,
- encode_fence: vk::Fence,
- query_pool: vk::QueryPool,
 // Parameter sets - cached header data (VPS/SPS/PPS)
 header_data: Option<Vec<u8>>,
@@ -111,7 +158,8 @@ impl H265Encoder {
 /// with the same dimensions as the encoder configuration. The source image
 /// should be in GENERAL layout.
 fn upload_from_image(&mut self, src_image: vk::Image) -> Result<()> {
- if src_image == self.input_image {
+ let slot = &mut self.slots[self.current_slot];
+ if src_image == slot.input_image {
 debug!("Source image is the encoder's input image, skipping upload copy");
 return Ok(());
 }
@@ -120,18 +168,18 @@
 upload_command_buffer: self.upload_command_buffer,
 upload_fence: self.upload_fence,
 src_image,
- dst_image: self.input_image,
+ dst_image: slot.input_image,
 width: self.config.dimensions.width,
 height: self.config.dimensions.height,
 pixel_format: self.config.pixel_format,
- input_image_layout: self.input_image_layout,
+ input_image_layout: slot.input_image_layout,
 upload_queue: self.context.transfer_queue(),
 };
 upload_image_to_input(&self.context, &params)?;
 // Update tracked layout.
- self.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR;
+ slot.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR;
 Ok(())
 }
@@ -144,37 +192,68 @@
unsafe impl Send for H265Encoder {}
impl Drop for H265Encoder {
 fn drop(&mut self) {
 unsafe {
+ let device = self.context.device();
 // Wait on the queues used by the encoder rather than stalling
 // the entire device.
- let _ = self - .context - .device() - .queue_wait_idle(self.context.transfer_queue()); + let _ = device.queue_wait_idle(self.context.transfer_queue()); if let Some(q) = self.context.video_encode_queue() { - let _ = self.context.device().queue_wait_idle(q); + let _ = device.queue_wait_idle(q); + } + + // Destroy per-slot resources first (each slot has its own image, + // bitstream buffer, fence, query pool, and command buffer that + // was allocated from `command_pool`). The command buffers are + // freed implicitly when the pool is destroyed below. + for slot in &mut self.slots { + if !slot.bitstream_buffer_ptr.is_null() { + device.unmap_memory(slot.bitstream_buffer_memory); + slot.bitstream_buffer_ptr = std::ptr::null_mut(); + } + device.destroy_query_pool(slot.query_pool, None); + device.destroy_fence(slot.encode_fence, None); + device.destroy_buffer(slot.bitstream_buffer, None); + device.free_memory(slot.bitstream_buffer_memory, None); + device.destroy_image_view(slot.input_image_view, None); + device.destroy_image(slot.input_image, None); + device.free_memory(slot.input_image_memory, None); + } + + // Shared resources. 
+ device.destroy_fence(self.upload_fence, None); + device.destroy_command_pool(self.command_pool, None); + if self.upload_command_pool != self.command_pool { + device.destroy_command_pool(self.upload_command_pool, None); + } + + for view in &self.dpb_image_views { + device.destroy_image_view(*view, None); + } + for image in &self.dpb_images { + device.destroy_image(*image, None); } - destroy_encoder_resources( - self.context.device(), - &self.video_queue_fn, - &EncoderResources { - query_pool: self.query_pool, - upload_fence: self.upload_fence, - encode_fence: self.encode_fence, - command_pool: self.command_pool, - upload_command_pool: self.upload_command_pool, - bitstream_buffer: self.bitstream_buffer, - bitstream_buffer_memory: self.bitstream_buffer_memory, - input_image: self.input_image, - input_image_memory: self.input_image_memory, - input_image_view: self.input_image_view, - dpb_images: &self.dpb_images, - dpb_image_memories: &self.dpb_image_memories, - dpb_image_views: &self.dpb_image_views, - session: self.session, - session_params: self.session_params, - session_memory: &self.session_memory, - }, + for memory in &self.dpb_image_memories { + device.free_memory(*memory, None); + } + + if self.session_params != vk::VideoSessionParametersKHR::null() { + (self + .video_queue_fn + .fp() + .destroy_video_session_parameters_khr)( + device.handle(), + self.session_params, + std::ptr::null(), + ); + } + (self.video_queue_fn.fp().destroy_video_session_khr)( + device.handle(), + self.session, + std::ptr::null(), ); + + for memory in &self.session_memory { + device.free_memory(*memory, None); + } } } } diff --git a/src/encoder/mod.rs b/src/encoder/mod.rs index 983ba12..707e39e 100644 --- a/src/encoder/mod.rs +++ b/src/encoder/mod.rs @@ -224,6 +224,16 @@ pub struct EncodeConfig { /// Color description for VUI signaling. /// Defaults to BT.709 (full-range) when `None`. 
pub color_description: Option<ColorDescription>,
+ /// When true, configure the encoder to take **RGB** input images directly
+ /// and have the hardware perform RGB→YUV conversion inline during encode.
+ /// Requires the device to advertise `VK_VALVE_video_encode_rgb_conversion`
+ /// (currently only AMD's RADV driver). Trying to enable this on a device
+ /// that doesn't support it returns an error from `Encoder::new`.
+ ///
+ /// When false (the default), the encoder takes its native YUV input
+ /// format (NV12 / P010 / 4:4:4 variants) and the caller is responsible
+ /// for converting RGB sources beforehand.
+ pub use_rgb_input: bool,
}

impl EncodeConfig {
@@ -249,6 +259,7 @@
 virtual_buffer_size_ms: 1000,
 initial_virtual_buffer_size_ms: 1000,
 color_description: None,
+ use_rgb_input: false,
 }
 }
@@ -274,6 +285,7 @@
 virtual_buffer_size_ms: 1000,
 initial_virtual_buffer_size_ms: 1000,
 color_description: None,
+ use_rgb_input: false,
 }
 }
@@ -299,6 +311,7 @@
 virtual_buffer_size_ms: 1000,
 initial_virtual_buffer_size_ms: 1000,
 color_description: None,
+ use_rgb_input: false,
 }
 }
@@ -383,6 +396,13 @@
 self.color_description = Some(desc);
 self
 }
+
+ /// Enable hardware-direct RGB input (`VK_VALVE_video_encode_rgb_conversion`).
+ /// See [`EncodeConfig::use_rgb_input`].
+ pub fn with_rgb_input(mut self, enable: bool) -> Self {
+ self.use_rgb_input = enable;
+ self
+ }
}

/// Encoded video packet.
@@ -466,7 +486,7 @@ impl Encoder {
 /// # let yuv_data = vec![0u8; 1920 * 1080 * 3 / 2];
 /// input.upload_yuv420(&yuv_data)?;
 ///
- /// // Encode the image
+ /// // Encode the image (no GPU wait semaphore needed when uploaded synchronously).
/// let packets = encoder.encode(input.image())?; /// # Ok(()) /// # } diff --git a/src/encoder/resources.rs b/src/encoder/resources.rs index d63d157..c30ac9d 100644 --- a/src/encoder/resources.rs +++ b/src/encoder/resources.rs @@ -716,6 +716,149 @@ pub(crate) struct ClearImageParams { pub bit_depth: BitDepth, } +/// Pick the input image format for the hardware-direct RGB encode path +/// (`VK_VALVE_video_encode_rgb_conversion`). 8-bit picks B8G8R8A8_UNORM, +/// 10-bit picks A2B10G10R10_UNORM_PACK32 — these match the formats RADV's +/// VCN5 driver accepts for the RGB-conversion path, and are the +/// `B`/`ABGR` variants that gamescope's override-surface DMA-BUFs +/// typically arrive in (avoiding a channel swap). +pub(crate) fn rgb_input_format(bit_depth: BitDepth) -> vk::Format { + match bit_depth { + BitDepth::Eight => vk::Format::B8G8R8A8_UNORM, + BitDepth::Ten => vk::Format::A2B10G10R10_UNORM_PACK32, + } +} + +/// Pick the RGB→YUV model the hardware should apply, based on the +/// configured colour description (BT.709 vs BT.2020). +pub(crate) fn rgb_conversion_model( + desc: &crate::encoder::ColorDescription, +) -> vk::VideoEncodeRgbModelConversionFlagsVALVE { + if desc.matrix_coefficients == 9 { + vk::VideoEncodeRgbModelConversionFlagsVALVE::YCBCR_2020 + } else { + vk::VideoEncodeRgbModelConversionFlagsVALVE::YCBCR_709 + } +} + +/// Pick the RGB range compression flag (full vs limited) from the colour +/// description. +pub(crate) fn rgb_conversion_range( + desc: &crate::encoder::ColorDescription, +) -> vk::VideoEncodeRgbRangeCompressionFlagsVALVE { + if desc.full_range { + vk::VideoEncodeRgbRangeCompressionFlagsVALVE::FULL_RANGE + } else { + vk::VideoEncodeRgbRangeCompressionFlagsVALVE::NARROW_RANGE + } +} + +/// Clear an RGB-formatted encode input image (used by the +/// `VK_VALVE_video_encode_rgb_conversion` path) to opaque black, then +/// transition it to `VIDEO_ENCODE_SRC_KHR`. 
+/// +/// RGB encode inputs are single-plane `COLOR` aspect images, so the +/// multi-plane buffer-copy path used for YUV doesn't apply. We just +/// `vkCmdClearColorImage` to zero, then barrier into encode layout. +pub(crate) fn clear_rgb_input_image( + context: &VideoContext, + command_buffer: vk::CommandBuffer, + fence: vk::Fence, + queue: vk::Queue, + image: vk::Image, +) -> Result<()> { + let device = context.device(); + + unsafe { device.reset_command_buffer(command_buffer, vk::CommandBufferResetFlags::empty()) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; + + let begin_info = + vk::CommandBufferBeginInfo::default().flags(vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT); + unsafe { device.begin_command_buffer(command_buffer, &begin_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; + + let subresource = vk::ImageSubresourceRange { + aspect_mask: vk::ImageAspectFlags::COLOR, + base_mip_level: 0, + level_count: 1, + base_array_layer: 0, + layer_count: 1, + }; + + let to_transfer = vk::ImageMemoryBarrier::default() + .old_layout(vk::ImageLayout::UNDEFINED) + .new_layout(vk::ImageLayout::TRANSFER_DST_OPTIMAL) + .src_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .dst_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .image(image) + .subresource_range(subresource) + .src_access_mask(vk::AccessFlags::empty()) + .dst_access_mask(vk::AccessFlags::TRANSFER_WRITE); + + unsafe { + device.cmd_pipeline_barrier( + command_buffer, + vk::PipelineStageFlags::TOP_OF_PIPE, + vk::PipelineStageFlags::TRANSFER, + vk::DependencyFlags::empty(), + &[], + &[], + &[to_transfer], + ); + } + + let clear_color = vk::ClearColorValue { + float32: [0.0, 0.0, 0.0, 1.0], + }; + unsafe { + device.cmd_clear_color_image( + command_buffer, + image, + vk::ImageLayout::TRANSFER_DST_OPTIMAL, + &clear_color, + &[subresource], + ); + } + + let to_encode = vk::ImageMemoryBarrier::default() + .old_layout(vk::ImageLayout::TRANSFER_DST_OPTIMAL) + 
.new_layout(vk::ImageLayout::VIDEO_ENCODE_SRC_KHR) + .src_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .dst_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .image(image) + .subresource_range(subresource) + .src_access_mask(vk::AccessFlags::TRANSFER_WRITE) + .dst_access_mask(vk::AccessFlags::empty()); + + unsafe { + device.cmd_pipeline_barrier( + command_buffer, + vk::PipelineStageFlags::TRANSFER, + vk::PipelineStageFlags::BOTTOM_OF_PIPE, + vk::DependencyFlags::empty(), + &[], + &[], + &[to_encode], + ); + } + + unsafe { device.end_command_buffer(command_buffer) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; + + let submit_info = + vk::SubmitInfo::default().command_buffers(std::slice::from_ref(&command_buffer)); + unsafe { device.reset_fences(&[fence]) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("reset fence: {}", e)))?; + unsafe { device.queue_submit(queue, &[submit_info], fence) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("submit rgb clear: {}", e)))?; + unsafe { device.wait_for_fences(&[fence], true, u64::MAX) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("wait rgb clear: {}", e)))?; + unsafe { device.reset_fences(&[fence]) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("reset fence after clear: {}", e)))?; + + Ok(()) +} + /// Clear the input image by filling it with zeros via a staging buffer. /// /// This must be called once after creating the input image to ensure @@ -1159,77 +1302,6 @@ pub(crate) fn upload_image_to_input( Ok(()) } -/// Parameters for cleaning up shared encoder resources. 
-pub(crate) struct EncoderResources<'a> { - pub query_pool: vk::QueryPool, - pub upload_fence: vk::Fence, - pub encode_fence: vk::Fence, - pub command_pool: vk::CommandPool, - pub upload_command_pool: vk::CommandPool, - pub bitstream_buffer: vk::Buffer, - pub bitstream_buffer_memory: vk::DeviceMemory, - pub input_image: vk::Image, - pub input_image_memory: vk::DeviceMemory, - pub input_image_view: vk::ImageView, - pub dpb_images: &'a [vk::Image], - pub dpb_image_memories: &'a [vk::DeviceMemory], - pub dpb_image_views: &'a [vk::ImageView], - pub session: vk::VideoSessionKHR, - pub session_params: vk::VideoSessionParametersKHR, - pub session_memory: &'a [vk::DeviceMemory], -} - -/// Destroy all shared encoder resources. -/// -/// # Safety -/// -/// All queues that may reference these resources (transfer and video encode) -/// must be idle before calling this function. -pub(crate) unsafe fn destroy_encoder_resources( - device: &ash::Device, - video_queue_fn: &ash::khr::video_queue::Device, - res: &EncoderResources, -) { - device.destroy_query_pool(res.query_pool, None); - device.destroy_fence(res.upload_fence, None); - device.destroy_fence(res.encode_fence, None); - device.destroy_command_pool(res.command_pool, None); - if res.upload_command_pool != res.command_pool { - device.destroy_command_pool(res.upload_command_pool, None); - } - - device.unmap_memory(res.bitstream_buffer_memory); - device.destroy_buffer(res.bitstream_buffer, None); - device.free_memory(res.bitstream_buffer_memory, None); - - device.destroy_image_view(res.input_image_view, None); - device.destroy_image(res.input_image, None); - device.free_memory(res.input_image_memory, None); - - for view in res.dpb_image_views { - device.destroy_image_view(*view, None); - } - for image in res.dpb_images { - device.destroy_image(*image, None); - } - for memory in res.dpb_image_memories { - device.free_memory(*memory, None); - } - - if res.session_params != vk::VideoSessionParametersKHR::null() { - 
(video_queue_fn.fp().destroy_video_session_parameters_khr)( - device.handle(), - res.session_params, - std::ptr::null(), - ); - } - (video_queue_fn.fp().destroy_video_session_khr)(device.handle(), res.session, std::ptr::null()); - - for memory in res.session_memory { - device.free_memory(*memory, None); - } -} - /// Record DPB image barriers for encode. /// /// Transitions the setup DPB slot from UNDEFINED to VIDEO_ENCODE_DPB and @@ -1397,55 +1469,75 @@ pub(crate) unsafe fn record_post_encode_dpb_barrier( ); } -/// Submit an encode command buffer and wait for completion. +/// Submit an encode command buffer to the encode queue without waiting. +/// +/// This is the asynchronous half of the encode submit. Use `wait_and_read_bitstream` +/// later to drain the result. Lets pipelined encoders (H.265 with depth > 1) keep +/// multiple encodes in flight on the encode queue. /// -/// Submits the command buffer to the encode queue, waits for the fence, -/// then reads query results and copies the encoded bitstream data. -/// The fence is reset before submission so it may be in any state on entry. +/// The fence is reset before submission so it may be in any state on entry, and +/// will be signaled when the GPU encode finishes. /// /// # Safety /// /// The command buffer must have been ended. -/// The bitstream buffer pointer must be valid and the buffer must be persistently mapped. 
-pub(crate) unsafe fn submit_encode_and_read_bitstream(
+pub(crate) unsafe fn submit_encode_only(
 device: &ash::Device,
 command_buffer: vk::CommandBuffer,
 fence: vk::Fence,
 encode_queue: vk::Queue,
- query_pool: vk::QueryPool,
- bitstream_buffer_ptr: *const u8,
-) -> Result<Vec<u8>> {
- let submit_info =
+ wait_semaphore: Option<vk::Semaphore>,
+) -> Result<()> {
+ let wait_semaphores: Vec<vk::Semaphore>;
+ let wait_dst_stage_mask: Vec<vk::PipelineStageFlags>;
+
+ let mut submit_info =
 vk::SubmitInfo::default().command_buffers(std::slice::from_ref(&command_buffer));
- // Reset the fence before submit (it may be signaled from a previous encode
- // or from initial creation with SIGNALED_BIT). This ensures the fence is
- // unsignaled for queue_submit, and after wait_for_fences it stays signaled —
- // which lets set_color_description() safely wait on it between encodes.
+ if let Some(sem) = wait_semaphore {
+ wait_semaphores = vec![sem];
+ wait_dst_stage_mask = vec![vk::PipelineStageFlags::ALL_COMMANDS];
+ submit_info = submit_info
+ .wait_semaphores(&wait_semaphores)
+ .wait_dst_stage_mask(&wait_dst_stage_mask);
+ }
+
 device
 .reset_fences(&[fence])
 .map_err(|e| PixelForgeError::Synchronization(e.to_string()))?;
-
 device
 .queue_submit(encode_queue, &[submit_info], fence)
 .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?;
+ Ok(())
+}

+/// Wait on the encode fence and read the bitstream produced by a prior
+/// `submit_encode_only` call on the same fence/query_pool/buffer triple.
+///
+/// # Safety
+///
+/// The fence must be the one signaled by the encode submission whose bitstream
+/// is being drained here, and `bitstream_buffer_ptr` must point to the
+/// persistently-mapped bitstream buffer for that submission.
+pub(crate) unsafe fn wait_and_read_bitstream(
+ device: &ash::Device,
+ fence: vk::Fence,
+ query_pool: vk::QueryPool,
+ bitstream_buffer_ptr: *const u8,
+) -> Result<Vec<u8>> {
 device
 .wait_for_fences(&[fence], true, u64::MAX)
 .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?;
- // Read query results (offset + bytes_written).
 #[repr(C)]
 struct QueryResult {
 offset: u32,
 bytes_written: u32,
 }
-
 let mut query_results = [QueryResult {
 offset: 0,
 bytes_written: 0,
 }];
-
 device
 .get_query_pool_results(
 query_pool,
@@ -1457,19 +1549,16 @@ pub(crate) unsafe fn submit_encode_and_read_bitstream(
 let offset = query_results[0].offset as usize;
 let size = query_results[0].bytes_written as usize;
-
 if size == 0 {
 return Err(PixelForgeError::QueryPool(
 "Encoder produced 0 bytes".to_string(),
 ));
 }
-
 tracing::debug!("Encoded frame: offset={}, size={}", offset, size);
 let mut encoded_data = vec![0u8; size];
 let src = std::slice::from_raw_parts(bitstream_buffer_ptr.add(offset), size);
 encoded_data.copy_from_slice(src);
-
 Ok(encoded_data)
}
diff --git a/src/vulkan.rs b/src/vulkan.rs
index f3fa382..f4718b7 100644
--- a/src/vulkan.rs
+++ b/src/vulkan.rs
@@ -78,6 +78,12 @@ struct VideoContextInner {
 device_properties: vk::PhysicalDeviceProperties,
 supported_encode_codecs: Vec,
 has_descriptor_buffer: bool,
+ /// `true` when `VK_VALVE_video_encode_rgb_conversion` was both reported by
+ /// the device and enabled at `vkCreateDevice` time. Encoder codepaths
+ /// can use this to opt into the hardware-direct RGB→YUV path that lets
+ /// VCN do the colour conversion inline (skipping a separate compute
+ /// shader). When `false`, callers must run their own RGB→YUV step.
+ rgb_conversion_supported: bool,
}

impl Drop for VideoContextInner {
@@ -159,6 +165,17 @@ impl VideoContext {
 pub fn has_descriptor_buffer(&self) -> bool {
 self.inner.has_descriptor_buffer
 }
+
+ /// Returns `true` when the device supports — and we have enabled —
+ /// `VK_VALVE_video_encode_rgb_conversion`.
When this is true, encoders + /// may opt into the hardware-direct RGB input path (VCN performs the + /// RGB→YUV conversion inline during the encode pass), eliminating the + /// need for a separate compute-shader colour converter. Currently only + /// AMD's RADV driver supports this; on other vendors the answer is + /// `false` and the caller must keep its own conversion path. + pub fn supports_rgb_direct_encode(&self) -> bool { + self.inner.rgb_conversion_supported + } } impl VideoContext { @@ -465,6 +482,36 @@ impl VideoContext { // Add the 2-plane 444 formats extension. push_ext(ash::ext::ycbcr_2plane_444_formats::NAME.as_ptr()); + // Probe for VK_VALVE_video_encode_rgb_conversion. When the device + // supports it we enable both the extension and its feature, which + // lets encoders take RGB images directly and have VCN do RGB→YUV + // conversion inline. Vendors without the extension (NVIDIA, Intel, + // most non-RADV stacks today) fall through and callers continue + // using their own RGB→YUV step. + let device_exts = + unsafe { instance.enumerate_device_extension_properties(physical_device) } + .unwrap_or_default(); + let rgb_conversion_supported = video_encode_queue_family.is_some() + && device_exts.iter().any(|ext| { + ext.extension_name_as_c_str() + .map(|n| n == ash::valve::video_encode_rgb_conversion::NAME) + .unwrap_or(false) + }); + let mut rgb_conv_features = + vk::PhysicalDeviceVideoEncodeRgbConversionFeaturesVALVE::default() + .video_encode_rgb_conversion(true); + if rgb_conversion_supported { + push_ext(ash::valve::video_encode_rgb_conversion::NAME.as_ptr()); + info!( + "VK_VALVE_video_encode_rgb_conversion supported, enabling RGB-direct encode path" + ); + } else if video_encode_queue_family.is_some() { + debug!( + "VK_VALVE_video_encode_rgb_conversion not supported on this device — \ + encoders will run with caller-provided YUV input" + ); + } + // Enable AV1 video encode feature only if AV1 is supported. 
// Only include AV1 features in the pNext chain when AV1 is actually supported, // to avoid chaining unknown feature structs on devices without AV1. @@ -536,6 +583,24 @@ impl VideoContext { && desc_buf_features.descriptor_buffer != 0 && desc_buf_features.descriptor_buffer_capture_replay != 0; + // Splice the RGB conversion feature onto the end of the chain when + // the device supports it. Walks from `sync2_features` (which the + // descriptor-buffer block leaves at the middle of the chain) to the + // tail and appends — keeping it conditional avoids passing an + // unknown feature struct on devices that don't recognise it. + if rgb_conversion_supported { + unsafe { + let mut cursor: *mut vk::BaseOutStructure = + (&mut sync2_features as *mut vk::PhysicalDeviceSynchronization2Features).cast(); + while !(*cursor).p_next.is_null() { + cursor = (*cursor).p_next.cast(); + } + (*cursor).p_next = (&mut rgb_conv_features + as *mut vk::PhysicalDeviceVideoEncodeRgbConversionFeaturesVALVE) + .cast(); + } + } + // Log all extensions being enabled debug!("Enabling {} device extensions:", extension_names.len()); for ext_name_ptr in &extension_names { @@ -589,6 +654,7 @@ impl VideoContext { device_properties, supported_encode_codecs, has_descriptor_buffer, + rgb_conversion_supported, }), }) }