diff --git a/src/converter/mod.rs b/src/converter/mod.rs index 1133922..a8de10c 100644 --- a/src/converter/mod.rs +++ b/src/converter/mod.rs @@ -541,6 +541,12 @@ impl ColorConverter { /// /// # Returns /// Returns `Ok(())` on success. The target_image is transitioned to VIDEO_ENCODE_SRC_KHR. + /// Convert an RGB source image to YUV, writing to the target image. + /// + /// Submits the command buffer and waits synchronously on a fence before + /// returning. The caller is responsible for any further sync between + /// convert and downstream consumers (e.g. an encoder reading the target + /// image). pub fn convert( &mut self, src_image: vk::Image, @@ -858,7 +864,8 @@ impl ColorConverter { .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; } - // Submit and wait. + // Submit and wait synchronously on the fence — no semaphore overlap + // with the encoder; the caller is responsible for any further sync. unsafe { device .reset_fences(&[self.fence]) diff --git a/src/encoder/av1/api.rs b/src/encoder/av1/api.rs index 224a989..5d74365 100644 --- a/src/encoder/av1/api.rs +++ b/src/encoder/av1/api.rs @@ -12,7 +12,7 @@ impl AV1Encoder { /// This image can be used as a target for `ColorConverter::convert` to avoid /// an intermediate copy. pub fn input_image(&self) -> vk::Image { - self.input_image + self.slots[self.current_slot].input_image } /// Encode a frame from a GPU image. @@ -26,22 +26,60 @@ impl AV1Encoder { /// The encoder will panic at creation time if B-frames are enabled (b_frame_count > 0), /// as B-frame encoding is not yet supported. 
pub fn encode(&mut self, src_image: vk::Image) -> Result> { + let prev_packet = self.drain_current_slot()?; + let gop_position = self.gop.get_next_frame(); let display_order = self.input_frame_num; self.input_frame_num += 1; debug!( - "AV1 encode: frame {} from GPU image, type={:?}", - display_order, gop_position.frame_type + "AV1 encode: frame {} type={:?}, slot={}", + display_order, gop_position.frame_type, self.current_slot ); - // Upload from GPU image. self.upload_from_image(src_image)?; + self.encode_current_frame(&gop_position, display_order)?; - // Encode immediately. - let packet = self.encode_current_frame(&gop_position, display_order)?; + self.current_slot = (self.current_slot + 1) % self.slots.len(); + Ok(prev_packet.into_iter().collect()) + } - Ok(vec![packet]) + fn drain_current_slot(&mut self) -> Result> { + if !self.slots[self.current_slot].in_flight { + return Ok(None); + } + let bitstream = unsafe { + crate::encoder::resources::wait_and_read_bitstream( + self.context.device(), + self.slots[self.current_slot].encode_fence, + self.slots[self.current_slot].query_pool, + self.slots[self.current_slot].bitstream_buffer_ptr, + )? + }; + self.slots[self.current_slot].in_flight = false; + let meta = self.slots[self.current_slot] + .pending_metadata + .take() + .ok_or_else(|| { + PixelForgeError::CommandBuffer( + "Drained slot has bitstream but no metadata; encoder state corrupted" + .to_string(), + ) + })?; + // AV1 always prefixes a Temporal Delimiter OBU; key frames also need + // the sequence header captured at submit time. + let mut data = vec![0x12, 0x00]; + if let Some(header) = meta.header { + data.extend_from_slice(&header); + } + data.extend_from_slice(&bitstream); + Ok(Some(EncodedPacket { + data, + frame_type: meta.frame_type, + is_key_frame: meta.is_key_frame, + pts: meta.pts, + dts: meta.dts, + })) } /// Internal method to encode the current frame already uploaded to input_image. 
@@ -49,7 +87,7 @@ impl AV1Encoder { &mut self, gop_position: &GopPosition, display_order: u64, - ) -> Result { + ) -> Result<()> { let is_key_frame = gop_position.frame_type.is_idr() || gop_position.frame_type == GopFrameType::I; let is_reference = gop_position.is_reference; @@ -75,38 +113,41 @@ impl AV1Encoder { } } - let mut encoded_data = Vec::new(); - - // AV1 Temporal Delimiter OBU: type=2, has_size=1, size=0. - // Required as the first OBU in each temporal unit for conformant bitstreams. - // This enables ffmpeg's AV1 demuxer to detect frame boundaries in raw OBU streams. - encoded_data.extend_from_slice(&[0x12, 0x00]); - - // For key frames, prepend the AV1 Sequence Header OBU. - // This is required for AV1 decoders to initialize (equivalent to H.265 VPS/SPS/PPS). - if is_key_frame { + // For key frames, capture the AV1 Sequence Header OBU to be prepended + // at drain time. (The Temporal Delimiter prefix is added in + // drain_current_slot for every frame.) + let header = if is_key_frame { if self.header_data.is_none() { - let header = self.get_av1_sequence_header()?; + let h = self.get_av1_sequence_header()?; debug!( "AV1 sequence header ({} bytes): {:02X?}", - header.len(), - &header[..std::cmp::min(32, header.len())] + h.len(), + &h[..std::cmp::min(32, h.len())] ); - self.header_data = Some(header); + self.header_data = Some(h); } - if let Some(ref header) = self.header_data { - encoded_data.extend_from_slice(header); - } - } + self.header_data.clone() + } else { + None + }; - encoded_data.extend_from_slice(&self.encode_frame_internal(gop_position, is_key_frame)?); + // Submit the encode (no wait, no readback). Marks the slot in_flight. + self.encode_frame_internal(gop_position, is_key_frame)?; - // Save the order_hint used during encoding BEFORE incrementing. 
let encoded_order_hint = self.order_hint; + let dts = self.encode_frame_num; self.encode_frame_num += 1; self.frame_num += 1; self.order_hint = (self.order_hint + 1) & 0xFF; // 8-bit order hint + self.slots[self.current_slot].pending_metadata = Some(super::SlotPacketMetadata { + frame_type, + is_key_frame, + pts: display_order, + dts, + header, + }); + // Only KEY frames are stored as references. P frames all reference the KEY frame // and don't update any reference buffer, avoiding P→P which produces corrupt output // on NVIDIA AV1 encoders. @@ -131,19 +172,25 @@ impl AV1Encoder { // P frames reuse the same scratch DPB slot (current_dpb_slot stays unchanged // between P frames since it's always different from the KEY frame's slot). - Ok(EncodedPacket { - data: encoded_data, - frame_type, - is_key_frame, - pts: display_order, - dts: self.encode_frame_num - 1, - }) + Ok(()) } - /// Flush the encoder and get any remaining packets. + /// Flush the encoder and drain any remaining in-flight slots. pub fn flush(&mut self) -> Result> { - // No buffered frames in the current implementation. - Ok(Vec::new()) + let mut out = Vec::new(); + for offset in 0..self.slots.len() { + let idx = (self.current_slot + offset) % self.slots.len(); + if !self.slots[idx].in_flight { + continue; + } + let saved_current = self.current_slot; + self.current_slot = idx; + if let Some(packet) = self.drain_current_slot()? { + out.push(packet); + } + self.current_slot = saved_current; + } + Ok(out) } /// Request that the next frame be an IDR/key frame. @@ -214,17 +261,16 @@ impl AV1Encoder { /// containing the updated color configuration. The next encoded frame will /// be a key frame with the new sequence header prepended. pub fn set_color_description(&mut self, desc: ColorDescription) -> Result<()> { - // Wait for any in-flight encode to complete before modifying session params. - // Do NOT reset the fence here — submit_encode_and_read_bitstream() resets it - // before queue_submit. 
Leaving the fence signaled allows consecutive - // set_color_description() calls without deadlock. + // Wait for ALL slot fences before modifying session params. Do NOT reset + // here; submit_encode_only resets each fence on submit. + let fences: Vec = self.slots.iter().map(|s| s.encode_fence).collect(); unsafe { self.context .device() - .wait_for_fences(&[self.encode_fence], true, u64::MAX) + .wait_for_fences(&fences, true, u64::MAX) .map_err(|e| { PixelForgeError::Synchronization(format!( - "Failed to wait for encode fence: {:?}", + "Failed to wait for encode fences: {:?}", e )) })?; diff --git a/src/encoder/av1/encode.rs b/src/encoder/av1/encode.rs index 9ac5fa1..c409228 100644 --- a/src/encoder/av1/encode.rs +++ b/src/encoder/av1/encode.rs @@ -3,18 +3,21 @@ use super::AV1Encoder; use crate::encoder::gop::GopPosition; use crate::encoder::resources::{ prepare_encode_command_buffer, record_dpb_barriers, record_post_encode_dpb_barrier, - submit_encode_and_read_bitstream, + submit_encode_only, }; use crate::error::{PixelForgeError, Result}; use ash::vk; use tracing::debug; impl AV1Encoder { + /// Records and submits the encode commands for a single frame to the + /// current slot. Does NOT wait for completion — see encoder::h265 for the + /// pipelining contract. pub(super) fn encode_frame_internal( &mut self, _gop_position: &GopPosition, is_key_frame: bool, - ) -> Result> { + ) -> Result<()> { // All frames need a setup reference slot (DPB write) per Vulkan spec when maxDpbSlots > 0. 
let is_reference = true; @@ -52,8 +55,8 @@ impl AV1Encoder { unsafe { prepare_encode_command_buffer( self.context.device(), - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, )?; } @@ -62,7 +65,7 @@ impl AV1Encoder { unsafe { record_dpb_barriers( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, false, // AV1 does not use layered DPB self.current_dpb_slot, @@ -418,8 +421,10 @@ impl AV1Encoder { }; unsafe { - self.video_queue_fn - .cmd_begin_video_coding(self.encode_command_buffer, &begin_coding_info); + self.video_queue_fn.cmd_begin_video_coding( + self.slots[self.current_slot].encode_command_buffer, + &begin_coding_info, + ); } // Reset video coding state for the first frame. @@ -440,8 +445,10 @@ impl AV1Encoder { (&quality_level_info as *const vk::VideoEncodeQualityLevelInfoKHR).cast(); unsafe { - self.video_queue_fn - .cmd_control_video_coding(self.encode_command_buffer, &control_info); + self.video_queue_fn.cmd_control_video_coding( + self.slots[self.current_slot].encode_command_buffer, + &control_info, + ); } } @@ -450,13 +457,13 @@ impl AV1Encoder { .coded_offset(vk::Offset2D { x: 0, y: 0 }) .coded_extent(frame_extent) .base_array_layer(0) - .image_view_binding(self.input_image_view); + .image_view_binding(self.slots[self.current_slot].input_image_view); let mut encode_info = vk::VideoEncodeInfoKHR::default() .src_picture_resource(src_picture_resource) - .dst_buffer(self.bitstream_buffer) + .dst_buffer(self.slots[self.current_slot].bitstream_buffer) .dst_buffer_offset(0) - .dst_buffer_range(self.bitstream_buffer_size as u64); + .dst_buffer_range(self.slots[self.current_slot].bitstream_buffer_size as u64); if is_reference { encode_info = encode_info.setup_reference_slot(&setup_reference_slot); @@ -471,30 +478,34 @@ impl AV1Encoder { // Begin query to capture encode feedback (bitstream size, 
status). unsafe { self.context.device().cmd_begin_query( - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, 0, vk::QueryControlFlags::empty(), ); } unsafe { - self.video_encode_fn - .cmd_encode_video(self.encode_command_buffer, &encode_info); + self.video_encode_fn.cmd_encode_video( + self.slots[self.current_slot].encode_command_buffer, + &encode_info, + ); } // End query. unsafe { - self.context - .device() - .cmd_end_query(self.encode_command_buffer, self.query_pool, 0); + self.context.device().cmd_end_query( + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, + 0, + ); } // Add DPB synchronization barrier after encoding. unsafe { record_post_encode_dpb_barrier( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, false, // AV1 does not use layered DPB self.current_dpb_slot, @@ -504,15 +515,17 @@ impl AV1Encoder { // End video coding. let end_coding_info = vk::VideoEndCodingInfoKHR::default(); unsafe { - self.video_queue_fn - .cmd_end_video_coding(self.encode_command_buffer, &end_coding_info); + self.video_queue_fn.cmd_end_video_coding( + self.slots[self.current_slot].encode_command_buffer, + &end_coding_info, + ); } // End command buffer. 
unsafe { self.context .device() - .end_command_buffer(self.encode_command_buffer) + .end_command_buffer(self.slots[self.current_slot].encode_command_buffer) } .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; @@ -531,22 +544,24 @@ impl AV1Encoder { let gpu_start = std::time::Instant::now(); - let encoded_data = unsafe { - submit_encode_and_read_bitstream( + unsafe { + submit_encode_only( self.context.device(), - self.encode_command_buffer, - self.encode_fence, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].encode_fence, encode_queue, - self.query_pool, - self.bitstream_buffer_ptr, - )? - }; + None, + )?; + } - debug!("GPU encode took {:?}", gpu_start.elapsed()); + debug!("Submitted encode (no wait): {:?}", gpu_start.elapsed()); // Mark current DPB slot as active. self.dpb_slot_active[self.current_dpb_slot as usize] = true; - Ok(encoded_data) + // Mark slot as in-flight; bitstream is drained on next encode() call. + self.slots[self.current_slot].in_flight = true; + + Ok(()) } } diff --git a/src/encoder/av1/init.rs b/src/encoder/av1/init.rs index 1a59b30..58c94f5 100644 --- a/src/encoder/av1/init.rs +++ b/src/encoder/av1/init.rs @@ -2,9 +2,10 @@ use super::{AV1Encoder, MIN_BITSTREAM_BUFFER_SIZE, SUPERBLOCK_SIZE}; use crate::encoder::gop::GopStructure; use crate::encoder::resources::{ - allocate_session_memory, clear_input_image, create_bitstream_buffer, create_command_resources, - create_dpb_images, create_image, get_video_format, make_codec_name, map_bitstream_buffer, - query_supported_video_formats, ClearImageParams, + allocate_session_memory, clear_input_image, clear_rgb_input_image, create_bitstream_buffer, + create_command_resources, create_dpb_images, create_image, get_video_format, make_codec_name, + map_bitstream_buffer, query_supported_video_formats, rgb_conversion_model, + rgb_conversion_range, rgb_input_format, ClearImageParams, }; use crate::encoder::{ColorDescription, PixelFormat}; use 
crate::error::{PixelForgeError, Result}; @@ -46,6 +47,15 @@ impl AV1Encoder { let video_encode_fn = ash::khr::video_encode_queue::Device::load(context.instance(), context.device()); + if config.use_rgb_input && !context.supports_rgb_direct_encode() { + return Err(PixelForgeError::NoSuitableDevice( + "EncodeConfig::use_rgb_input requires VK_VALVE_video_encode_rgb_conversion, \ + which this device does not support." + .to_string(), + )); + } + let use_rgb_input = config.use_rgb_input; + // Get chroma subsampling from pixel format. let chroma_subsampling: vk::VideoChromaSubsamplingFlagsKHR = config.pixel_format.into(); let luma_bit_depth: vk::VideoComponentBitDepthFlagsKHR = config.bit_depth.into(); @@ -62,8 +72,18 @@ impl AV1Encoder { // Preferred input format based on pixel format and bit depth. let preferred_src_format = get_video_format(config.pixel_format, config.bit_depth); - // Create AV1 encode profile. + // Create AV1 encode profile. When RGB-direct is enabled we chain + // VkVideoEncodeProfileRgbConversionInfoVALVE inside av1_profile_info + // so all downstream uses (capability query, session create, image + // creation, query pool) see a profile that matches. 
+ let mut rgb_conv_profile_info = vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut av1_profile_info = vk::VideoEncodeAV1ProfileInfoKHR::default().std_profile(profile); + if use_rgb_input { + av1_profile_info.p_next = (&mut rgb_conv_profile_info + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_info = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_AV1) @@ -182,7 +202,17 @@ impl AV1Encoder { info!("Supported SRC formats: {:?}", supported_src_formats); info!("Supported DPB formats: {:?}", supported_dpb_formats); - let picture_format = if supported_src_formats.contains(&preferred_src_format) { + let picture_format = if use_rgb_input { + let rgb_fmt = rgb_input_format(config.bit_depth); + if !supported_src_formats.contains(&rgb_fmt) { + return Err(PixelForgeError::NoSuitableDevice(format!( + "RGB-direct encode requested but driver does not advertise {:?} as a \ + VIDEO_ENCODE_SRC_KHR format for this AV1 profile. 
Supported: {:?}", + rgb_fmt, supported_src_formats + ))); + } + rgb_fmt + } else if supported_src_formats.contains(&preferred_src_format) { preferred_src_format } else { return Err(PixelForgeError::NoSuitableDevice(format!( @@ -191,11 +221,19 @@ impl AV1Encoder { ))); }; - let reference_picture_format = supported_dpb_formats - .iter() - .copied() - .find(|f| *f == picture_format) - .unwrap_or(supported_dpb_formats[0]); + let reference_picture_format = if use_rgb_input { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == preferred_src_format) + .unwrap_or(supported_dpb_formats[0]) + } else { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == picture_format) + .unwrap_or(supported_dpb_formats[0]) + }; debug!( "Selected formats: picture={:?}, reference={:?}", @@ -244,7 +282,18 @@ impl AV1Encoder { max_active_reference_pictures_supported ); - let session_create_info = vk::VideoSessionCreateInfoKHR::default() + let color_desc = config + .color_description + .unwrap_or(ColorDescription::bt709()); + + let mut session_rgb_conv_info = + vk::VideoEncodeSessionRgbConversionCreateInfoVALVE::default() + .rgb_model(rgb_conversion_model(&color_desc)) + .rgb_range(rgb_conversion_range(&color_desc)) + .x_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::COSITED_EVEN) + .y_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::MIDPOINT); + + let mut session_create_info = vk::VideoSessionCreateInfoKHR::default() .queue_family_index(encode_queue_family) .video_profile(&profile_info) .picture_format(picture_format) @@ -256,6 +305,11 @@ impl AV1Encoder { .max_dpb_slots(requested_dpb_slots as u32) .max_active_reference_pictures(target_active_refs as u32) .std_header_version(&std_header_version); + if use_rgb_input { + session_create_info.p_next = (&mut session_rgb_conv_info + as *mut vk::VideoEncodeSessionRgbConversionCreateInfoVALVE) + .cast(); + } let mut session = vk::VideoSessionKHR::null(); let result = unsafe { @@ -276,22 +330,7 @@ impl AV1Encoder { 
// Allocate session memory. let session_memory = allocate_session_memory(&context, session, &video_queue_fn)?; - let color_desc = config - .color_description - .unwrap_or(ColorDescription::bt709()); - - // Create input image. - let (input_image, input_image_memory, input_image_view) = create_image( - &context, - aligned_width, - aligned_height, - picture_format, - false, // is_dpb - &profile_info, - )?; - let input_image_layout = vk::ImageLayout::UNDEFINED; - - // Create DPB images. + // Create DPB images (shared across slots). let (dpb_images, dpb_image_memories, dpb_image_views) = create_dpb_images( &context, aligned_width, @@ -301,60 +340,122 @@ impl AV1Encoder { &profile_info, false, )?; - // Create bitstream buffer. + let bitstream_buffer_size = MIN_BITSTREAM_BUFFER_SIZE.max(width as usize * height as usize); - let (bitstream_buffer, bitstream_buffer_memory) = - create_bitstream_buffer(&context, bitstream_buffer_size, &profile_info)?; - // Map bitstream buffer persistently. - let bitstream_buffer_ptr = - map_bitstream_buffer(&context, bitstream_buffer_memory, bitstream_buffer_size)?; - // Create command resources. + + // Shared command pool / upload resources. let upload_queue_family = context.transfer_queue_family(); let cmd_resources = create_command_resources(&context, encode_queue_family, upload_queue_family)?; let command_pool = cmd_resources.command_pool; let upload_command_buffer = cmd_resources.upload_command_buffer; let upload_fence = cmd_resources.upload_fence; - let encode_command_buffer = cmd_resources.encode_command_buffer; - let encode_fence = cmd_resources.encode_fence; - // Clear the input image so padding between user dimensions and the - // aligned coded extent is zero-initialized. 
- clear_input_image( - &context, - &ClearImageParams { - command_buffer: upload_command_buffer, - fence: upload_fence, - queue: context.transfer_queue(), - image: input_image, - width: aligned_width, - height: aligned_height, - pixel_format: config.pixel_format, - bit_depth: config.bit_depth, - }, - )?; - // Create query pool for bitstream size queries. - // Need 1 query to capture bitstream offset and size. - // Need to provide profile info and feedback flags in pNext chain. - let mut query_feedback_info = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() - .encode_feedback_flags( - vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET - | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, - ); - query_feedback_info.p_next = (&profile_info as *const vk::VideoProfileInfoKHR).cast(); - - let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() - .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) - .query_count(1); - query_pool_create_info.p_next = - (&query_feedback_info as *const vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR).cast(); - - let query_pool = unsafe { - context - .device() - .create_query_pool(&query_pool_create_info, None) - .map_err(|e| PixelForgeError::QueryPool(e.to_string()))? + + // Allocate ENCODE_PIPELINE_DEPTH-1 additional encode command buffers. + let extra_buffers_needed = super::ENCODE_PIPELINE_DEPTH.saturating_sub(1) as u32; + let extra_encode_buffers: Vec = if extra_buffers_needed > 0 { + let alloc_info = vk::CommandBufferAllocateInfo::default() + .command_pool(command_pool) + .level(vk::CommandBufferLevel::PRIMARY) + .command_buffer_count(extra_buffers_needed); + unsafe { context.device().allocate_command_buffers(&alloc_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + } else { + Vec::new() }; + // Build per-slot resources. 
+ let mut slots: Vec = Vec::with_capacity(super::ENCODE_PIPELINE_DEPTH); + for slot_idx in 0..super::ENCODE_PIPELINE_DEPTH { + let (input_image, input_image_memory, input_image_view) = create_image( + &context, + aligned_width, + aligned_height, + picture_format, + false, + &profile_info, + )?; + + let (bitstream_buffer, bitstream_buffer_memory) = + create_bitstream_buffer(&context, bitstream_buffer_size, &profile_info)?; + let bitstream_buffer_ptr = + map_bitstream_buffer(&context, bitstream_buffer_memory, bitstream_buffer_size)?; + + if use_rgb_input { + clear_rgb_input_image( + &context, + upload_command_buffer, + upload_fence, + context.transfer_queue(), + input_image, + )?; + } else { + clear_input_image( + &context, + &ClearImageParams { + command_buffer: upload_command_buffer, + fence: upload_fence, + queue: context.transfer_queue(), + image: input_image, + width: aligned_width, + height: aligned_height, + pixel_format: config.pixel_format, + bit_depth: config.bit_depth, + }, + )?; + } + + let encode_command_buffer = if slot_idx == 0 { + cmd_resources.encode_command_buffer + } else { + extra_encode_buffers[slot_idx - 1] + }; + + let encode_fence = if slot_idx == 0 { + cmd_resources.encode_fence + } else { + let signaled = vk::FenceCreateInfo::default().flags(vk::FenceCreateFlags::SIGNALED); + unsafe { context.device().create_fence(&signaled, None) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + }; + + // Per-slot single-query pool. 
+ let mut query_feedback_info = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() + .encode_feedback_flags( + vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET + | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, + ); + query_feedback_info.p_next = (&profile_info as *const vk::VideoProfileInfoKHR).cast(); + let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() + .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) + .query_count(1); + query_pool_create_info.p_next = (&query_feedback_info + as *const vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) + .cast(); + let query_pool = unsafe { + context + .device() + .create_query_pool(&query_pool_create_info, None) + .map_err(|e| PixelForgeError::QueryPool(e.to_string()))? + }; + + slots.push(super::EncodeSlot { + input_image, + input_image_memory, + input_image_view, + input_image_layout: vk::ImageLayout::UNDEFINED, + bitstream_buffer, + bitstream_buffer_memory, + bitstream_buffer_size, + bitstream_buffer_ptr, + encode_command_buffer, + encode_fence, + query_pool, + in_flight: false, + pending_metadata: None, + }); + } + // Initialize GOP structure. 
let gop = GopStructure::new(config.gop_size, config.b_frame_count, config.gop_size); @@ -371,26 +472,17 @@ impl AV1Encoder { encode_frame_num: 0, frame_num: 0, order_hint: 0, - input_image, - input_image_memory, - input_image_view, - input_image_layout, + slots, + current_slot: 0, dpb_images, dpb_image_memories, dpb_image_views, dpb_slot_count: requested_dpb_slots, dpb_slot_active: vec![false; requested_dpb_slots], - bitstream_buffer, - bitstream_buffer_memory, - bitstream_buffer_size, - bitstream_buffer_ptr, command_pool, upload_command_pool: cmd_resources.upload_command_pool, upload_command_buffer, upload_fence, - encode_command_buffer, - encode_fence, - query_pool, header_data: None, current_dpb_slot: 0, references: Vec::new(), diff --git a/src/encoder/av1/mod.rs b/src/encoder/av1/mod.rs index 2edcde4..1bd0fea 100644 --- a/src/encoder/av1/mod.rs +++ b/src/encoder/av1/mod.rs @@ -23,6 +23,41 @@ const MIN_BITSTREAM_BUFFER_SIZE: usize = 2 * 1024 * 1024; /// AV1 superblock size in pixels (64x64, matching use_128x128_superblock=0 in the sequence header). pub const SUPERBLOCK_SIZE: u32 = 64; +/// Number of in-flight encode slots. Depth=2 lets frame N+1 begin encoding +/// while frame N is still on the encode hardware. +pub(crate) const ENCODE_PIPELINE_DEPTH: usize = 2; + +/// One slot's worth of per-frame encode resources. Mirrors encoder::h265::EncodeSlot. 
+pub(crate) struct EncodeSlot { + pub input_image: vk::Image, + pub input_image_memory: vk::DeviceMemory, + pub input_image_view: vk::ImageView, + pub input_image_layout: vk::ImageLayout, + + pub bitstream_buffer: vk::Buffer, + pub bitstream_buffer_memory: vk::DeviceMemory, + pub bitstream_buffer_size: usize, + pub bitstream_buffer_ptr: *mut u8, + + pub encode_command_buffer: vk::CommandBuffer, + pub encode_fence: vk::Fence, + pub query_pool: vk::QueryPool, + + pub in_flight: bool, + pub pending_metadata: Option, +} + +/// Metadata stashed at submit-time, returned with the bitstream when this +/// slot's encode is drained. +pub(crate) struct SlotPacketMetadata { + pub frame_type: crate::encoder::FrameType, + pub is_key_frame: bool, + pub pts: u64, + pub dts: u64, + /// AV1 sequence header OBU to prepend (Some only for IDR/key frames). + pub header: Option>, +} + #[derive(Clone, Copy, Debug)] pub(crate) struct ReferenceInfo { pub dpb_slot: u8, @@ -49,12 +84,10 @@ pub struct AV1Encoder { frame_num: u32, order_hint: u32, - // Resources - input_image: vk::Image, - input_image_memory: vk::DeviceMemory, - input_image_view: vk::ImageView, - /// Current Vulkan image layout of `input_image` (tracked to avoid UB when transitioning). - input_image_layout: vk::ImageLayout, + /// Per-frame encode slots. See encoder::h265 for invariants. + pub(crate) slots: Vec, + pub(crate) current_slot: usize, + /// DPB images for reference frames. dpb_images: Vec, dpb_image_memories: Vec, @@ -63,21 +96,14 @@ pub struct AV1Encoder { dpb_slot_count: usize, /// Whether each DPB slot has been activated (written to at least once). dpb_slot_active: Vec, - bitstream_buffer: vk::Buffer, - bitstream_buffer_memory: vk::DeviceMemory, - /// Size of the allocated bitstream buffer in bytes. - bitstream_buffer_size: usize, - /// Persistently mapped pointer to the bitstream buffer (avoids per-frame map/unmap). - bitstream_buffer_ptr: *mut u8, - - // Command resources. 
+ + // Command pool (encode command buffers per slot allocated from this pool). command_pool: vk::CommandPool, upload_command_pool: vk::CommandPool, upload_command_buffer: vk::CommandBuffer, upload_fence: vk::Fence, - encode_command_buffer: vk::CommandBuffer, - encode_fence: vk::Fence, - query_pool: vk::QueryPool, + + // NOTE(review): stray comment — no semaphore field follows in this struct; remove it or add the field it describes. // Cached AV1 sequence header OBU (retrieved from session parameters). header_data: Option>, @@ -97,7 +123,8 @@ impl AV1Encoder { /// encoder's configured pixel format and dimensions, and should be in /// GENERAL layout. fn upload_from_image(&mut self, src_image: vk::Image) -> Result<()> { - if src_image == self.input_image { + let slot = &mut self.slots[self.current_slot]; + if src_image == slot.input_image { debug!("Source image is the encoder's input image, skipping upload copy"); return Ok(()); } @@ -106,18 +133,17 @@ upload_command_buffer: self.upload_command_buffer, upload_fence: self.upload_fence, src_image, - dst_image: self.input_image, + dst_image: slot.input_image, width: self.config.dimensions.width, height: self.config.dimensions.height, pixel_format: self.config.pixel_format, - input_image_layout: self.input_image_layout, + input_image_layout: slot.input_image_layout, upload_queue: self.context.transfer_queue(), }; upload_image_to_input(&self.context, ¶ms)?; - // Update tracked layout. 
- self.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; + slot.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; Ok(()) } @@ -130,48 +156,33 @@ unsafe impl Send for AV1Encoder {} impl Drop for AV1Encoder { fn drop(&mut self) { unsafe { - let _ = self.context.device().device_wait_idle(); - self.context - .device() - .destroy_query_pool(self.query_pool, None); - self.context.device().destroy_fence(self.upload_fence, None); - self.context.device().destroy_fence(self.encode_fence, None); - self.context - .device() - .destroy_command_pool(self.command_pool, None); + let device = self.context.device(); + let _ = device.device_wait_idle(); + + for slot in &mut self.slots { + if !slot.bitstream_buffer_ptr.is_null() { + device.unmap_memory(slot.bitstream_buffer_memory); + slot.bitstream_buffer_ptr = std::ptr::null_mut(); + } + device.destroy_query_pool(slot.query_pool, None); + device.destroy_fence(slot.encode_fence, None); + device.destroy_buffer(slot.bitstream_buffer, None); + device.free_memory(slot.bitstream_buffer_memory, None); + device.destroy_image_view(slot.input_image_view, None); + device.destroy_image(slot.input_image, None); + device.free_memory(slot.input_image_memory, None); + } + + device.destroy_fence(self.upload_fence, None); + device.destroy_command_pool(self.command_pool, None); if self.upload_command_pool != self.command_pool { - self.context - .device() - .destroy_command_pool(self.upload_command_pool, None); + device.destroy_command_pool(self.upload_command_pool, None); } - self.context - .device() - .destroy_buffer(self.bitstream_buffer, None); - // Unmap the persistently mapped bitstream buffer before freeing memory. 
- self.context - .device() - .unmap_memory(self.bitstream_buffer_memory); - self.context - .device() - .free_memory(self.bitstream_buffer_memory, None); - self.context - .device() - .destroy_image_view(self.input_image_view, None); - self.context.device().destroy_image(self.input_image, None); - self.context - .device() - .free_memory(self.input_image_memory, None); for i in 0..self.dpb_images.len() { - self.context - .device() - .destroy_image_view(self.dpb_image_views[i], None); - self.context - .device() - .destroy_image(self.dpb_images[i], None); - self.context - .device() - .free_memory(self.dpb_image_memories[i], None); + device.destroy_image_view(self.dpb_image_views[i], None); + device.destroy_image(self.dpb_images[i], None); + device.free_memory(self.dpb_image_memories[i], None); } if self.session_params != vk::VideoSessionParametersKHR::null() { @@ -181,7 +192,7 @@ impl Drop for AV1Encoder { self.video_queue_fn .destroy_video_session(self.session, None); for mem in &self.session_memory { - self.context.device().free_memory(*mem, None); + device.free_memory(*mem, None); } } } diff --git a/src/encoder/h264/api.rs b/src/encoder/h264/api.rs index f41a080..71e6299 100644 --- a/src/encoder/h264/api.rs +++ b/src/encoder/h264/api.rs @@ -14,36 +14,71 @@ impl H264Encoder { /// This image can be used as a target for `ColorConverter::convert` to avoid /// an intermediate copy. pub fn input_image(&self) -> vk::Image { - self.input_image + self.slots[self.current_slot].input_image } - /// Encode a frame from a GPU image. + /// Encode a frame from a GPU image (depth-2 pipelined). /// - /// This accepts a source NV12 image on the GPU and encodes it directly without. - /// any CPU-side data copies. The source image must be in NV12 format with the - /// same dimensions as the encoder configuration, and should be in GENERAL layout. 
+ /// Submits the frame to the encode queue without waiting, drains the + /// previous in-flight frame from the slot we are about to overwrite, + /// and returns *that* drained frame's packet. The first call returns + /// an empty Vec (pipeline still filling); subsequent calls return one + /// packet per call. Use `flush()` to drain remaining slots at end of stream. /// /// # Panics /// - /// The encoder will panic at creation time if B-frames are enabled (b_frame_count > 0), - /// as B-frame encoding is not yet supported. + /// The encoder will panic at creation time if B-frames are enabled + /// (b_frame_count > 0), as B-frame encoding is not yet supported. pub fn encode(&mut self, src_image: vk::Image) -> Result> { + let prev_packet = self.drain_current_slot()?; + let gop_position = self.gop.get_next_frame(); let display_order = self.input_frame_num; self.input_frame_num += 1; debug!( - "Encoding frame {} from GPU image: type={:?}, poc={}", - display_order, gop_position.frame_type, gop_position.pic_order_cnt + "Encoding frame {} from GPU image: type={:?}, poc={}, slot={}", + display_order, gop_position.frame_type, gop_position.pic_order_cnt, self.current_slot ); - // Upload from GPU image. self.upload_from_image(src_image)?; + self.encode_current_frame(&gop_position, display_order)?; - // Encode immediately. - let packet = self.encode_current_frame(&gop_position, display_order)?; + self.current_slot = (self.current_slot + 1) % self.slots.len(); + Ok(prev_packet.into_iter().collect()) + } - Ok(vec![packet]) + fn drain_current_slot(&mut self) -> Result> { + if !self.slots[self.current_slot].in_flight { + return Ok(None); + } + let bitstream = unsafe { + crate::encoder::resources::wait_and_read_bitstream( + self.context.device(), + self.slots[self.current_slot].encode_fence, + self.slots[self.current_slot].query_pool, + self.slots[self.current_slot].bitstream_buffer_ptr, + )? 
+ }; + self.slots[self.current_slot].in_flight = false; + let meta = self.slots[self.current_slot] + .pending_metadata + .take() + .ok_or_else(|| { + PixelForgeError::CommandBuffer( + "Drained slot has bitstream but no metadata; encoder state corrupted" + .to_string(), + ) + })?; + let mut data = meta.header.unwrap_or_default(); + data.extend_from_slice(&bitstream); + Ok(Some(EncodedPacket { + data, + frame_type: meta.frame_type, + is_key_frame: meta.is_key_frame, + pts: meta.pts, + dts: meta.dts, + })) } /// Internal method to encode the current frame already uploaded to input_image. @@ -51,7 +86,7 @@ impl H264Encoder { &mut self, gop_position: &GopPosition, display_order: u64, - ) -> Result { + ) -> Result<()> { let is_idr = gop_position.frame_type.is_idr(); let is_reference = gop_position.is_reference; let is_b_frame = gop_position.frame_type == GopFrameType::B; @@ -86,24 +121,33 @@ impl H264Encoder { let pic_order_cnt = gop_position.pic_order_cnt; let frame_num = self.frame_num_syntax; - let mut encoded_data = Vec::new(); - if is_idr { - encoded_data.extend_from_slice(&self.get_h264_header()?); + // For IDR frames, capture SPS/PPS header to be prepended at drain time. + let header = if is_idr { + let h = self.get_h264_header()?; self.sps_written = true; - } + Some(h) + } else { + None + }; - encoded_data.extend_from_slice(&self.encode_frame_internal( - gop_position, - frame_num, - pic_order_cnt, - is_idr, - )?); + // Submit the encode (no wait, no readback). Marks the slot in_flight. + self.encode_frame_internal(gop_position, frame_num, pic_order_cnt, is_idr)?; + let dts = self.encode_frame_num; self.encode_frame_num += 1; if is_reference && !is_b_frame { self.frame_num_syntax = (self.frame_num_syntax + 1) % 256; } + // Stash metadata so drain_current_slot() can build the packet later. 
+ self.slots[self.current_slot].pending_metadata = Some(super::SlotPacketMetadata { + frame_type, + is_key_frame: is_idr, + pts: display_order, + dts, + header, + }); + if is_reference { let pic_type = if is_idr { PictureType::Idr @@ -146,19 +190,25 @@ impl H264Encoder { } } - Ok(EncodedPacket { - data: encoded_data, - frame_type, - is_key_frame: is_idr, - pts: display_order, - dts: self.encode_frame_num - 1, - }) + Ok(()) } - /// Flush the encoder and get any remaining packets. + /// Flush the encoder and drain any remaining in-flight slots. pub fn flush(&mut self) -> Result> { - // No buffered frames in the current implementation. - Ok(Vec::new()) + let mut out = Vec::new(); + for offset in 0..self.slots.len() { + let idx = (self.current_slot + offset) % self.slots.len(); + if !self.slots[idx].in_flight { + continue; + } + let saved_current = self.current_slot; + self.current_slot = idx; + if let Some(packet) = self.drain_current_slot()? { + out.push(packet); + } + self.current_slot = saved_current; + } + Ok(out) } /// Request that the next frame be an IDR frame. @@ -244,17 +294,17 @@ impl H264Encoder { /// updated VUI color primaries, transfer characteristics, and matrix coefficients. /// The next encoded frame will be an IDR with the new SPS/PPS prepended. pub fn set_color_description(&mut self, desc: ColorDescription) -> Result<()> { - // Wait for any in-flight encode to complete before modifying session params. - // Do NOT reset the fence here — submit_encode_and_read_bitstream() resets it - // before queue_submit. Leaving the fence signaled allows consecutive - // set_color_description() calls without deadlock. + // Wait for ALL slot fences before modifying session params. Do NOT reset + // here; submit_encode_only resets each fence on submit so leaving them + // signaled lets consecutive set_color_description() calls work safely. 
+ let fences: Vec = self.slots.iter().map(|s| s.encode_fence).collect(); unsafe { self.context .device() - .wait_for_fences(&[self.encode_fence], true, u64::MAX) + .wait_for_fences(&fences, true, u64::MAX) .map_err(|e| { PixelForgeError::Synchronization(format!( - "Failed to wait for encode fence: {:?}", + "Failed to wait for encode fences: {:?}", e )) })?; diff --git a/src/encoder/h264/encode.rs b/src/encoder/h264/encode.rs index dd0d85f..fc5e153 100644 --- a/src/encoder/h264/encode.rs +++ b/src/encoder/h264/encode.rs @@ -2,7 +2,7 @@ use super::H264Encoder; use crate::encoder::gop::{GopFrameType, GopPosition}; use crate::encoder::resources::{ - prepare_encode_command_buffer, record_dpb_barriers, submit_encode_and_read_bitstream, + prepare_encode_command_buffer, record_dpb_barriers, submit_encode_only, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::error::{PixelForgeError, Result}; @@ -10,13 +10,18 @@ use ash::vk; use tracing::debug; impl H264Encoder { + /// Records and submits the encode commands for a single frame to the + /// current slot. Does NOT wait for completion or read the bitstream — + /// the caller drains the slot's prior in-flight encode before calling + /// this, and the slot is marked in_flight so a later call can drain the + /// submission made here. 
pub(super) fn encode_frame_internal( &mut self, gop_position: &GopPosition, frame_num: u32, pic_order_cnt: i32, is_idr: bool, - ) -> Result> { + ) -> Result<()> { let is_b_frame = gop_position.frame_type == GopFrameType::B; let is_reference = gop_position.is_reference; @@ -55,8 +60,8 @@ impl H264Encoder { unsafe { prepare_encode_command_buffer( self.context.device(), - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, )?; } @@ -65,7 +70,7 @@ impl H264Encoder { unsafe { record_dpb_barriers( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, self.use_layered_dpb, self.current_dpb_slot, @@ -263,7 +268,7 @@ impl H264Encoder { height: self.aligned_height, }) .base_array_layer(0) - .image_view_binding(self.input_image_view); + .image_view_binding(self.slots[self.current_slot].input_image_view); // Set up DPB slot for reconstructed picture (setup slot) let setup_picture_resource = vk::VideoPictureResourceInfoKHR::default() @@ -431,7 +436,7 @@ impl H264Encoder { } let mut encode_info = vk::VideoEncodeInfoKHR::default() - .dst_buffer(self.bitstream_buffer) + .dst_buffer(self.slots[self.current_slot].bitstream_buffer) .dst_buffer_offset(0) .dst_buffer_range(MIN_BITSTREAM_BUFFER_SIZE as vk::DeviceSize) .src_picture_resource(src_picture_resource) @@ -537,7 +542,7 @@ impl H264Encoder { unsafe { (self.video_queue_fn.fp().cmd_begin_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &begin_info, ); } @@ -561,7 +566,7 @@ impl H264Encoder { unsafe { (self.video_queue_fn.fp().cmd_control_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &control_info, ); } @@ -570,8 +575,8 @@ impl H264Encoder { // Begin query. 
unsafe { self.context.device().cmd_begin_query( - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, 0, vk::QueryControlFlags::empty(), ); @@ -580,23 +585,25 @@ impl H264Encoder { // Encode unsafe { (self.video_encode_fn.fp().cmd_encode_video_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &encode_info, ); } // End query. unsafe { - self.context - .device() - .cmd_end_query(self.encode_command_buffer, self.query_pool, 0); + self.context.device().cmd_end_query( + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, + 0, + ); } // End video coding. let end_info = vk::VideoEndCodingInfoKHR::default(); unsafe { (self.video_queue_fn.fp().cmd_end_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &end_info, ); } @@ -605,7 +612,7 @@ impl H264Encoder { unsafe { self.context .device() - .end_command_buffer(self.encode_command_buffer) + .end_command_buffer(self.slots[self.current_slot].encode_command_buffer) } .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; @@ -614,20 +621,23 @@ impl H264Encoder { PixelForgeError::NoSuitableDevice("No video encode queue available".to_string()) })?; - let encoded_data = unsafe { - submit_encode_and_read_bitstream( + unsafe { + submit_encode_only( self.context.device(), - self.encode_command_buffer, - self.encode_fence, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].encode_fence, encode_queue, - self.query_pool, - self.bitstream_buffer_ptr, - )? - }; + None, + )?; + } // Mark DPB slot as active. self.dpb_slot_active[self.current_dpb_slot as usize] = true; - Ok(encoded_data) + // Mark slot as in-flight; bitstream is drained on the next encode() + // call that targets this slot. 
+ self.slots[self.current_slot].in_flight = true; + + Ok(()) } } diff --git a/src/encoder/h264/init.rs b/src/encoder/h264/init.rs index 1ccb373..82314a3 100644 --- a/src/encoder/h264/init.rs +++ b/src/encoder/h264/init.rs @@ -3,9 +3,10 @@ use super::{H264Encoder, MB_SIZE}; use crate::encoder::dpb::{DecodedPictureBuffer, DecodedPictureBufferTrait, DpbConfig}; use crate::encoder::gop::GopStructure; use crate::encoder::resources::{ - align_up, allocate_session_memory, clear_input_image, create_bitstream_buffer, - create_command_resources, create_dpb_images, create_image, get_video_format, lcm, - map_bitstream_buffer, query_supported_video_formats, ClearImageParams, + align_up, allocate_session_memory, clear_input_image, clear_rgb_input_image, + create_bitstream_buffer, create_command_resources, create_dpb_images, create_image, + get_video_format, lcm, map_bitstream_buffer, query_supported_video_formats, + rgb_conversion_model, rgb_conversion_range, rgb_input_format, ClearImageParams, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::encoder::ColorDescription; @@ -42,6 +43,15 @@ impl H264Encoder { let video_encode_fn = ash::khr::video_encode_queue::Device::load(context.instance(), context.device()); + if config.use_rgb_input && !context.supports_rgb_direct_encode() { + return Err(PixelForgeError::NoSuitableDevice( + "EncodeConfig::use_rgb_input requires VK_VALVE_video_encode_rgb_conversion, \ + which this device does not support." + .to_string(), + )); + } + let use_rgb_input = config.use_rgb_input; + // Get chroma subsampling from pixel format via `From` impl. let chroma_subsampling: vk::VideoChromaSubsamplingFlagsKHR = config.pixel_format.into(); @@ -62,9 +72,19 @@ impl H264Encoder { // Note: the DPB format may differ and must be queried separately. let preferred_src_format = get_video_format(config.pixel_format, config.bit_depth); - // Create H.264 encode profile. + // Create H.264 encode profile. 
When RGB-direct is enabled we chain + // VkVideoEncodeProfileRgbConversionInfoVALVE on every profile we + // build (capability query, image creation, query pool) — profiles + // must match across all of those. + let mut rgb_conv_profile_info = vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h264_profile_info = vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h264_profile_info.p_next = (&mut rgb_conv_profile_info + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_info = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) @@ -249,8 +269,21 @@ impl H264Encoder { } info!("Supported DPB formats: {:?}", supported_dpb_formats); - // For input uploads, we currently require the preferred 2-plane formats. - let picture_format = if supported_src_formats.contains(&preferred_src_format) { + // For input uploads, we currently require the preferred 2-plane + // formats — unless RGB-direct is enabled, in which case the SRC + // image must be one of the RGB formats VCN5 accepts and the DPB + // stays YUV. + let picture_format = if use_rgb_input { + let rgb_fmt = rgb_input_format(config.bit_depth); + if !supported_src_formats.contains(&rgb_fmt) { + return Err(PixelForgeError::NoSuitableDevice(format!( + "RGB-direct encode requested but driver does not advertise {:?} as a \ + VIDEO_ENCODE_SRC_KHR format for this profile. Supported: {:?}", + rgb_fmt, supported_src_formats + ))); + } + rgb_fmt + } else if supported_src_formats.contains(&preferred_src_format) { preferred_src_format } else { return Err(PixelForgeError::NoSuitableDevice(format!( @@ -259,12 +292,22 @@ impl H264Encoder { ))); }; - // DPB format can differ from the input format; prefer matching when possible. 
- let reference_picture_format = supported_dpb_formats - .iter() - .copied() - .find(|f| *f == picture_format) - .unwrap_or(supported_dpb_formats[0]); + // DPB format can differ from the input format; in RGB-direct mode + // DPB stays YUV (matching the encoder's internal pixel_format/ + // bit_depth), otherwise prefer matching the picture_format. + let reference_picture_format = if use_rgb_input { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == preferred_src_format) + .unwrap_or(supported_dpb_formats[0]) + } else { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == picture_format) + .unwrap_or(supported_dpb_formats[0]) + }; debug!( "Selected Vulkan Video formats: picture_format={:?}, reference_picture_format={:?} (preferred_src={:?})", @@ -333,7 +376,18 @@ impl H264Encoder { PixelForgeError::NoSuitableDevice("No video encode queue family available".to_string()) })?; - let session_create_info = vk::VideoSessionCreateInfoKHR::default() + let color_desc = config + .color_description + .unwrap_or(ColorDescription::bt709()); + + let mut session_rgb_conv_info = + vk::VideoEncodeSessionRgbConversionCreateInfoVALVE::default() + .rgb_model(rgb_conversion_model(&color_desc)) + .rgb_range(rgb_conversion_range(&color_desc)) + .x_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::COSITED_EVEN) + .y_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::MIDPOINT); + + let mut session_create_info = vk::VideoSessionCreateInfoKHR::default() .queue_family_index(encode_queue_family) .flags(vk::VideoSessionCreateFlagsKHR::empty()) .video_profile(&profile_info) @@ -346,6 +400,11 @@ impl H264Encoder { .max_dpb_slots(dpb_slot_count as u32) .max_active_reference_pictures(max_active_reference_pictures as u32) .std_header_version(&std_header_version); + if use_rgb_input { + session_create_info.p_next = (&mut session_rgb_conv_info + as *mut vk::VideoEncodeSessionRgbConversionCreateInfoVALVE) + .cast(); + } let mut session = vk::VideoSessionKHR::null(); let 
result = unsafe { @@ -403,13 +462,17 @@ impl H264Encoder { ))); } - let color_desc = config - .color_description - .unwrap_or(ColorDescription::bt709()); - - // Create profile info for images/buffers. + // Create profile info for images/buffers (shared across slots). + let mut rgb_conv_profile_for_resources = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h264_profile_for_resources = vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h264_profile_for_resources.p_next = (&mut rgb_conv_profile_for_resources + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_for_resources = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) .chroma_subsampling(chroma_subsampling) @@ -418,16 +481,6 @@ impl H264Encoder { profile_for_resources.p_next = (&mut h264_profile_for_resources as *mut vk::VideoEncodeH264ProfileInfoKHR).cast(); - // Create input image. - let (input_image, input_image_memory, input_image_view) = create_image( - &context, - aligned_width, - aligned_height, - picture_format, - false, - &profile_for_resources, - )?; - // Determine DPB mode: use layered DPB when the driver does not advertise // support for separate reference images (required for AMD RADV). let supports_separate_dpb = capabilities @@ -438,7 +491,7 @@ impl H264Encoder { info!("Using layered DPB (driver does not support separate reference images)"); } - // Create DPB images. + // Create DPB images (shared across slots). let (dpb_images, dpb_image_memories, dpb_image_views) = create_dpb_images( &context, aligned_width, @@ -449,77 +502,140 @@ impl H264Encoder { use_layered_dpb, )?; - // Create bitstream buffer. 
- let (bitstream_buffer, bitstream_buffer_memory) = - create_bitstream_buffer(&context, MIN_BITSTREAM_BUFFER_SIZE, &profile_for_resources)?; - - // Persistently map the bitstream buffer to avoid per-frame map/unmap overhead. - let bitstream_buffer_ptr = - map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; - - // Create command pool, buffers, and fences. - // Use the transfer queue family for upload commands when the encode queue - // doesn't support transfer operations (AMD RADV). + // Create command pool and shared upload resources. Encode command + // buffers (one per slot) are allocated below from `command_pool`. let upload_queue_family = context.transfer_queue_family(); let cmd_resources = create_command_resources(&context, encode_queue_family, upload_queue_family)?; let command_pool = cmd_resources.command_pool; let upload_command_pool = cmd_resources.upload_command_pool; let upload_command_buffer = cmd_resources.upload_command_buffer; - let encode_command_buffer = cmd_resources.encode_command_buffer; let upload_fence = cmd_resources.upload_fence; - let encode_fence = cmd_resources.encode_fence; - // Clear the input image so padding between user dimensions and the - // aligned coded extent is zero-initialized. - clear_input_image( - &context, - &ClearImageParams { - command_buffer: upload_command_buffer, - fence: upload_fence, - queue: context.transfer_queue(), - image: input_image, - width: aligned_width, - height: aligned_height, - pixel_format: config.pixel_format, - bit_depth: config.bit_depth, - }, - )?; - - // Create query pool. 
- let mut h264_profile_info_query = - vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); - - let mut profile_info_query = vk::VideoProfileInfoKHR::default() - .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) - .chroma_subsampling(chroma_subsampling) - .luma_bit_depth(luma_bit_depth) - .chroma_bit_depth(chroma_bit_depth); - profile_info_query.p_next = - (&mut h264_profile_info_query as *mut vk::VideoEncodeH264ProfileInfoKHR).cast(); - - let mut encode_feedback_create = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() - .encode_feedback_flags( - vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET - | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, - ); - - encode_feedback_create.p_next = - (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + // Allocate ENCODE_PIPELINE_DEPTH-1 additional encode command buffers. + let extra_buffers_needed = super::ENCODE_PIPELINE_DEPTH.saturating_sub(1) as u32; + let extra_encode_buffers: Vec = if extra_buffers_needed > 0 { + let alloc_info = vk::CommandBufferAllocateInfo::default() + .command_pool(command_pool) + .level(vk::CommandBufferLevel::PRIMARY) + .command_buffer_count(extra_buffers_needed); + unsafe { context.device().allocate_command_buffers(&alloc_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + } else { + Vec::new() + }; - let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() - .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) - .query_count(1); - query_pool_create_info.p_next = (&mut encode_feedback_create - as *mut vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) - .cast(); + // Build per-slot resources. 
+ let mut slots: Vec = Vec::with_capacity(super::ENCODE_PIPELINE_DEPTH); + for slot_idx in 0..super::ENCODE_PIPELINE_DEPTH { + let (input_image, input_image_memory, input_image_view) = create_image( + &context, + aligned_width, + aligned_height, + picture_format, + false, + &profile_for_resources, + )?; + + let (bitstream_buffer, bitstream_buffer_memory) = create_bitstream_buffer( + &context, + MIN_BITSTREAM_BUFFER_SIZE, + &profile_for_resources, + )?; + let bitstream_buffer_ptr = + map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; + + if use_rgb_input { + clear_rgb_input_image( + &context, + upload_command_buffer, + upload_fence, + context.transfer_queue(), + input_image, + )?; + } else { + clear_input_image( + &context, + &ClearImageParams { + command_buffer: upload_command_buffer, + fence: upload_fence, + queue: context.transfer_queue(), + image: input_image, + width: aligned_width, + height: aligned_height, + pixel_format: config.pixel_format, + bit_depth: config.bit_depth, + }, + )?; + } - let query_pool = unsafe { - context - .device() - .create_query_pool(&query_pool_create_info, None) + let encode_command_buffer = if slot_idx == 0 { + cmd_resources.encode_command_buffer + } else { + extra_encode_buffers[slot_idx - 1] + }; + + let encode_fence = if slot_idx == 0 { + cmd_resources.encode_fence + } else { + let signaled = vk::FenceCreateInfo::default().flags(vk::FenceCreateFlags::SIGNALED); + unsafe { context.device().create_fence(&signaled, None) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + }; + + // Per-slot single-query pool. 
+ let mut rgb_conv_profile_query = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); + let mut h264_profile_info_query = + vk::VideoEncodeH264ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h264_profile_info_query.p_next = (&mut rgb_conv_profile_query + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } + let mut profile_info_query = vk::VideoProfileInfoKHR::default() + .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H264) + .chroma_subsampling(chroma_subsampling) + .luma_bit_depth(luma_bit_depth) + .chroma_bit_depth(chroma_bit_depth); + profile_info_query.p_next = + (&mut h264_profile_info_query as *mut vk::VideoEncodeH264ProfileInfoKHR).cast(); + let mut encode_feedback_create = + vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default().encode_feedback_flags( + vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET + | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, + ); + encode_feedback_create.p_next = + (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() + .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) + .query_count(1); + query_pool_create_info.p_next = (&mut encode_feedback_create + as *mut vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) + .cast(); + let query_pool = unsafe { + context + .device() + .create_query_pool(&query_pool_create_info, None) + } + .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; + + slots.push(super::EncodeSlot { + input_image, + input_image_memory, + input_image_view, + input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + bitstream_buffer, + bitstream_buffer_memory, + bitstream_buffer_ptr, + encode_command_buffer, + encode_fence, + query_pool, + in_flight: false, + pending_metadata: None, + }); } - .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; // Create DPB and GOP structure. 
// The DPB size should match the actual number of allocated DPB slots. @@ -565,10 +681,8 @@ impl H264Encoder { encode_frame_num: 0, frame_num_syntax: 0, idr_pic_id: 0, - input_image, - input_image_memory, - input_image_view, - input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + slots, + current_slot: 0, dpb_images, dpb_image_memories, dpb_image_views, @@ -578,16 +692,10 @@ impl H264Encoder { current_dpb_slot: 0, l0_references: Vec::new(), active_reference_count: max_active_reference_pictures as u32, - bitstream_buffer, - bitstream_buffer_memory, - bitstream_buffer_ptr, command_pool, upload_command_pool, upload_command_buffer, upload_fence, - encode_command_buffer, - encode_fence, - query_pool, sps_written: false, // has_reference: false, // removed // reference_frame_num: 0, // removed diff --git a/src/encoder/h264/mod.rs b/src/encoder/h264/mod.rs index 3d339d3..4be9a15 100644 --- a/src/encoder/h264/mod.rs +++ b/src/encoder/h264/mod.rs @@ -10,9 +10,7 @@ mod session_params; use ash::vk; use tracing::debug; -use crate::encoder::resources::{ - destroy_encoder_resources, upload_image_to_input, EncoderResources, UploadParams, -}; +use crate::encoder::resources::{upload_image_to_input, UploadParams}; use crate::error::Result; use crate::encoder::dpb::DecodedPictureBuffer; @@ -23,6 +21,42 @@ use crate::vulkan::VideoContext; /// H.264 macroblock size in pixels. pub const MB_SIZE: u32 = 16; +/// Number of in-flight encode slots. Depth=2 lets frame N+1 begin encoding +/// while frame N is still on the encode hardware, so the per-frame budget +/// becomes 2 × frame_interval (16.6ms at 120fps) instead of 1 ×. +pub(crate) const ENCODE_PIPELINE_DEPTH: usize = 2; + +/// One slot's worth of per-frame encode resources. Mirrors the H.265 design +/// (see encoder::h265::EncodeSlot). 
+pub(crate) struct EncodeSlot { + pub input_image: vk::Image, + pub input_image_memory: vk::DeviceMemory, + pub input_image_view: vk::ImageView, + pub input_image_layout: vk::ImageLayout, + + pub bitstream_buffer: vk::Buffer, + pub bitstream_buffer_memory: vk::DeviceMemory, + pub bitstream_buffer_ptr: *mut u8, + + pub encode_command_buffer: vk::CommandBuffer, + pub encode_fence: vk::Fence, + pub query_pool: vk::QueryPool, + + pub in_flight: bool, + pub pending_metadata: Option, +} + +/// Metadata stashed at submit-time, returned with the bitstream when this +/// slot's encode is drained on a later encode() call. +pub(crate) struct SlotPacketMetadata { + pub frame_type: crate::encoder::FrameType, + pub is_key_frame: bool, + pub pts: u64, + pub dts: u64, + /// SPS/PPS header to prepend (Some only on first IDR). + pub header: Option>, +} + #[derive(Clone, Copy, Debug)] pub(crate) struct ReferenceInfo { pub dpb_slot: u8, @@ -55,12 +89,10 @@ pub struct H264Encoder { frame_num_syntax: u32, idr_pic_id: u32, - // Resources - input_image: vk::Image, - input_image_memory: vk::DeviceMemory, - input_image_view: vk::ImageView, - /// Current Vulkan image layout of `input_image` (tracked to avoid UB when transitioning). - input_image_layout: vk::ImageLayout, + /// Per-frame encode slots. See encoder::h265 for invariants. + pub(crate) slots: Vec, + pub(crate) current_slot: usize, + /// DPB images (up to MAX_DPB_SLOTS for B-frame and long-term reference support). dpb_images: Vec, dpb_image_memories: Vec, @@ -71,19 +103,12 @@ pub struct H264Encoder { use_layered_dpb: bool, /// Tracks which DPB slots have been activated (used at least once). dpb_slot_active: Vec, - bitstream_buffer: vk::Buffer, - bitstream_buffer_memory: vk::DeviceMemory, - /// Persistently mapped pointer to the bitstream buffer (avoids per-frame map/unmap). - bitstream_buffer_ptr: *mut u8, - // Command resources. + // Command pool (encode command buffers per slot allocated from this pool). 
command_pool: vk::CommandPool, upload_command_pool: vk::CommandPool, upload_command_buffer: vk::CommandBuffer, upload_fence: vk::Fence, - encode_command_buffer: vk::CommandBuffer, - encode_fence: vk::Fence, - query_pool: vk::QueryPool, // SPS/PPS written flag. sps_written: bool, @@ -117,7 +142,8 @@ impl H264Encoder { /// with the same dimensions as the encoder configuration. The source image /// should be in GENERAL layout. fn upload_from_image(&mut self, src_image: vk::Image) -> Result<()> { - if src_image == self.input_image { + let slot = &mut self.slots[self.current_slot]; + if src_image == slot.input_image { debug!("Source image is the encoder's input image, skipping upload copy"); return Ok(()); } @@ -126,18 +152,17 @@ impl H264Encoder { upload_command_buffer: self.upload_command_buffer, upload_fence: self.upload_fence, src_image, - dst_image: self.input_image, + dst_image: slot.input_image, width: self.config.dimensions.width, height: self.config.dimensions.height, pixel_format: self.config.pixel_format, - input_image_layout: self.input_image_layout, + input_image_layout: slot.input_image_layout, upload_queue: self.context.transfer_queue(), }; upload_image_to_input(&self.context, ¶ms)?; - // Update tracked layout. - self.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; + slot.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR; Ok(()) } @@ -150,37 +175,61 @@ unsafe impl Send for H264Encoder {} impl Drop for H264Encoder { fn drop(&mut self) { unsafe { - // Wait on the queues used by the encoder rather than stalling - // the entire device. 
- let _ = self - .context - .device() - .queue_wait_idle(self.context.transfer_queue()); + let device = self.context.device(); + let _ = device.queue_wait_idle(self.context.transfer_queue()); if let Some(q) = self.context.video_encode_queue() { - let _ = self.context.device().queue_wait_idle(q); + let _ = device.queue_wait_idle(q); + } + + for slot in &mut self.slots { + if !slot.bitstream_buffer_ptr.is_null() { + device.unmap_memory(slot.bitstream_buffer_memory); + slot.bitstream_buffer_ptr = std::ptr::null_mut(); + } + device.destroy_query_pool(slot.query_pool, None); + device.destroy_fence(slot.encode_fence, None); + device.destroy_buffer(slot.bitstream_buffer, None); + device.free_memory(slot.bitstream_buffer_memory, None); + device.destroy_image_view(slot.input_image_view, None); + device.destroy_image(slot.input_image, None); + device.free_memory(slot.input_image_memory, None); + } + + device.destroy_fence(self.upload_fence, None); + device.destroy_command_pool(self.command_pool, None); + if self.upload_command_pool != self.command_pool { + device.destroy_command_pool(self.upload_command_pool, None); } - destroy_encoder_resources( - self.context.device(), - &self.video_queue_fn, - &EncoderResources { - query_pool: self.query_pool, - upload_fence: self.upload_fence, - encode_fence: self.encode_fence, - command_pool: self.command_pool, - upload_command_pool: self.upload_command_pool, - bitstream_buffer: self.bitstream_buffer, - bitstream_buffer_memory: self.bitstream_buffer_memory, - input_image: self.input_image, - input_image_memory: self.input_image_memory, - input_image_view: self.input_image_view, - dpb_images: &self.dpb_images, - dpb_image_memories: &self.dpb_image_memories, - dpb_image_views: &self.dpb_image_views, - session: self.session, - session_params: self.session_params, - session_memory: &self.session_memory, - }, + + for view in &self.dpb_image_views { + device.destroy_image_view(*view, None); + } + for image in &self.dpb_images { + 
device.destroy_image(*image, None); + } + for memory in &self.dpb_image_memories { + device.free_memory(*memory, None); + } + + if self.session_params != vk::VideoSessionParametersKHR::null() { + (self + .video_queue_fn + .fp() + .destroy_video_session_parameters_khr)( + device.handle(), + self.session_params, + std::ptr::null(), + ); + } + (self.video_queue_fn.fp().destroy_video_session_khr)( + device.handle(), + self.session, + std::ptr::null(), ); + + for memory in &self.session_memory { + device.free_memory(*memory, None); + } } } } diff --git a/src/encoder/h265/api.rs b/src/encoder/h265/api.rs index e52b210..3a2c947 100644 --- a/src/encoder/h265/api.rs +++ b/src/encoder/h265/api.rs @@ -14,36 +14,93 @@ impl H265Encoder { /// This image can be used as a target for `ColorConverter::convert` to avoid /// an intermediate copy. pub fn input_image(&self) -> vk::Image { - self.input_image + self.slots[self.current_slot].input_image } /// Encode a frame from a GPU image. /// - /// This accepts a source NV12 image on the GPU and encodes it directly without. - /// any CPU-side data copies. The source image must be in NV12 format with the - /// same dimensions as the encoder configuration, and should be in GENERAL layout. + /// Pipelined: this call submits frame N to the encode queue without waiting, + /// drains the previous in-flight frame from the slot we are about to overwrite, + /// and returns *that* drained frame's `EncodedPacket`. The first call returns + /// an empty Vec (the pipeline is still filling); subsequent calls return one + /// packet per call. Use `flush()` to drain remaining slots at end of stream. + /// + /// The source image must be in NV12 format with the same dimensions as the + /// encoder configuration, and should be in GENERAL layout. /// /// # Panics /// - /// The encoder will panic at creation time if B-frames are enabled (b_frame_count > 0), - /// as B-frame encoding is not yet supported. 
+ /// The encoder will panic at creation time if B-frames are enabled + /// (b_frame_count > 0), as B-frame encoding is not yet supported. pub fn encode(&mut self, src_image: vk::Image) -> Result> { + // Step 1: Drain the slot we're about to overwrite. Its previous encode + // submission must complete before we can re-record its command buffer + // *and* before the converter can write to its input image. Reading the + // bitstream here means the input_image is fully released by the encode + // hardware once we return. + let prev_packet = self.drain_current_slot()?; + let gop_position = self.gop.get_next_frame(); let display_order = self.input_frame_num; self.input_frame_num += 1; debug!( - "Encoding frame {} from GPU image: type={:?}, poc={}", - display_order, gop_position.frame_type, gop_position.pic_order_cnt + "Encoding frame {} from GPU image: type={:?}, poc={}, slot={}", + display_order, gop_position.frame_type, gop_position.pic_order_cnt, self.current_slot ); - // Upload from GPU image. + // Upload from GPU image (no-op when src_image is already the slot's input). self.upload_from_image(src_image)?; - // Encode immediately. - let packet = self.encode_current_frame(&gop_position, display_order)?; + // Step 3: Submit the new encode (no wait) and stash its metadata in the + // slot so it can be returned when this slot is drained next time around. + self.encode_current_frame(&gop_position, display_order)?; - Ok(vec![packet]) + // Step 4: Advance to the next slot for the upcoming frame. + self.current_slot = (self.current_slot + 1) % self.slots.len(); + + // Step 5: Return the packet drained at step 1. Empty Vec until the + // pipeline has filled (first ENCODE_PIPELINE_DEPTH-1 calls). + Ok(prev_packet.into_iter().collect()) + } + + /// Wait for the current slot's previously submitted encode (if any) to + /// finish, read its bitstream, and combine it with the metadata stashed at + /// submit-time into a complete EncodedPacket. 
Returns None if the slot has + /// no in-flight work (initial pipeline-fill phase or after a flush). + fn drain_current_slot(&mut self) -> Result> { + if !self.slots[self.current_slot].in_flight { + return Ok(None); + } + let bitstream = unsafe { + crate::encoder::resources::wait_and_read_bitstream( + self.context.device(), + self.slots[self.current_slot].encode_fence, + self.slots[self.current_slot].query_pool, + self.slots[self.current_slot].bitstream_buffer_ptr, + )? + }; + self.slots[self.current_slot].in_flight = false; + let meta = self.slots[self.current_slot] + .pending_metadata + .take() + .ok_or_else(|| { + PixelForgeError::CommandBuffer( + "Drained slot has bitstream but no metadata; encoder state corrupted" + .to_string(), + ) + })?; + + let mut data = meta.header.unwrap_or_default(); + data.extend_from_slice(&bitstream); + + Ok(Some(EncodedPacket { + data, + frame_type: meta.frame_type, + is_key_frame: meta.is_key_frame, + pts: meta.pts, + dts: meta.dts, + })) } /// Internal method to encode the current frame already uploaded to input_image. @@ -51,7 +108,7 @@ impl H265Encoder { &mut self, gop_position: &GopPosition, display_order: u64, - ) -> Result { + ) -> Result<()> { let is_idr = gop_position.frame_type.is_idr(); let is_reference = gop_position.is_reference; let is_b_frame = gop_position.frame_type == GopFrameType::B; @@ -101,13 +158,11 @@ impl H265Encoder { let pic_order_cnt = gop_position.pic_order_cnt; - let mut encoded_data = Vec::new(); - - // For IDR frames, prepend VPS/SPS/PPS header. - if is_idr { + // For IDR frames, capture VPS/SPS/PPS header to be prepended to the + // bitstream when this slot's encode is drained later. + let header = if is_idr { if self.header_data.is_none() { let header = self.get_h265_header()?; - // Debug: print first few bytes of header. 
debug!( "H.265 header ({} bytes): {:02X?}", header.len(), @@ -115,22 +170,27 @@ impl H265Encoder { ); self.header_data = Some(header); } - if let Some(ref header) = self.header_data { - encoded_data.extend_from_slice(header); - } - } + self.header_data.clone() + } else { + None + }; - let slice_data = self.encode_frame_internal(gop_position, pic_order_cnt, is_idr)?; - // Debug: print first few bytes of slice data. - debug!( - "H.265 slice ({} bytes): {:02X?}", - slice_data.len(), - &slice_data[..std::cmp::min(16, slice_data.len())] - ); - encoded_data.extend_from_slice(&slice_data); + // Submit the encode (no wait, no readback). Marks the slot in_flight. + self.encode_frame_internal(gop_position, pic_order_cnt, is_idr)?; + let dts = self.encode_frame_num; self.encode_frame_num += 1; + // Stash the metadata so drain_current_slot() can build the + // EncodedPacket once the GPU finishes this submission. + self.slots[self.current_slot].pending_metadata = Some(super::SlotPacketMetadata { + frame_type, + is_key_frame: is_idr, + pts: display_order, + dts, + header, + }); + if is_reference { let dpb_pic_type = if is_idr { PictureType::Idr @@ -180,19 +240,34 @@ impl H265Encoder { } } - Ok(EncodedPacket { - data: encoded_data, - frame_type, - is_key_frame: is_idr, - pts: display_order, - dts: self.encode_frame_num - 1, - }) + Ok(()) } - /// Flush the encoder and get any remaining packets. + /// Flush the encoder and drain any remaining in-flight slots. + /// + /// Returns one EncodedPacket per still-in-flight slot, in submission + /// order (so the resulting Vec preserves the encoded sequence). After + /// flush the encoder has no in-flight work. pub fn flush(&mut self) -> Result> { - // No buffered frames in the current implementation. - Ok(Vec::new()) + let mut out = Vec::new(); + // Drain in submission order: starting from current_slot (the slot we + // would *next* overwrite — the oldest one in flight) and advancing + // through the ring. 
Slots with no in_flight are skipped. + for offset in 0..self.slots.len() { + let idx = (self.current_slot + offset) % self.slots.len(); + if !self.slots[idx].in_flight { + continue; + } + // Drain idx's bitstream the same way drain_current_slot does, but + // from an arbitrary slot index. + let saved_current = self.current_slot; + self.current_slot = idx; + if let Some(packet) = self.drain_current_slot()? { + out.push(packet); + } + self.current_slot = saved_current; + } + Ok(out) } /// Request that the next frame be an IDR frame. @@ -278,17 +353,18 @@ impl H265Encoder { /// updated VUI color primaries, transfer characteristics, and matrix coefficients. /// The next encoded frame will be an IDR with the new VPS/SPS/PPS prepended. pub fn set_color_description(&mut self, desc: ColorDescription) -> Result<()> { - // Wait for any in-flight encode to complete before modifying session params. - // Do NOT reset the fence here — submit_encode_and_read_bitstream() resets it - // before queue_submit. Leaving the fence signaled allows consecutive - // set_color_description() calls without deadlock. + // Wait for all in-flight encodes (across every slot) to complete before + // modifying session params. Do NOT reset fences here — submit_encode_only + // resets them before queue_submit, and leaving them signaled allows + // consecutive set_color_description() calls without deadlock. 
+ let fences: Vec = self.slots.iter().map(|s| s.encode_fence).collect(); unsafe { self.context .device() - .wait_for_fences(&[self.encode_fence], true, u64::MAX) + .wait_for_fences(&fences, true, u64::MAX) .map_err(|e| { PixelForgeError::Synchronization(format!( - "Failed to wait for encode fence: {:?}", + "Failed to wait for encode fences: {:?}", e )) })?; diff --git a/src/encoder/h265/encode.rs b/src/encoder/h265/encode.rs index b7a8ce0..5f138e5 100644 --- a/src/encoder/h265/encode.rs +++ b/src/encoder/h265/encode.rs @@ -7,32 +7,30 @@ use super::H265Encoder; use crate::encoder::gop::{GopFrameType, GopPosition}; use crate::encoder::resources::{ prepare_encode_command_buffer, record_dpb_barriers, record_post_encode_dpb_barrier, - submit_encode_and_read_bitstream, MIN_BITSTREAM_BUFFER_SIZE, + submit_encode_only, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::error::{PixelForgeError, Result}; use ash::vk; use tracing::debug; impl H265Encoder { - /// Encode a frame that has already been uploaded to the input image. - /// - /// This function: - /// 1. Records the video encode command buffer - /// 2. Sets up reference picture information - /// 3. Executes the encode operation - /// 4. Returns the encoded bitstream data + /// Records and submits the encode commands for a single frame to the + /// current slot. Does NOT wait for completion or read the bitstream — + /// the caller drains the slot's prior in-flight encode before calling + /// this, and the slot is marked in_flight so a later call can drain the + /// submission made here. pub(super) fn encode_frame_internal( &mut self, gop_position: &GopPosition, pic_order_cnt: i32, is_idr: bool, - ) -> Result> { + ) -> Result<()> { // Prepare command buffer for recording. 
unsafe { prepare_encode_command_buffer( self.context.device(), - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, )?; } @@ -41,7 +39,7 @@ impl H265Encoder { unsafe { record_dpb_barriers( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, self.use_layered_dpb, self.current_dpb_slot, @@ -326,7 +324,7 @@ impl H265Encoder { height: self.aligned_height, }) .base_array_layer(0) - .image_view_binding(self.input_image_view); + .image_view_binding(self.slots[self.current_slot].input_image_view); // Set up setup picture resource (reconstructed picture) let setup_picture_resource = vk::VideoPictureResourceInfoKHR::default() @@ -570,7 +568,7 @@ impl H265Encoder { unsafe { (self.video_queue_fn.fp().cmd_begin_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &begin_coding_info, ); } @@ -594,7 +592,7 @@ impl H265Encoder { unsafe { (self.video_queue_fn.fp().cmd_control_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &control_info, ); } @@ -606,7 +604,7 @@ impl H265Encoder { .src_picture_resource(src_picture_resource) .setup_reference_slot(&setup_slot_info) .reference_slots(&reference_slots) - .dst_buffer(self.bitstream_buffer) + .dst_buffer(self.slots[self.current_slot].bitstream_buffer) .dst_buffer_offset(0) .dst_buffer_range(MIN_BITSTREAM_BUFFER_SIZE as u64); encode_info.p_next = @@ -614,27 +612,29 @@ impl H265Encoder { unsafe { self.context.device().cmd_begin_query( - self.encode_command_buffer, - self.query_pool, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, 0, vk::QueryControlFlags::empty(), ); (self.video_encode_fn.fp().cmd_encode_video_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &encode_info, ); - 
self.context - .device() - .cmd_end_query(self.encode_command_buffer, self.query_pool, 0); + self.context.device().cmd_end_query( + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].query_pool, + 0, + ); } // Add DPB synchronization barrier after encoding. unsafe { record_post_encode_dpb_barrier( self.context.device(), - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &self.dpb_images, self.use_layered_dpb, self.current_dpb_slot, @@ -645,7 +645,7 @@ impl H265Encoder { let end_coding_info = vk::VideoEndCodingInfoKHR::default(); unsafe { (self.video_queue_fn.fp().cmd_end_video_coding_khr)( - self.encode_command_buffer, + self.slots[self.current_slot].encode_command_buffer, &end_coding_info, ); } @@ -654,7 +654,7 @@ impl H265Encoder { unsafe { self.context .device() - .end_command_buffer(self.encode_command_buffer) + .end_command_buffer(self.slots[self.current_slot].encode_command_buffer) } .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; @@ -673,22 +673,25 @@ impl H265Encoder { let gpu_start = std::time::Instant::now(); - let encoded_data = unsafe { - submit_encode_and_read_bitstream( + unsafe { + submit_encode_only( self.context.device(), - self.encode_command_buffer, - self.encode_fence, + self.slots[self.current_slot].encode_command_buffer, + self.slots[self.current_slot].encode_fence, encode_queue, - self.query_pool, - self.bitstream_buffer_ptr, - )? - }; + None, + )?; + } - debug!("GPU encode took {:?}", gpu_start.elapsed()); + debug!("Submitted encode (no wait): {:?}", gpu_start.elapsed()); // Mark DPB slot as active. self.dpb_slot_active[self.current_dpb_slot as usize] = true; - Ok(encoded_data) + // Mark the slot as in flight; the bitstream is drained at the start + // of the next encode() call that targets this slot. 
+ self.slots[self.current_slot].in_flight = true; + + Ok(()) } } diff --git a/src/encoder/h265/init.rs b/src/encoder/h265/init.rs index 12e3041..07f53b3 100644 --- a/src/encoder/h265/init.rs +++ b/src/encoder/h265/init.rs @@ -3,9 +3,10 @@ use super::H265Encoder; use crate::encoder::dpb::{DecodedPictureBuffer, DecodedPictureBufferTrait, DpbConfig}; use crate::encoder::gop::GopStructure; use crate::encoder::resources::{ - align_up, allocate_session_memory, clear_input_image, create_bitstream_buffer, - create_command_resources, create_dpb_images, create_image, get_video_format, lcm, - make_codec_name, map_bitstream_buffer, query_supported_video_formats, ClearImageParams, + align_up, allocate_session_memory, clear_input_image, clear_rgb_input_image, + create_bitstream_buffer, create_command_resources, create_dpb_images, create_image, + get_video_format, lcm, make_codec_name, map_bitstream_buffer, query_supported_video_formats, + rgb_conversion_model, rgb_conversion_range, rgb_input_format, ClearImageParams, MIN_BITSTREAM_BUFFER_SIZE, }; use crate::encoder::{BitDepth, ColorDescription, PixelFormat}; @@ -41,6 +42,15 @@ impl H265Encoder { let video_encode_fn = ash::khr::video_encode_queue::Device::load(context.instance(), context.device()); + if config.use_rgb_input && !context.supports_rgb_direct_encode() { + return Err(PixelForgeError::NoSuitableDevice( + "EncodeConfig::use_rgb_input requires VK_VALVE_video_encode_rgb_conversion, \ + which this device does not support." + .to_string(), + )); + } + let use_rgb_input = config.use_rgb_input; + // Get chroma subsampling from pixel format via `From` impl let chroma_subsampling: vk::VideoChromaSubsamplingFlagsKHR = config.pixel_format.into(); @@ -74,9 +84,19 @@ impl H265Encoder { } }; - // Create H.265 encode profile + // Create H.265 encode profile. 
When RGB-direct is enabled we chain + // VkVideoEncodeProfileRgbConversionInfoVALVE on every profile we + // build (capability query, image creation, query pool) — profiles + // must match across all of those. + let mut rgb_conv_profile_info = vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h265_profile_info = vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h265_profile_info.p_next = (&mut rgb_conv_profile_info + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_info = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) @@ -189,7 +209,17 @@ impl H265Encoder { } info!("Supported DPB formats: {:?}", supported_dpb_formats); - let picture_format = if supported_src_formats.contains(&video_format) { + let picture_format = if use_rgb_input { + let rgb_fmt = rgb_input_format(config.bit_depth); + if !supported_src_formats.contains(&rgb_fmt) { + return Err(PixelForgeError::NoSuitableDevice(format!( + "RGB-direct encode requested but driver does not advertise {:?} as a \ + VIDEO_ENCODE_SRC_KHR format for this profile. 
Supported: {:?}", + rgb_fmt, supported_src_formats + ))); + } + rgb_fmt + } else if supported_src_formats.contains(&video_format) { video_format } else { return Err(PixelForgeError::NoSuitableDevice(format!( @@ -198,11 +228,19 @@ impl H265Encoder { ))); }; - let reference_picture_format = supported_dpb_formats - .iter() - .copied() - .find(|f| *f == picture_format) - .unwrap_or(supported_dpb_formats[0]); + let reference_picture_format = if use_rgb_input { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == video_format) + .unwrap_or(supported_dpb_formats[0]) + } else { + supported_dpb_formats + .iter() + .copied() + .find(|f| *f == picture_format) + .unwrap_or(supported_dpb_formats[0]) + }; debug!( "Selected Vulkan Video formats: picture_format={:?}, reference_picture_format={:?}", @@ -264,7 +302,18 @@ impl H265Encoder { PixelForgeError::NoSuitableDevice("No video encode queue family available".to_string()) })?; - let session_create_info = vk::VideoSessionCreateInfoKHR::default() + let color_desc = config + .color_description + .unwrap_or(ColorDescription::bt709()); + + let mut session_rgb_conv_info = + vk::VideoEncodeSessionRgbConversionCreateInfoVALVE::default() + .rgb_model(rgb_conversion_model(&color_desc)) + .rgb_range(rgb_conversion_range(&color_desc)) + .x_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::COSITED_EVEN) + .y_chroma_offset(vk::VideoEncodeRgbChromaOffsetFlagsVALVE::MIDPOINT); + + let mut session_create_info = vk::VideoSessionCreateInfoKHR::default() .queue_family_index(encode_queue_family) .flags(vk::VideoSessionCreateFlagsKHR::empty()) .video_profile(&profile_info) @@ -277,6 +326,11 @@ impl H265Encoder { .max_dpb_slots(dpb_slot_count as u32) .max_active_reference_pictures(max_active_reference_pictures as u32) .std_header_version(&std_header_version); + if use_rgb_input { + session_create_info.p_next = (&mut session_rgb_conv_info + as *mut vk::VideoEncodeSessionRgbConversionCreateInfoVALVE) + .cast(); + } let mut session = 
vk::VideoSessionKHR::null(); let result = unsafe { @@ -297,14 +351,17 @@ impl H265Encoder { // Query and allocate session memory. let session_memory = allocate_session_memory(&context, session, &video_queue_fn)?; - // Build VPS/SPS/PPS and session parameters via shared helper. - let color_desc = config - .color_description - .unwrap_or(ColorDescription::bt709()); - - // Create profile info for images/buffers + // Create profile info for images/buffers (shared across slots). + let mut rgb_conv_profile_for_resources = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); let mut h265_profile_for_resources = vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h265_profile_for_resources.p_next = (&mut rgb_conv_profile_for_resources + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } let mut profile_for_resources = vk::VideoProfileInfoKHR::default() .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) .chroma_subsampling(chroma_subsampling) @@ -313,16 +370,6 @@ impl H265Encoder { profile_for_resources.p_next = (&mut h265_profile_for_resources as *mut vk::VideoEncodeH265ProfileInfoKHR).cast(); - // Create input image - let (input_image, input_image_memory, input_image_view) = create_image( - &context, - aligned_width, - aligned_height, - picture_format, - false, - &profile_for_resources, - )?; - // Determine DPB mode: use layered DPB when the driver does not advertise // support for separate reference images (required for AMD RADV). let supports_separate_dpb = capabilities @@ -333,7 +380,8 @@ impl H265Encoder { info!("Using layered DPB (driver does not support separate reference images)"); } - // Create DPB images. + // Create DPB images (shared across all slots — references for the + // entire encode session, not per-frame). 
let (dpb_images, dpb_image_memories, dpb_image_views) = create_dpb_images( &context, aligned_width, @@ -344,77 +392,153 @@ impl H265Encoder { use_layered_dpb, )?; - // Create bitstream buffer. - let (bitstream_buffer, bitstream_buffer_memory) = - create_bitstream_buffer(&context, MIN_BITSTREAM_BUFFER_SIZE, &profile_for_resources)?; - - // Persistently map the bitstream buffer to avoid per-frame map/unmap overhead. - let bitstream_buffer_ptr = - map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; - - // Create command pool, buffers, and fences. - // Use the transfer queue family for upload commands when the encode queue - // doesn't support transfer operations (AMD RADV). + // Create command pool and shared upload resources. The encode command + // buffers (one per slot) are allocated below from `command_pool`. let upload_queue_family = context.transfer_queue_family(); let cmd_resources = create_command_resources(&context, encode_queue_family, upload_queue_family)?; let command_pool = cmd_resources.command_pool; let upload_command_pool = cmd_resources.upload_command_pool; let upload_command_buffer = cmd_resources.upload_command_buffer; - let encode_command_buffer = cmd_resources.encode_command_buffer; let upload_fence = cmd_resources.upload_fence; - let encode_fence = cmd_resources.encode_fence; - - // Clear the input image so padding between user dimensions and the - // aligned coded extent is zero-initialized. 
- clear_input_image( - &context, - &ClearImageParams { - command_buffer: upload_command_buffer, - fence: upload_fence, - queue: context.transfer_queue(), - image: input_image, - width: aligned_width, - height: aligned_height, - pixel_format: config.pixel_format, - bit_depth: config.bit_depth, - }, - )?; - - // Create query pool - let mut h265_profile_info_query = - vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); - - let mut profile_info_query = vk::VideoProfileInfoKHR::default() - .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) - .chroma_subsampling(chroma_subsampling) - .luma_bit_depth(bit_depth_flags) - .chroma_bit_depth(bit_depth_flags); - profile_info_query.p_next = - (&mut h265_profile_info_query as *mut vk::VideoEncodeH265ProfileInfoKHR).cast(); - - let mut encode_feedback_create = vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default() - .encode_feedback_flags( - vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET - | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, - ); - - encode_feedback_create.p_next = - (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + // The first slot reuses cmd_resources.encode_command_buffer and encode_fence; + // additional slots get fresh buffers/fences below. (Re-using avoids + // changing create_command_resources, which is also used by H264/AV1.) + + // Allocate ENCODE_PIPELINE_DEPTH-1 additional encode command buffers + // from the same pool. + let extra_buffers_needed = super::ENCODE_PIPELINE_DEPTH.saturating_sub(1) as u32; + let extra_encode_buffers: Vec = if extra_buffers_needed > 0 { + let alloc_info = vk::CommandBufferAllocateInfo::default() + .command_pool(command_pool) + .level(vk::CommandBufferLevel::PRIMARY) + .command_buffer_count(extra_buffers_needed); + unsafe { context.device().allocate_command_buffers(&alloc_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? 
+ } else { + Vec::new() + }; - let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() - .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) - .query_count(1); - query_pool_create_info.p_next = (&mut encode_feedback_create - as *mut vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) - .cast(); + // Build per-slot resources. + let mut slots: Vec = Vec::with_capacity(super::ENCODE_PIPELINE_DEPTH); + for slot_idx in 0..super::ENCODE_PIPELINE_DEPTH { + // Input image for this slot. + let (input_image, input_image_memory, input_image_view) = create_image( + &context, + aligned_width, + aligned_height, + picture_format, + false, + &profile_for_resources, + )?; + + // Bitstream buffer for this slot. + let (bitstream_buffer, bitstream_buffer_memory) = create_bitstream_buffer( + &context, + MIN_BITSTREAM_BUFFER_SIZE, + &profile_for_resources, + )?; + let bitstream_buffer_ptr = + map_bitstream_buffer(&context, bitstream_buffer_memory, MIN_BITSTREAM_BUFFER_SIZE)?; + + // Clear the input image so padding between user dimensions and the + // aligned coded extent is zero-initialized. RGB-direct uses the + // single-plane COLOR-aspect path; YUV uses a multi-plane buffer copy. + if use_rgb_input { + clear_rgb_input_image( + &context, + upload_command_buffer, + upload_fence, + context.transfer_queue(), + input_image, + )?; + } else { + clear_input_image( + &context, + &ClearImageParams { + command_buffer: upload_command_buffer, + fence: upload_fence, + queue: context.transfer_queue(), + image: input_image, + width: aligned_width, + height: aligned_height, + pixel_format: config.pixel_format, + bit_depth: config.bit_depth, + }, + )?; + } - let query_pool = unsafe { - context - .device() - .create_query_pool(&query_pool_create_info, None) + // Encode command buffer: slot 0 reuses the one create_command_resources + // already allocated; slots 1..N pull from the extras vec. 
+ let encode_command_buffer = if slot_idx == 0 { + cmd_resources.encode_command_buffer + } else { + extra_encode_buffers[slot_idx - 1] + }; + + // Encode fence: slot 0 reuses the one create_command_resources already + // created (signaled); additional slots get fresh signaled fences. + let encode_fence = if slot_idx == 0 { + cmd_resources.encode_fence + } else { + let signaled = vk::FenceCreateInfo::default().flags(vk::FenceCreateFlags::SIGNALED); + unsafe { context.device().create_fence(&signaled, None) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))? + }; + + // Per-slot single-query pool (one feedback query per encode submit). + let mut rgb_conv_profile_query = + vk::VideoEncodeProfileRgbConversionInfoVALVE::default() + .perform_encode_rgb_conversion(true); + let mut h265_profile_info_query = + vk::VideoEncodeH265ProfileInfoKHR::default().std_profile_idc(profile_idc); + if use_rgb_input { + h265_profile_info_query.p_next = (&mut rgb_conv_profile_query + as *mut vk::VideoEncodeProfileRgbConversionInfoVALVE) + .cast(); + } + let mut profile_info_query = vk::VideoProfileInfoKHR::default() + .video_codec_operation(vk::VideoCodecOperationFlagsKHR::ENCODE_H265) + .chroma_subsampling(chroma_subsampling) + .luma_bit_depth(bit_depth_flags) + .chroma_bit_depth(bit_depth_flags); + profile_info_query.p_next = + (&mut h265_profile_info_query as *mut vk::VideoEncodeH265ProfileInfoKHR).cast(); + let mut encode_feedback_create = + vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR::default().encode_feedback_flags( + vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BUFFER_OFFSET + | vk::VideoEncodeFeedbackFlagsKHR::BITSTREAM_BYTES_WRITTEN, + ); + encode_feedback_create.p_next = + (&mut profile_info_query as *mut vk::VideoProfileInfoKHR).cast(); + let mut query_pool_create_info = vk::QueryPoolCreateInfo::default() + .query_type(vk::QueryType::VIDEO_ENCODE_FEEDBACK_KHR) + .query_count(1); + query_pool_create_info.p_next = (&mut encode_feedback_create + as *mut 
vk::QueryPoolVideoEncodeFeedbackCreateInfoKHR) + .cast(); + let query_pool = unsafe { + context + .device() + .create_query_pool(&query_pool_create_info, None) + } + .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; + + slots.push(super::EncodeSlot { + input_image, + input_image_memory, + input_image_view, + input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + bitstream_buffer, + bitstream_buffer_memory, + bitstream_buffer_ptr, + encode_command_buffer, + encode_fence, + query_pool, + in_flight: false, + pending_metadata: None, + }); } - .map_err(|e| PixelForgeError::QueryPool(e.to_string()))?; // Create DPB and GOP structure let mut dpb = DecodedPictureBuffer::new(); @@ -459,25 +583,17 @@ impl H265Encoder { session_memory, input_frame_num: 0, encode_frame_num: 0, - input_image, - input_image_memory, - input_image_view, - input_image_layout: vk::ImageLayout::VIDEO_ENCODE_SRC_KHR, + slots, + current_slot: 0, dpb_images, dpb_image_memories, dpb_image_views, dpb_slot_count, use_layered_dpb, - bitstream_buffer, - bitstream_buffer_memory, - bitstream_buffer_ptr, command_pool, upload_command_pool, upload_command_buffer, upload_fence, - encode_command_buffer, - encode_fence, - query_pool, header_data: None, has_backward_reference: false, backward_reference_poc: 0, diff --git a/src/encoder/h265/mod.rs b/src/encoder/h265/mod.rs index 522f262..36a8e98 100644 --- a/src/encoder/h265/mod.rs +++ b/src/encoder/h265/mod.rs @@ -12,9 +12,7 @@ use tracing::debug; use crate::encoder::dpb::DecodedPictureBuffer; use crate::encoder::gop::GopStructure; -use crate::encoder::resources::{ - destroy_encoder_resources, upload_image_to_input, EncoderResources, UploadParams, -}; +use crate::encoder::resources::{upload_image_to_input, UploadParams}; use crate::encoder::EncodeConfig; use crate::error::Result; use crate::vulkan::VideoContext; @@ -28,6 +26,59 @@ pub(crate) struct ReferenceInfo { pub poc: i32, } +/// Number of in-flight encode slots. 
Depth=2 lets frame N+1 begin encoding +/// while frame N is still on the encode hardware, so the per-frame budget +/// becomes 2 × frame_interval (16.6ms at 120fps) instead of 1 ×. +pub(crate) const ENCODE_PIPELINE_DEPTH: usize = 2; + +/// One slot's worth of per-frame encode resources. All fields here are +/// duplicated `ENCODE_PIPELINE_DEPTH` times so multiple frames can be +/// in-flight concurrently. See the comment on `slots` below for the rotation +/// invariants. +pub(crate) struct EncodeSlot { + /// Image the converter writes into (and the encoder reads from) for + /// this slot's frame. + pub input_image: vk::Image, + pub input_image_memory: vk::DeviceMemory, + pub input_image_view: vk::ImageView, + /// Tracked layout of `input_image` for safe transitions between frames. + pub input_image_layout: vk::ImageLayout, + + /// Bitstream destination buffer for this slot's encode. + pub bitstream_buffer: vk::Buffer, + pub bitstream_buffer_memory: vk::DeviceMemory, + /// Persistently-mapped pointer (avoids per-frame map/unmap). + pub bitstream_buffer_ptr: *mut u8, + + /// Command buffer recorded fresh each time this slot is used. + pub encode_command_buffer: vk::CommandBuffer, + /// Signaled when the encode for this slot finishes on the GPU. + pub encode_fence: vk::Fence, + /// Single-query pool — one feedback query per encode submission. + pub query_pool: vk::QueryPool, + + /// `true` after we've submitted to this slot but not yet drained it. + /// Used to decide whether `input_image()` must wait before returning. + pub in_flight: bool, + + /// Metadata captured at submission time. The drained bitstream is wrapped + /// in an `EncodedPacket` using this metadata after the next encode() call + /// targeting the same slot waits on its fence. + pub pending_metadata: Option, +} + +/// Frame metadata stashed alongside an in-flight encode submission. 
When the
+/// submission is drained on a later `encode()` call, we reconstruct the
+/// `EncodedPacket` using these fields plus the freshly-read bitstream.
+pub(crate) struct SlotPacketMetadata {
+ pub frame_type: crate::encoder::FrameType,
+ pub is_key_frame: bool,
+ pub pts: u64,
+ pub dts: u64,
+ /// VPS/SPS/PPS header bytes (Some only for IDR frames).
+ pub header: Option<Vec<u8>>,
+}
+
 /// H.265 encoder.
 pub struct H265Encoder {
 context: VideoContext,
@@ -51,12 +102,15 @@ pub struct H265Encoder {
 input_frame_num: u64,
 encode_frame_num: u64,
- // Resources
- input_image: vk::Image,
- input_image_memory: vk::DeviceMemory,
- input_image_view: vk::ImageView,
- /// Current Vulkan image layout of `input_image` (tracked to avoid UB when transitioning).
- input_image_layout: vk::ImageLayout,
+ /// Per-frame slots. Index `current_slot` is the slot we'll use for the
+ /// *next* encode submission (and whose `input_image` `input_image()`
+ /// returns). When `encode()` runs, it drains that slot's previous
+ /// in-flight work (if any), records new commands into it, submits, then
+ /// advances `current_slot` for the next frame. With depth=2 the encoder
+ /// can keep two frames in flight at once.
+ pub(crate) slots: Vec<EncodeSlot>,
+ pub(crate) current_slot: usize,
+
 /// DPB images.
 dpb_images: Vec<vk::Image>,
 dpb_image_memories: Vec<vk::DeviceMemory>,
 dpb_image_views: Vec<vk::ImageView>,
@@ -65,19 +119,12 @@
 dpb_slot_count: usize,
 /// Whether the DPB uses a single layered image (true) or separate images (false).
 use_layered_dpb: bool,
- bitstream_buffer: vk::Buffer,
- bitstream_buffer_memory: vk::DeviceMemory,
- /// Persistently mapped pointer to the bitstream buffer (avoids per-frame map/unmap).
- bitstream_buffer_ptr: *mut u8,
- // Command resources.
+ // Command pool (encode + upload command buffers allocated from these).
command_pool: vk::CommandPool,
 upload_command_pool: vk::CommandPool,
 upload_command_buffer: vk::CommandBuffer,
 upload_fence: vk::Fence,
- encode_command_buffer: vk::CommandBuffer,
- encode_fence: vk::Fence,
- query_pool: vk::QueryPool,
 // Parameter sets - cached header data (VPS/SPS/PPS)
 header_data: Option<Vec<u8>>,
@@ -111,7 +158,8 @@ impl H265Encoder {
 /// with the same dimensions as the encoder configuration. The source image
 /// should be in GENERAL layout.
 fn upload_from_image(&mut self, src_image: vk::Image) -> Result<()> {
- if src_image == self.input_image {
+ let slot = &mut self.slots[self.current_slot];
+ if src_image == slot.input_image {
 debug!("Source image is the encoder's input image, skipping upload copy");
 return Ok(());
 }
@@ -120,18 +168,18 @@
 upload_command_buffer: self.upload_command_buffer,
 upload_fence: self.upload_fence,
 src_image,
- dst_image: self.input_image,
+ dst_image: slot.input_image,
 width: self.config.dimensions.width,
 height: self.config.dimensions.height,
 pixel_format: self.config.pixel_format,
- input_image_layout: self.input_image_layout,
+ input_image_layout: slot.input_image_layout,
 upload_queue: self.context.transfer_queue(),
 };
 upload_image_to_input(&self.context, &params)?;
 // Update tracked layout.
- self.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR;
+ slot.input_image_layout = vk::ImageLayout::VIDEO_ENCODE_SRC_KHR;
 Ok(())
 }
@@ -144,37 +192,68 @@
unsafe impl Send for H265Encoder {}
impl Drop for H265Encoder {
 fn drop(&mut self) {
 unsafe {
+ let device = self.context.device();
 // Wait on the queues used by the encoder rather than stalling
 // the entire device.
- let _ = self - .context - .device() - .queue_wait_idle(self.context.transfer_queue()); + let _ = device.queue_wait_idle(self.context.transfer_queue()); if let Some(q) = self.context.video_encode_queue() { - let _ = self.context.device().queue_wait_idle(q); + let _ = device.queue_wait_idle(q); + } + + // Destroy per-slot resources first (each slot has its own image, + // bitstream buffer, fence, query pool, and command buffer that + // was allocated from `command_pool`). The command buffers are + // freed implicitly when the pool is destroyed below. + for slot in &mut self.slots { + if !slot.bitstream_buffer_ptr.is_null() { + device.unmap_memory(slot.bitstream_buffer_memory); + slot.bitstream_buffer_ptr = std::ptr::null_mut(); + } + device.destroy_query_pool(slot.query_pool, None); + device.destroy_fence(slot.encode_fence, None); + device.destroy_buffer(slot.bitstream_buffer, None); + device.free_memory(slot.bitstream_buffer_memory, None); + device.destroy_image_view(slot.input_image_view, None); + device.destroy_image(slot.input_image, None); + device.free_memory(slot.input_image_memory, None); + } + + // Shared resources. 
+ device.destroy_fence(self.upload_fence, None); + device.destroy_command_pool(self.command_pool, None); + if self.upload_command_pool != self.command_pool { + device.destroy_command_pool(self.upload_command_pool, None); + } + + for view in &self.dpb_image_views { + device.destroy_image_view(*view, None); + } + for image in &self.dpb_images { + device.destroy_image(*image, None); } - destroy_encoder_resources( - self.context.device(), - &self.video_queue_fn, - &EncoderResources { - query_pool: self.query_pool, - upload_fence: self.upload_fence, - encode_fence: self.encode_fence, - command_pool: self.command_pool, - upload_command_pool: self.upload_command_pool, - bitstream_buffer: self.bitstream_buffer, - bitstream_buffer_memory: self.bitstream_buffer_memory, - input_image: self.input_image, - input_image_memory: self.input_image_memory, - input_image_view: self.input_image_view, - dpb_images: &self.dpb_images, - dpb_image_memories: &self.dpb_image_memories, - dpb_image_views: &self.dpb_image_views, - session: self.session, - session_params: self.session_params, - session_memory: &self.session_memory, - }, + for memory in &self.dpb_image_memories { + device.free_memory(*memory, None); + } + + if self.session_params != vk::VideoSessionParametersKHR::null() { + (self + .video_queue_fn + .fp() + .destroy_video_session_parameters_khr)( + device.handle(), + self.session_params, + std::ptr::null(), + ); + } + (self.video_queue_fn.fp().destroy_video_session_khr)( + device.handle(), + self.session, + std::ptr::null(), ); + + for memory in &self.session_memory { + device.free_memory(*memory, None); + } } } } diff --git a/src/encoder/mod.rs b/src/encoder/mod.rs index 983ba12..707e39e 100644 --- a/src/encoder/mod.rs +++ b/src/encoder/mod.rs @@ -224,6 +224,16 @@ pub struct EncodeConfig { /// Color description for VUI signaling. /// Defaults to BT.709 (full-range) when `None`. 
pub color_description: Option<ColorDescription>,
+ /// When true, configure the encoder to take **RGB** input images directly
+ /// and have the hardware perform RGB→YUV conversion inline during encode.
+ /// Requires the device to advertise `VK_VALVE_video_encode_rgb_conversion`
+ /// (currently only AMD's RADV driver). Trying to enable this on a device
+ /// that doesn't support it returns an error from `Encoder::new`.
+ ///
+ /// When false (the default), the encoder takes its native YUV input
+ /// format (NV12 / P010 / 4:4:4 variants) and the caller is responsible
+ /// for converting RGB sources beforehand.
+ pub use_rgb_input: bool,
}

impl EncodeConfig {
@@ -249,6 +259,7 @@
 virtual_buffer_size_ms: 1000,
 initial_virtual_buffer_size_ms: 1000,
 color_description: None,
+ use_rgb_input: false,
 }
 }
@@ -274,6 +285,7 @@
 virtual_buffer_size_ms: 1000,
 initial_virtual_buffer_size_ms: 1000,
 color_description: None,
+ use_rgb_input: false,
 }
 }
@@ -299,6 +311,7 @@
 virtual_buffer_size_ms: 1000,
 initial_virtual_buffer_size_ms: 1000,
 color_description: None,
+ use_rgb_input: false,
 }
 }
@@ -383,6 +396,13 @@
 self.color_description = Some(desc);
 self
 }
+
+ /// Enable hardware-direct RGB input (`VK_VALVE_video_encode_rgb_conversion`).
+ /// See [`EncodeConfig::use_rgb_input`].
+ pub fn with_rgb_input(mut self, enable: bool) -> Self {
+ self.use_rgb_input = enable;
+ self
+ }
}

/// Encoded video packet.
@@ -466,7 +486,7 @@ impl Encoder {
 /// # let yuv_data = vec![0u8; 1920 * 1080 * 3 / 2];
 /// input.upload_yuv420(&yuv_data)?;
 ///
- /// // Encode the image
+ /// // Encode the image (no GPU wait semaphore needed when uploaded synchronously).
/// let packets = encoder.encode(input.image())?; /// # Ok(()) /// # } diff --git a/src/encoder/resources.rs b/src/encoder/resources.rs index d63d157..c30ac9d 100644 --- a/src/encoder/resources.rs +++ b/src/encoder/resources.rs @@ -716,6 +716,149 @@ pub(crate) struct ClearImageParams { pub bit_depth: BitDepth, } +/// Pick the input image format for the hardware-direct RGB encode path +/// (`VK_VALVE_video_encode_rgb_conversion`). 8-bit picks B8G8R8A8_UNORM, +/// 10-bit picks A2B10G10R10_UNORM_PACK32 — these match the formats RADV's +/// VCN5 driver accepts for the RGB-conversion path, and are the +/// `B`/`ABGR` variants that gamescope's override-surface DMA-BUFs +/// typically arrive in (avoiding a channel swap). +pub(crate) fn rgb_input_format(bit_depth: BitDepth) -> vk::Format { + match bit_depth { + BitDepth::Eight => vk::Format::B8G8R8A8_UNORM, + BitDepth::Ten => vk::Format::A2B10G10R10_UNORM_PACK32, + } +} + +/// Pick the RGB→YUV model the hardware should apply, based on the +/// configured colour description (BT.709 vs BT.2020). +pub(crate) fn rgb_conversion_model( + desc: &crate::encoder::ColorDescription, +) -> vk::VideoEncodeRgbModelConversionFlagsVALVE { + if desc.matrix_coefficients == 9 { + vk::VideoEncodeRgbModelConversionFlagsVALVE::YCBCR_2020 + } else { + vk::VideoEncodeRgbModelConversionFlagsVALVE::YCBCR_709 + } +} + +/// Pick the RGB range compression flag (full vs limited) from the colour +/// description. +pub(crate) fn rgb_conversion_range( + desc: &crate::encoder::ColorDescription, +) -> vk::VideoEncodeRgbRangeCompressionFlagsVALVE { + if desc.full_range { + vk::VideoEncodeRgbRangeCompressionFlagsVALVE::FULL_RANGE + } else { + vk::VideoEncodeRgbRangeCompressionFlagsVALVE::NARROW_RANGE + } +} + +/// Clear an RGB-formatted encode input image (used by the +/// `VK_VALVE_video_encode_rgb_conversion` path) to opaque black, then +/// transition it to `VIDEO_ENCODE_SRC_KHR`. 
+/// +/// RGB encode inputs are single-plane `COLOR` aspect images, so the +/// multi-plane buffer-copy path used for YUV doesn't apply. We just +/// `vkCmdClearColorImage` to zero, then barrier into encode layout. +pub(crate) fn clear_rgb_input_image( + context: &VideoContext, + command_buffer: vk::CommandBuffer, + fence: vk::Fence, + queue: vk::Queue, + image: vk::Image, +) -> Result<()> { + let device = context.device(); + + unsafe { device.reset_command_buffer(command_buffer, vk::CommandBufferResetFlags::empty()) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; + + let begin_info = + vk::CommandBufferBeginInfo::default().flags(vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT); + unsafe { device.begin_command_buffer(command_buffer, &begin_info) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; + + let subresource = vk::ImageSubresourceRange { + aspect_mask: vk::ImageAspectFlags::COLOR, + base_mip_level: 0, + level_count: 1, + base_array_layer: 0, + layer_count: 1, + }; + + let to_transfer = vk::ImageMemoryBarrier::default() + .old_layout(vk::ImageLayout::UNDEFINED) + .new_layout(vk::ImageLayout::TRANSFER_DST_OPTIMAL) + .src_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .dst_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .image(image) + .subresource_range(subresource) + .src_access_mask(vk::AccessFlags::empty()) + .dst_access_mask(vk::AccessFlags::TRANSFER_WRITE); + + unsafe { + device.cmd_pipeline_barrier( + command_buffer, + vk::PipelineStageFlags::TOP_OF_PIPE, + vk::PipelineStageFlags::TRANSFER, + vk::DependencyFlags::empty(), + &[], + &[], + &[to_transfer], + ); + } + + let clear_color = vk::ClearColorValue { + float32: [0.0, 0.0, 0.0, 1.0], + }; + unsafe { + device.cmd_clear_color_image( + command_buffer, + image, + vk::ImageLayout::TRANSFER_DST_OPTIMAL, + &clear_color, + &[subresource], + ); + } + + let to_encode = vk::ImageMemoryBarrier::default() + .old_layout(vk::ImageLayout::TRANSFER_DST_OPTIMAL) + 
.new_layout(vk::ImageLayout::VIDEO_ENCODE_SRC_KHR) + .src_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .dst_queue_family_index(vk::QUEUE_FAMILY_IGNORED) + .image(image) + .subresource_range(subresource) + .src_access_mask(vk::AccessFlags::TRANSFER_WRITE) + .dst_access_mask(vk::AccessFlags::empty()); + + unsafe { + device.cmd_pipeline_barrier( + command_buffer, + vk::PipelineStageFlags::TRANSFER, + vk::PipelineStageFlags::BOTTOM_OF_PIPE, + vk::DependencyFlags::empty(), + &[], + &[], + &[to_encode], + ); + } + + unsafe { device.end_command_buffer(command_buffer) } + .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?; + + let submit_info = + vk::SubmitInfo::default().command_buffers(std::slice::from_ref(&command_buffer)); + unsafe { device.reset_fences(&[fence]) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("reset fence: {}", e)))?; + unsafe { device.queue_submit(queue, &[submit_info], fence) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("submit rgb clear: {}", e)))?; + unsafe { device.wait_for_fences(&[fence], true, u64::MAX) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("wait rgb clear: {}", e)))?; + unsafe { device.reset_fences(&[fence]) } + .map_err(|e| PixelForgeError::CommandBuffer(format!("reset fence after clear: {}", e)))?; + + Ok(()) +} + /// Clear the input image by filling it with zeros via a staging buffer. /// /// This must be called once after creating the input image to ensure @@ -1159,77 +1302,6 @@ pub(crate) fn upload_image_to_input( Ok(()) } -/// Parameters for cleaning up shared encoder resources. 
-pub(crate) struct EncoderResources<'a> { - pub query_pool: vk::QueryPool, - pub upload_fence: vk::Fence, - pub encode_fence: vk::Fence, - pub command_pool: vk::CommandPool, - pub upload_command_pool: vk::CommandPool, - pub bitstream_buffer: vk::Buffer, - pub bitstream_buffer_memory: vk::DeviceMemory, - pub input_image: vk::Image, - pub input_image_memory: vk::DeviceMemory, - pub input_image_view: vk::ImageView, - pub dpb_images: &'a [vk::Image], - pub dpb_image_memories: &'a [vk::DeviceMemory], - pub dpb_image_views: &'a [vk::ImageView], - pub session: vk::VideoSessionKHR, - pub session_params: vk::VideoSessionParametersKHR, - pub session_memory: &'a [vk::DeviceMemory], -} - -/// Destroy all shared encoder resources. -/// -/// # Safety -/// -/// All queues that may reference these resources (transfer and video encode) -/// must be idle before calling this function. -pub(crate) unsafe fn destroy_encoder_resources( - device: &ash::Device, - video_queue_fn: &ash::khr::video_queue::Device, - res: &EncoderResources, -) { - device.destroy_query_pool(res.query_pool, None); - device.destroy_fence(res.upload_fence, None); - device.destroy_fence(res.encode_fence, None); - device.destroy_command_pool(res.command_pool, None); - if res.upload_command_pool != res.command_pool { - device.destroy_command_pool(res.upload_command_pool, None); - } - - device.unmap_memory(res.bitstream_buffer_memory); - device.destroy_buffer(res.bitstream_buffer, None); - device.free_memory(res.bitstream_buffer_memory, None); - - device.destroy_image_view(res.input_image_view, None); - device.destroy_image(res.input_image, None); - device.free_memory(res.input_image_memory, None); - - for view in res.dpb_image_views { - device.destroy_image_view(*view, None); - } - for image in res.dpb_images { - device.destroy_image(*image, None); - } - for memory in res.dpb_image_memories { - device.free_memory(*memory, None); - } - - if res.session_params != vk::VideoSessionParametersKHR::null() { - 
(video_queue_fn.fp().destroy_video_session_parameters_khr)( - device.handle(), - res.session_params, - std::ptr::null(), - ); - } - (video_queue_fn.fp().destroy_video_session_khr)(device.handle(), res.session, std::ptr::null()); - - for memory in res.session_memory { - device.free_memory(*memory, None); - } -} - /// Record DPB image barriers for encode. /// /// Transitions the setup DPB slot from UNDEFINED to VIDEO_ENCODE_DPB and @@ -1397,55 +1469,75 @@ pub(crate) unsafe fn record_post_encode_dpb_barrier( ); } -/// Submit an encode command buffer and wait for completion. +/// Submit an encode command buffer to the encode queue without waiting. +/// +/// This is the asynchronous half of the encode submit. Use `wait_and_read_bitstream` +/// later to drain the result. Lets pipelined encoders (H.265 with depth > 1) keep +/// multiple encodes in flight on the encode queue. /// -/// Submits the command buffer to the encode queue, waits for the fence, -/// then reads query results and copies the encoded bitstream data. -/// The fence is reset before submission so it may be in any state on entry. +/// The fence is reset before submission so it may be in any state on entry, and +/// will be signaled when the GPU encode finishes. /// /// # Safety /// /// The command buffer must have been ended. -/// The bitstream buffer pointer must be valid and the buffer must be persistently mapped. 
-pub(crate) unsafe fn submit_encode_and_read_bitstream(
+pub(crate) unsafe fn submit_encode_only(
 device: &ash::Device,
 command_buffer: vk::CommandBuffer,
 fence: vk::Fence,
 encode_queue: vk::Queue,
- query_pool: vk::QueryPool,
- bitstream_buffer_ptr: *const u8,
-) -> Result<Vec<u8>> {
- let submit_info =
+ wait_semaphore: Option<vk::Semaphore>,
+) -> Result<()> {
+ let wait_semaphores: Vec<vk::Semaphore>;
+ let wait_dst_stage_mask: Vec<vk::PipelineStageFlags>;
+
+ let mut submit_info =
 vk::SubmitInfo::default().command_buffers(std::slice::from_ref(&command_buffer));
- // Reset the fence before submit (it may be signaled from a previous encode
- // or from initial creation with SIGNALED_BIT). This ensures the fence is
- // unsignaled for queue_submit, and after wait_for_fences it stays signaled —
- // which lets set_color_description() safely wait on it between encodes.
+ if let Some(sem) = wait_semaphore {
+ wait_semaphores = vec![sem];
+ wait_dst_stage_mask = vec![vk::PipelineStageFlags::ALL_COMMANDS];
+ submit_info = submit_info
+ .wait_semaphores(&wait_semaphores)
+ .wait_dst_stage_mask(&wait_dst_stage_mask);
+ }
+
 device
 .reset_fences(&[fence])
 .map_err(|e| PixelForgeError::Synchronization(e.to_string()))?;
-
 device
 .queue_submit(encode_queue, &[submit_info], fence)
 .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?;
+ Ok(())
+}

+/// Wait on the encode fence and read the bitstream produced by a prior
+/// `submit_encode_only` call on the same fence/query_pool/buffer triple.
+///
+/// # Safety
+///
+/// The fence must be the one signaled by the encode submission whose bitstream
+/// is being drained here, and `bitstream_buffer_ptr` must point to the
+/// persistently-mapped bitstream buffer for that submission.
+pub(crate) unsafe fn wait_and_read_bitstream(
+ device: &ash::Device,
+ fence: vk::Fence,
+ query_pool: vk::QueryPool,
+ bitstream_buffer_ptr: *const u8,
+) -> Result<Vec<u8>> {
 device
 .wait_for_fences(&[fence], true, u64::MAX)
 .map_err(|e| PixelForgeError::CommandBuffer(e.to_string()))?;
- // Read query results (offset + bytes_written).
 #[repr(C)]
 struct QueryResult {
 offset: u32,
 bytes_written: u32,
 }
-
 let mut query_results = [QueryResult {
 offset: 0,
 bytes_written: 0,
 }];
-
 device
 .get_query_pool_results(
 query_pool,
@@ -1457,19 +1549,16 @@ pub(crate) unsafe fn submit_encode_and_read_bitstream(
 let offset = query_results[0].offset as usize;
 let size = query_results[0].bytes_written as usize;
-
 if size == 0 {
 return Err(PixelForgeError::QueryPool(
 "Encoder produced 0 bytes".to_string(),
 ));
 }
-
 tracing::debug!("Encoded frame: offset={}, size={}", offset, size);
 let mut encoded_data = vec![0u8; size];
 let src = std::slice::from_raw_parts(bitstream_buffer_ptr.add(offset), size);
 encoded_data.copy_from_slice(src);
-
 Ok(encoded_data)
}
diff --git a/src/vulkan.rs b/src/vulkan.rs
index f3fa382..f4718b7 100644
--- a/src/vulkan.rs
+++ b/src/vulkan.rs
@@ -78,6 +78,12 @@ struct VideoContextInner {
 device_properties: vk::PhysicalDeviceProperties,
 supported_encode_codecs: Vec,
 has_descriptor_buffer: bool,
+ /// `true` when `VK_VALVE_video_encode_rgb_conversion` was both reported by
+ /// the device and enabled at `vkCreateDevice` time. Encoder codepaths
+ /// can use this to opt into the hardware-direct RGB→YUV path that lets
+ /// VCN do the colour conversion inline (skipping a separate compute
+ /// shader). When `false`, callers must run their own RGB→YUV step.
+ rgb_conversion_supported: bool,
}

impl Drop for VideoContextInner {
@@ -159,6 +165,17 @@ impl VideoContext {
 pub fn has_descriptor_buffer(&self) -> bool {
 self.inner.has_descriptor_buffer
 }
+
+ /// Returns `true` when the device supports — and we have enabled —
+ /// `VK_VALVE_video_encode_rgb_conversion`.
When this is true, encoders + /// may opt into the hardware-direct RGB input path (VCN performs the + /// RGB→YUV conversion inline during the encode pass), eliminating the + /// need for a separate compute-shader colour converter. Currently only + /// AMD's RADV driver supports this; on other vendors the answer is + /// `false` and the caller must keep its own conversion path. + pub fn supports_rgb_direct_encode(&self) -> bool { + self.inner.rgb_conversion_supported + } } impl VideoContext { @@ -465,6 +482,36 @@ impl VideoContext { // Add the 2-plane 444 formats extension. push_ext(ash::ext::ycbcr_2plane_444_formats::NAME.as_ptr()); + // Probe for VK_VALVE_video_encode_rgb_conversion. When the device + // supports it we enable both the extension and its feature, which + // lets encoders take RGB images directly and have VCN do RGB→YUV + // conversion inline. Vendors without the extension (NVIDIA, Intel, + // most non-RADV stacks today) fall through and callers continue + // using their own RGB→YUV step. + let device_exts = + unsafe { instance.enumerate_device_extension_properties(physical_device) } + .unwrap_or_default(); + let rgb_conversion_supported = video_encode_queue_family.is_some() + && device_exts.iter().any(|ext| { + ext.extension_name_as_c_str() + .map(|n| n == ash::valve::video_encode_rgb_conversion::NAME) + .unwrap_or(false) + }); + let mut rgb_conv_features = + vk::PhysicalDeviceVideoEncodeRgbConversionFeaturesVALVE::default() + .video_encode_rgb_conversion(true); + if rgb_conversion_supported { + push_ext(ash::valve::video_encode_rgb_conversion::NAME.as_ptr()); + info!( + "VK_VALVE_video_encode_rgb_conversion supported, enabling RGB-direct encode path" + ); + } else if video_encode_queue_family.is_some() { + debug!( + "VK_VALVE_video_encode_rgb_conversion not supported on this device — \ + encoders will run with caller-provided YUV input" + ); + } + // Enable AV1 video encode feature only if AV1 is supported. 
// Only include AV1 features in the pNext chain when AV1 is actually supported, // to avoid chaining unknown feature structs on devices without AV1. @@ -536,6 +583,24 @@ impl VideoContext { && desc_buf_features.descriptor_buffer != 0 && desc_buf_features.descriptor_buffer_capture_replay != 0; + // Splice the RGB conversion feature onto the end of the chain when + // the device supports it. Walks from `sync2_features` (which the + // descriptor-buffer block leaves at the middle of the chain) to the + // tail and appends — keeping it conditional avoids passing an + // unknown feature struct on devices that don't recognise it. + if rgb_conversion_supported { + unsafe { + let mut cursor: *mut vk::BaseOutStructure = + (&mut sync2_features as *mut vk::PhysicalDeviceSynchronization2Features).cast(); + while !(*cursor).p_next.is_null() { + cursor = (*cursor).p_next.cast(); + } + (*cursor).p_next = (&mut rgb_conv_features + as *mut vk::PhysicalDeviceVideoEncodeRgbConversionFeaturesVALVE) + .cast(); + } + } + // Log all extensions being enabled debug!("Enabling {} device extensions:", extension_names.len()); for ext_name_ptr in &extension_names { @@ -589,6 +654,7 @@ impl VideoContext { device_properties, supported_encode_codecs, has_descriptor_buffer, + rgb_conversion_supported, }), }) }