diff --git a/src/api/channel/by_gop.rs b/src/api/channel/by_gop.rs index d7024e90bf..799faf6e55 100644 --- a/src/api/channel/by_gop.rs +++ b/src/api/channel/by_gop.rs @@ -39,15 +39,15 @@ impl SubGop { */ // TODO: Make the detector logic fitting the model -struct SceneChange { +struct SceneChange { frames: usize, pyramid_size: usize, processed: u64, last_keyframe: u64, - detector: SceneChangeDetector, + detector: SceneChangeDetector, } -impl SceneChange { +impl SceneChange { fn new(pyramid_size: usize, enc: &EncoderConfig) -> Self { let seq = Arc::new(Sequence::new(enc)); @@ -56,7 +56,6 @@ impl SceneChange { CpuFeatureLevel::default(), pyramid_size, seq, - true, ); Self { frames: 0, pyramid_size, processed: 0, last_keyframe: 0, detector } @@ -64,9 +63,7 @@ impl SceneChange { // Tell where to split the lookahead // - fn split( - &mut self, lookahead: &[Arc>], - ) -> Option<(usize, bool)> { + fn split(&mut self, lookahead: &[Arc>]) -> Option<(usize, bool)> { self.processed += 1; let new_gop = self.detector.analyze_next_frame( diff --git a/src/api/internal.rs b/src/api/internal.rs index 07625471f8..4dbb474052 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -246,7 +246,7 @@ pub(crate) struct ContextInner { gop_output_frameno_start: BTreeMap, /// Maps `output_frameno` to `gop_input_frameno_start`. pub(crate) gop_input_frameno_start: BTreeMap, - keyframe_detector: SceneChangeDetector, + keyframe_detector: SceneChangeDetector, pub(crate) config: Arc, seq: Arc, pub(crate) rc_state: RCState, @@ -291,7 +291,6 @@ impl ContextInner { CpuFeatureLevel::default(), lookahead_distance, seq.clone(), - true, ), config: Arc::new(*enc), seq, diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index b8a5f57054..48916a3cc5 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -13,35 +13,41 @@ use crate::cpu_features::CpuFeatureLevel; use crate::encoder::Sequence; use crate::frame::*; use crate::util::{CastFromPrimitive, Pixel}; +use itertools::Itertools; use rust_hawktracer::*; -use std::collections::BTreeSet; use std::sync::Arc; +use std::{cmp, u64}; /// Runs keyframe detection on frames from the lookahead queue. -pub struct SceneChangeDetector { +pub struct SceneChangeDetector { /// Minimum average difference between YUV deltas that will trigger a scene change. - threshold: u64, + threshold: usize, /// Fast scene cut detection mode, uses simple SAD instead of encoder cost estimates. fast_mode: bool, - /// Determine whether or not short scene flashes should be excluded - exclude_scene_flashes: bool, - /// Frames that cannot be marked as keyframes due to the algorithm excluding them. - /// Storing the frame numbers allows us to avoid looking back more than one frame. - excluded_frames: BTreeSet, + /// scaling factor for fast scene detection + scale_factor: usize, + // Frame buffer for scaled frames + frame_buffer: Vec>, + // Deque offset for current + lookahead_offset: usize, + // Start deque offset based on lookahead + deque_offset: usize, + // Scenechange results for adaptive threshold + score_deque: Vec<(f64, f64)>, + /// Number of pixels in scaled frame for fast mode + pixels: usize, /// The bit depth of the video. bit_depth: usize, /// The CPU feature level to be used. cpu_feature_level: CpuFeatureLevel, encoder_config: EncoderConfig, - lookahead_distance: usize, sequence: Arc, } -impl SceneChangeDetector { +impl SceneChangeDetector { pub fn new( encoder_config: EncoderConfig, cpu_feature_level: CpuFeatureLevel, lookahead_distance: usize, sequence: Arc, - exclude_scene_flashes: bool, ) -> Self { // This implementation is based on a Python implementation at // https://pyscenedetect.readthedocs.io/en/latest/reference/detection-methods/. @@ -54,20 +60,47 @@ impl SceneChangeDetector { // This may be adjusted later. // // This threshold is only used for the fast scenecut implementation. - const BASE_THRESHOLD: u64 = 12; + // + // Testing shown that default threshold of 12 overallocates keyframes by almost double, + // compared to other scene change implementations + const BASE_THRESHOLD: usize = 25; let bit_depth = encoder_config.bit_depth; let fast_mode = encoder_config.speed_settings.fast_scene_detection || encoder_config.low_latency; + // Scale factor for fast scene detection + let scale_factor = + if fast_mode { detect_scale_factor(&sequence) } else { 1_usize }; + + // Set lookahead offset to 5 if normal lookahead available + let lookahead_offset = if lookahead_distance >= 5 { 5 } else { 0 }; + let deque_offset = lookahead_offset; + + let score_deque = Vec::with_capacity(5 + lookahead_distance); + + // Pixel count for fast scenedetect + let pixels = if fast_mode { + (sequence.max_frame_height as usize / scale_factor) + * (sequence.max_frame_width as usize / scale_factor) + } else { + 1 + }; + + let frame_buffer = + if fast_mode { Vec::with_capacity(2) } else { Vec::new() }; + Self { - threshold: BASE_THRESHOLD * bit_depth as u64 / 8, + threshold: BASE_THRESHOLD * bit_depth / 8, fast_mode, - exclude_scene_flashes, - excluded_frames: BTreeSet::new(), + scale_factor, + frame_buffer, + lookahead_offset, + deque_offset, + score_deque, + pixels, bit_depth, cpu_feature_level, encoder_config, - lookahead_distance, sequence, } } @@ -81,10 +114,13 @@ impl SceneChangeDetector { /// /// This will gracefully handle the first frame in the video as well. #[hawktracer(analyze_next_frame)] - pub fn analyze_next_frame( + pub fn analyze_next_frame( &mut self, frame_set: &[Arc>], input_frameno: u64, previous_keyframe: u64, ) -> bool { + // Use score deque for adaptive threshold for scene cut + // Declare score_deque offset based on lookahead for scene change scores + // Find the distance to the previous keyframe. let distance = input_frameno - previous_keyframe; @@ -92,7 +128,7 @@ impl SceneChangeDetector { return false; } - // Handle minimum and maximum key frame intervals. + // Handle minimum and maximum keyframe intervals. if distance < self.encoder_config.min_key_frame_interval { return false; } @@ -104,215 +140,244 @@ impl SceneChangeDetector { return false; } - if self.exclude_scene_flashes { - self.exclude_scene_flashes(frame_set, input_frameno, previous_keyframe); - } - - self.is_key_frame( - frame_set[0].clone(), - frame_set[1].clone(), - input_frameno, - previous_keyframe, - ) - } + // Initiallization of score deque + // based on frame set length + if self.deque_offset > 0 + && frame_set.len() > self.deque_offset + 1 + && self.score_deque.is_empty() + { + self.initialize_score_deque( + frame_set, + input_frameno, + previous_keyframe, + self.deque_offset, + ); + } else if self.score_deque.is_empty() { + self.initialize_score_deque( + frame_set, + input_frameno, + previous_keyframe, + frame_set.len() - 1, + ); - /// Determines if `current_frame` should be a keyframe. - fn is_key_frame( - &self, previous_frame: Arc>, current_frame: Arc>, - current_frameno: u64, previous_keyframe: u64, - ) -> bool { - if self.excluded_frames.contains(¤t_frameno) { - return false; + self.deque_offset = frame_set.len() - 2; + } + // Running single frame comparison and adding it to deque + // Decrease deque offset if there is no new frames + if frame_set.len() > self.deque_offset + 1 { + self.run_comparison( + frame_set[self.deque_offset].clone(), + frame_set[self.deque_offset + 1].clone(), + input_frameno, + previous_keyframe, + ); + } else { + self.deque_offset -= 1; } - let result = self.has_scenecut( - previous_frame, - current_frame, - current_frameno, - previous_keyframe, - ); + // Adaptive scenecut check + let scenecut = self.adaptive_scenecut(); debug!( - "[SC-Detect] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - current_frameno - 1, - current_frameno, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "Scenecut" } else { "No cut" } + "[SC-Detect] Frame {}: I={:4.0} T= {:.0} {}", + input_frameno, + self.score_deque[self.deque_offset].0, + self.score_deque[self.deque_offset].1, + if scenecut { "Scenecut" } else { "No cut" } ); - result.has_scenecut - } - /// Uses lookahead to avoid coding short flashes as scenecuts. - /// Saves excluded frame numbers in `self.excluded_frames`. - fn exclude_scene_flashes( - &mut self, frame_subset: &[Arc>], frameno: u64, - previous_keyframe: u64, - ) { - let lookahead_distance = self.lookahead_distance; - - if frame_subset.len() - 1 < lookahead_distance { - // Don't add a keyframe in the last frame pyramid. - // It's effectively the same as a scene flash, - // and really wasteful for compression. - for frame in frameno..=(frameno + lookahead_distance as u64) { - self.excluded_frames.insert(frame); + if scenecut { + // Clear buffers and deque + self.frame_buffer.clear(); + debug!("[SC-score-deque]{:.0?}", self.score_deque); + self.score_deque.clear(); + } else { + // Keep score deque of 5 backward frames + // and forward frames of lenght of lookahead offset + if self.score_deque.len() > 5 + self.lookahead_offset { + self.score_deque.pop(); } - return; } - // Where A and B are scenes: AAAAAABBBAAAAAA - // If BBB is shorter than lookahead_distance, it is detected as a flash - // and not considered a scenecut. - // - // Search starting with the furthest frame, - // to enable early loop exit if we find a scene flash. - for j in (1..=lookahead_distance).rev() { - let result = self.has_scenecut( - frame_subset[0].clone(), - frame_subset[j].clone(), - frameno - 1 + j as u64, + scenecut + } + + // Initially fill score deque with frame scores + fn initialize_score_deque( + &mut self, frame_set: &[Arc>], input_frameno: u64, + previous_keyframe: u64, init_len: usize, + ) { + for x in 0..init_len { + self.run_comparison( + frame_set[x].clone(), + frame_set[x + 1].clone(), + input_frameno, previous_keyframe, ); - debug!( - "[SF-Detect-1] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - frameno - 1, - frameno - 1 + j as u64, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "No flash" } else { "Scene flash" } - ); - if !result.has_scenecut { - // Any frame in between `0` and `j` cannot be a real scenecut. - for i in 0..=j { - let frameno = frameno + i as u64 - 1; - self.excluded_frames.insert(frameno); - } - // Because all frames in this gap are already excluded, - // exit the loop early as an optimization. - break; - } } + } - // Where A-F are scenes: AAAAABBCCDDEEFFFFFF - // If each of BB ... EE are shorter than `lookahead_distance`, they are - // detected as flashes and not considered scenecuts. - // Instead, the first F frame becomes a scenecut. - // If the video ends before F, no frame becomes a scenecut. - for i in 1..lookahead_distance { - let result = self.has_scenecut( - frame_subset[i].clone(), - frame_subset[lookahead_distance].clone(), - frameno - 1 + lookahead_distance as u64, - previous_keyframe, - ); - debug!( - "[SF-Detect-2] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - frameno - 1 + i as u64, - frameno - 1 + lookahead_distance as u64, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "Scene flash" } else { "No flash" } - ); - if result.has_scenecut { - // If the current frame is the frame before a scenecut, it cannot also be the frame of a scenecut. - let frameno = frameno + i as u64 - 1; - self.excluded_frames.insert(frameno); + /// Runs scene change comparison beetween 2 given frames + /// Insert result to start of score deque + fn run_comparison( + &mut self, frame1: Arc>, frame2: Arc>, + input_frameno: u64, previous_keyframe: u64, + ) { + let result = if self.fast_mode { + self.fast_scenecut(frame1, frame2) + } else { + self.cost_scenecut(frame1, frame2, input_frameno, previous_keyframe) + }; + self + .score_deque + .insert(0, (result.inter_cost as f64, result.intra_cost as f64)); + } + + /// Compares current scene score to adapted threshold based on previous scores + /// Value of current frame is offset by lookahead, if lookahead >=5 + /// Returns true if current scene score is higher than adapted threshold + fn adaptive_scenecut(&mut self) -> bool { + let mut cloned_deque = self.score_deque.to_vec(); + cloned_deque.remove(self.deque_offset); + + let scene_score = self.score_deque[self.deque_offset].0; + let scene_threshold = self.score_deque[self.deque_offset].1; + + if scene_score >= scene_threshold as f64 { + let back_deque = self.score_deque[self.deque_offset + 1..].to_vec(); + let forward_deque = self.score_deque[..self.deque_offset].to_vec(); + let back_over_tr = + back_deque.iter().filter(|(x, y)| x > y).collect_vec(); + + let forward_over_tr = + forward_deque.iter().filter(|(x, y)| x > y).collect_vec(); + + // Check for scenecut after the flashes + // No frames over threshold forward + // and some frames over threshold backward + if !back_over_tr.is_empty() + && forward_over_tr.is_empty() + && back_deque.len() > 1 + && back_over_tr.len() > 1 + { + return true; + } + + // Check for scenecut before flash + // If distance longer than max flash length + if back_over_tr.is_empty() + && forward_over_tr.len() == 1 + && forward_deque[0].0 > forward_deque[0].1 + { + return true; + } + + if !back_over_tr.is_empty() || !forward_over_tr.is_empty() { + return false; } } + + scene_score >= scene_threshold + } + + /// The fast algorithm detects fast cuts using a raw difference + /// in pixel values between the scaled frames. + #[hawktracer(fast_scenecut)] + fn fast_scenecut( + &mut self, frame1: Arc>, frame2: Arc>, + ) -> ScenecutResult { + // Downscaling both frames for comparison + // Moving scaled frames to buffer + if self.frame_buffer.is_empty() { + let frame1_scaled = frame1.planes[0].downscale(self.scale_factor); + self.frame_buffer.push(frame1_scaled); + + let frame2_scaled = frame2.planes[0].downscale(self.scale_factor); + self.frame_buffer.push(frame2_scaled); + } else { + self.frame_buffer.remove(0); + self.frame_buffer.push(frame2.planes[0].downscale(self.scale_factor)); + } + + let delta = + self.delta_in_planes(&self.frame_buffer[0], &self.frame_buffer[1]); + + ScenecutResult { + intra_cost: self.threshold as f64, + threshold: self.threshold as f64, + inter_cost: delta as f64, + } } /// Run a comparison between two frames to determine if they qualify for a scenecut. /// - /// The standard algorithm uses block intra and inter costs + /// Using block intra and inter costs /// to determine which method would be more efficient /// for coding this frame. - /// - /// The fast algorithm detects fast cuts using a raw difference - /// in pixel values between the frames. - /// It does not handle pans well, but the scene flash detection compensates for this - /// in many cases. - fn has_scenecut( + #[hawktracer(cost_scenecut)] + fn cost_scenecut( &self, frame1: Arc>, frame2: Arc>, frameno: u64, previous_keyframe: u64, ) -> ScenecutResult { - if self.fast_mode { - let len = frame2.planes[0].cfg.width * frame2.planes[0].cfg.height; - let delta = self.delta_in_planes(&frame1.planes[0], &frame2.planes[0]); - let threshold = self.threshold * len as u64; - ScenecutResult { - intra_cost: threshold as f64, - threshold: threshold as f64, - inter_cost: delta as f64, - has_scenecut: delta >= threshold, - } + let frame2_ref2 = Arc::clone(&frame2); + let (intra_cost, inter_cost) = crate::rayon::join( + move || { + let intra_costs = estimate_intra_costs( + &*frame2, + self.bit_depth, + self.cpu_feature_level, + ); + intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 + / intra_costs.len() as f64 + }, + move || { + let inter_costs = estimate_inter_costs( + frame2_ref2, + frame1, + self.bit_depth, + self.encoder_config, + self.sequence.clone(), + ); + inter_costs.iter().map(|&cost| cost as u64).sum::() as f64 + / inter_costs.len() as f64 + }, + ); + + // Sliding scale, more likely to choose a keyframe + // as we get farther from the last keyframe. + // Based on x264 scenecut code. + // + // `THRESH_MAX` determines how likely we are + // to choose a keyframe, between 0.0-1.0. + // Higher values mean we are more likely to choose a keyframe. + // `0.4` was chosen based on trials of the `scenecut-720p` set in AWCY, + // as it appeared to provide the best average compression. + // This also matches the default scenecut threshold in x264. + const THRESH_MAX: f64 = 0.4; + const THRESH_MIN: f64 = THRESH_MAX * 0.25; + let distance_from_keyframe = frameno - previous_keyframe; + let min_keyint = self.encoder_config.min_key_frame_interval; + let max_keyint = self.encoder_config.max_key_frame_interval; + let bias = if distance_from_keyframe <= min_keyint / 4 { + THRESH_MIN / 4.0 + } else if distance_from_keyframe <= min_keyint { + THRESH_MIN * distance_from_keyframe as f64 / min_keyint as f64 } else { - let frame2_ref2 = Arc::clone(&frame2); - let (intra_cost, inter_cost) = crate::rayon::join( - move || { - let intra_costs = estimate_intra_costs( - &*frame2, - self.bit_depth, - self.cpu_feature_level, - ); - intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 - / intra_costs.len() as f64 - }, - move || { - let inter_costs = estimate_inter_costs( - frame2_ref2, - frame1, - self.bit_depth, - self.encoder_config, - self.sequence.clone(), - ); - inter_costs.iter().map(|&cost| cost as u64).sum::() as f64 - / inter_costs.len() as f64 - }, - ); + THRESH_MIN + + (THRESH_MAX - THRESH_MIN) + * (distance_from_keyframe - min_keyint) as f64 + / (max_keyint - min_keyint) as f64 + }; + let threshold = intra_cost * (1.0 - bias); - // Sliding scale, more likely to choose a keyframe - // as we get farther from the last keyframe. - // Based on x264 scenecut code. - // - // `THRESH_MAX` determines how likely we are - // to choose a keyframe, between 0.0-1.0. - // Higher values mean we are more likely to choose a keyframe. - // `0.4` was chosen based on trials of the `scenecut-720p` set in AWCY, - // as it appeared to provide the best average compression. - // This also matches the default scenecut threshold in x264. - const THRESH_MAX: f64 = 0.4; - const THRESH_MIN: f64 = THRESH_MAX * 0.25; - let distance_from_keyframe = frameno - previous_keyframe; - let min_keyint = self.encoder_config.min_key_frame_interval; - let max_keyint = self.encoder_config.max_key_frame_interval; - let bias = if distance_from_keyframe <= min_keyint / 4 { - THRESH_MIN / 4.0 - } else if distance_from_keyframe <= min_keyint { - THRESH_MIN * distance_from_keyframe as f64 / min_keyint as f64 - } else { - THRESH_MIN - + (THRESH_MAX - THRESH_MIN) - * (distance_from_keyframe - min_keyint) as f64 - / (max_keyint - min_keyint) as f64 - }; - let threshold = intra_cost * (1.0 - bias); - - ScenecutResult { - intra_cost, - threshold, - inter_cost, - has_scenecut: inter_cost > threshold, - } - } + ScenecutResult { intra_cost, inter_cost, threshold } } - fn delta_in_planes( - &self, plane1: &Plane, plane2: &Plane, - ) -> u64 { + /// Calculates delta beetween 2 planes + /// returns average for pixel + #[hawktracer(delta_in_planes)] + fn delta_in_planes(&self, plane1: &Plane, plane2: &Plane) -> f64 { let mut delta = 0; + let lines = plane1.rows_iter().zip(plane2.rows_iter()); for (l1, l2) in lines { @@ -320,21 +385,43 @@ impl SceneChangeDetector { .iter() .zip(l2.iter()) .map(|(&p1, &p2)| { - (i16::cast_from(p1) - i16::cast_from(p2)).abs() as u64 + (i16::cast_from(p1) - i16::cast_from(p2)).abs() as u32 }) - .sum::(); - delta += delta_line; + .sum::(); + delta += delta_line as u64; } - delta + delta as f64 / self.pixels as f64 } } +/// Scaling factor for frame in scene detection +fn detect_scale_factor(sequence: &Arc) -> usize { + let small_edge = + cmp::min(sequence.max_frame_height, sequence.max_frame_width) as usize; + let scale_factor = match small_edge { + 0..=240 => 1, + 241..=480 => 2, + 481..=720 => 4, + 721..=1080 => 8, + 1081..=1600 => 16, + 1601..=std::usize::MAX => 32, + _ => 1, + } as usize; + debug!( + "Scene detection scale factor {}, [{},{}] -> [{},{}]", + scale_factor, + sequence.max_frame_width, + sequence.max_frame_height, + sequence.max_frame_width as usize / scale_factor, + sequence.max_frame_height as usize / scale_factor + ); + scale_factor +} + /// This struct primarily exists for returning metrics to the caller -/// for logging debug information. #[derive(Debug, Clone, Copy)] struct ScenecutResult { intra_cost: f64, inter_cost: f64, threshold: f64, - has_scenecut: bool, }