GBR improvements, YUV 4:0:0 improvements

awxkee · Nov 30, 2024 · 79028e6 · 79028e6
1 parent 744dbbe
commit 79028e6
Show file tree

Hide file tree

Showing 7 changed files with 417 additions and 14 deletions.
diff --git a/app/src/main.rs b/app/src/main.rs
@@ -95,16 +95,17 @@ fn main() {
         YuvBiPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420);
 
     let mut planar_image =
-        YuvPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv444);
+        YuvPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420);
 
     // let mut bytes_16: Vec<u16> = src_bytes.iter().map(|&x| (x as u16) << 4).collect();
 
     let start_time = Instant::now();
-    rgb_to_gbr(
+    rgb_to_yuv420(
         &mut planar_image,
         &src_bytes,
         rgba_stride as u32,
         YuvRange::Limited,
+        YuvStandardMatrix::Bt601,
     )
     .unwrap();
     // bytes_16.fill(0);
@@ -257,11 +258,12 @@ fn main() {
     // let rgba_stride = width as usize * 4;
     // let mut rgba = vec![0u8; height as usize * rgba_stride];
 
-    gbr_to_rgb(
+    yuv420_to_rgb(
         &fixed_planar,
         &mut rgba,
         rgba_stride as u32,
         YuvRange::Limited,
+        YuvStandardMatrix::Bt601,
     )
     .unwrap();
 

diff --git a/src/from_identity.rs b/src/from_identity.rs
@@ -158,8 +158,7 @@ impl FullRangeWideRow<u16> for WideRowGbrProcessor<u16> {
         _start_cx: usize,
         _width: usize,
     ) -> usize {
-        let mut _cx = 0;
-        _cx
+        0
     }
 }
 

diff --git a/src/neon/mod.rs b/src/neon/mod.rs
@@ -40,6 +40,7 @@ mod rgba_to_yuv;
 mod rgba_to_yuv420;
 mod y_p16_to_rgba16;
 mod y_to_rgb;
+mod y_to_rgb_alpha;
 mod ycgco_to_rgb;
 mod ycgco_to_rgb_alpha;
 mod ycgcor_to_rgb;
@@ -61,7 +62,7 @@ mod yuy2_to_yuv;
 pub(crate) use gbr_to_rgb::{
     yuv_to_rgba_row_full, yuv_to_rgba_row_limited, yuv_to_rgba_row_limited_rdm,
 };
-pub(crate) use rgb_to_y::neon_rgb_to_y_row;
+pub(crate) use rgb_to_y::{neon_rgb_to_y_rdm, neon_rgb_to_y_row};
 pub(crate) use rgb_to_ycgco::neon_rgb_to_ycgco_row;
 pub(crate) use rgb_to_ycgco_r::neon_rgb_to_ycgcor_row;
 pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
@@ -70,6 +71,7 @@ pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
 pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
 pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
 pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
+pub(crate) use y_to_rgb_alpha::{neon_y_to_rgb_alpha_row, neon_y_to_rgb_row_alpha_rdm};
 pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
 pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;
 pub(crate) use ycgcor_to_rgb::neon_ycgcor_to_rgb_row;

diff --git a/src/neon/rgb_to_y.rs b/src/neon/rgb_to_y.rs
@@ -31,7 +31,7 @@ use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels
 use std::arch::aarch64::*;
 
 #[target_feature(enable = "rdm")]
-pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
+pub(crate) unsafe fn neon_rgb_to_y_rdm<const ORIGIN_CHANNELS: u8>(
     transform: &CbCrForwardTransform<i32>,
     range: &YuvChromaRange,
     y_plane: *mut u8,
@@ -51,7 +51,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
     let v_yr = vdupq_n_s16(transform.yr as i16);
     let v_yg = vdupq_n_s16(transform.yg as i16);
     let v_yb = vdupq_n_s16(transform.yb as i16);
-    let v_zeros = vdupq_n_s16(0i16);
 
     let i_bias_y = vdupq_n_s16(range.bias_y as i16);
     let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);
@@ -96,7 +95,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
         let mut y_high = vqrdmlahq_s16(y_bias, r_high, v_yr);
         y_high = vqrdmlahq_s16(y_high, g_high, v_yg);
         y_high = vqrdmlahq_s16(y_high, b_high, v_yb);
-        y_high = vmaxq_s16(y_high, v_zeros);
 
         let y_high = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y_high, i_bias_y)), i_cap_y);
 
@@ -107,7 +105,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
         let mut y_low = vqrdmlahq_s16(y_bias, r_low, v_yr);
         y_low = vqrdmlahq_s16(y_low, g_low, v_yg);
         y_low = vqrdmlahq_s16(y_low, b_low, v_yb);
-        y_low = vmaxq_s16(y_low, v_zeros);
 
         let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y_low, i_bias_y)), i_cap_y);
 
@@ -119,3 +116,131 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
 
     cx
 }
+
+/// Converts a run of interleaved RGB/BGR/RGBA/BGRA pixels into 8-bit luma using plain
+/// NEON widening multiply-accumulate (no `rdm` extension needed).
+///
+/// Pixels are consumed 16 at a time, starting at `start_cx`, for as long as
+/// `cx + 16 < width`. The luma weights (`yr`, `yg`, `yb`) are treated as fixed-point
+/// values with `PRECISION` fractional bits: products are accumulated in 32 bits, shifted
+/// right by `PRECISION`, clamped to `[bias_y, bias_y + range_y]` and stored as `u8`.
+///
+/// Returns the index of the first pixel that was NOT processed, so the caller can finish
+/// the remainder of the row with another code path.
+///
+/// # Safety
+///
+/// * `rgba` must hold at least `width * channels` bytes, where `channels` is the channel
+///   count of `ORIGIN_CHANNELS`; every iteration loads `16 * channels` bytes through a
+///   raw pointer without bounds checks.
+/// * `y_plane` must be valid for writes of at least `width` bytes; every iteration
+///   stores 16 bytes at offset `cx`.
+pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8, const PRECISION: i32>(
+    transform: &CbCrForwardTransform<i32>,
+    range: &YuvChromaRange,
+    y_plane: *mut u8,
+    rgba: &[u8],
+    start_cx: usize,
+    width: usize,
+) -> usize {
+    let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
+    let channels = source_channels.get_channels_count();
+
+    // The accumulators below are fixed-point with `PRECISION` fractional bits, so the
+    // luma offset has to be moved into that domain before it is added; otherwise it is
+    // discarded by the narrowing shift. The extra half unit turns the truncating shift
+    // into round-to-nearest.
+    let rounding_half = 1i32 << (PRECISION - 1);
+    let bias_y = ((range.bias_y as i32) << PRECISION) + rounding_half;
+
+    let y_ptr = y_plane;
+    let rgba_ptr = rgba.as_ptr();
+
+    let y_bias = vdupq_n_s32(bias_y);
+    // Only lanes 0..=2 (the luma weights) are used by the lane-indexed multiplies below;
+    // the remaining lanes just fill the 8-lane register.
+    let weights_arr: [i16; 8] = [
+        transform.yr as i16,
+        transform.yg as i16,
+        transform.yb as i16,
+        transform.cb_r as i16,
+        transform.cb_g as i16,
+        transform.cb_b as i16,
+        transform.cr_r as i16,
+        transform.cr_g as i16,
+    ];
+    let v_weights = vld1q_s16(weights_arr.as_ptr());
+
+    // Output clamp bounds: [bias_y, bias_y + range_y].
+    let i_bias_y = vdupq_n_s16(range.bias_y as i16);
+    let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);
+
+    let mut cx = start_cx;
+    while cx + 16 < width {
+        let r_values_u8: uint8x16_t;
+        let g_values_u8: uint8x16_t;
+        let b_values_u8: uint8x16_t;
+
+        // De-interleave 16 pixels into per-channel registers; alpha (if any) is ignored.
+        match source_channels {
+            YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
+                let rgb_values = vld3q_u8(rgba_ptr.add(cx * channels));
+                if source_channels == YuvSourceChannels::Rgb {
+                    r_values_u8 = rgb_values.0;
+                    g_values_u8 = rgb_values.1;
+                    b_values_u8 = rgb_values.2;
+                } else {
+                    r_values_u8 = rgb_values.2;
+                    g_values_u8 = rgb_values.1;
+                    b_values_u8 = rgb_values.0;
+                }
+            }
+            YuvSourceChannels::Rgba => {
+                let rgb_values = vld4q_u8(rgba_ptr.add(cx * channels));
+                r_values_u8 = rgb_values.0;
+                g_values_u8 = rgb_values.1;
+                b_values_u8 = rgb_values.2;
+            }
+            YuvSourceChannels::Bgra => {
+                let rgb_values = vld4q_u8(rgba_ptr.add(cx * channels));
+                r_values_u8 = rgb_values.2;
+                g_values_u8 = rgb_values.1;
+                b_values_u8 = rgb_values.0;
+            }
+        }
+
+        // Upper 8 pixels: widen u8 -> i16, then accumulate in two groups of four i32.
+        let r_high = vreinterpretq_s16_u16(vmovl_high_u8(r_values_u8));
+        let g_high = vreinterpretq_s16_u16(vmovl_high_u8(g_values_u8));
+        let b_high = vreinterpretq_s16_u16(vmovl_high_u8(b_values_u8));
+
+        let r_h_low = vget_low_s16(r_high);
+        let g_h_low = vget_low_s16(g_high);
+        let b_h_low = vget_low_s16(b_high);
+
+        let mut y_h_high = vmlal_high_laneq_s16::<0>(y_bias, r_high, v_weights);
+        y_h_high = vmlal_high_laneq_s16::<1>(y_h_high, g_high, v_weights);
+        y_h_high = vmlal_high_laneq_s16::<2>(y_h_high, b_high, v_weights);
+
+        let mut y_h_low = vmlal_laneq_s16::<0>(y_bias, r_h_low, v_weights);
+        y_h_low = vmlal_laneq_s16::<1>(y_h_low, g_h_low, v_weights);
+        y_h_low = vmlal_laneq_s16::<2>(y_h_low, b_h_low, v_weights);
+
+        // Drop the fractional bits, then clamp into the legal luma range.
+        let y_high = vminq_u16(
+            vreinterpretq_u16_s16(vmaxq_s16(
+                vcombine_s16(
+                    vshrn_n_s32::<PRECISION>(y_h_low),
+                    vshrn_n_s32::<PRECISION>(y_h_high),
+                ),
+                i_bias_y,
+            )),
+            i_cap_y,
+        );
+
+        // Lower 8 pixels: same sequence as above.
+        let r_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_values_u8)));
+        let g_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(g_values_u8)));
+        let b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_values_u8)));
+
+        let r_l_low = vget_low_s16(r_low);
+        let g_l_low = vget_low_s16(g_low);
+        let b_l_low = vget_low_s16(b_low);
+
+        let mut y_l_high = vmlal_high_laneq_s16::<0>(y_bias, r_low, v_weights);
+        y_l_high = vmlal_high_laneq_s16::<1>(y_l_high, g_low, v_weights);
+        y_l_high = vmlal_high_laneq_s16::<2>(y_l_high, b_low, v_weights);
+
+        let mut y_l_low = vmlal_laneq_s16::<0>(y_bias, r_l_low, v_weights);
+        y_l_low = vmlal_laneq_s16::<1>(y_l_low, g_l_low, v_weights);
+        y_l_low = vmlal_laneq_s16::<2>(y_l_low, b_l_low, v_weights);
+
+        let y_low = vminq_u16(
+            vreinterpretq_u16_s16(vmaxq_s16(
+                vcombine_s16(
+                    vshrn_n_s32::<PRECISION>(y_l_low),
+                    vshrn_n_s32::<PRECISION>(y_l_high),
+                ),
+                i_bias_y,
+            )),
+            i_cap_y,
+        );
+
+        // Narrow both halves back to u8 and store 16 luma samples.
+        let y = vcombine_u8(vmovn_u16(y_low), vmovn_u16(y_high));
+        vst1q_u8(y_ptr.add(cx), y);
+
+        cx += 16;
+    }
+
+    cx
+}