Skip to content

Commit

Permalink
GBR improvements, YUV 4:0:0 improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 30, 2024
1 parent 744dbbe commit 79028e6
Show file tree
Hide file tree
Showing 7 changed files with 417 additions and 14 deletions.
8 changes: 5 additions & 3 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,17 @@ fn main() {
YuvBiPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420);

let mut planar_image =
YuvPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv444);
YuvPlanarImageMut::<u8>::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420);

// let mut bytes_16: Vec<u16> = src_bytes.iter().map(|&x| (x as u16) << 4).collect();

let start_time = Instant::now();
rgb_to_gbr(
rgb_to_yuv420(
&mut planar_image,
&src_bytes,
rgba_stride as u32,
YuvRange::Limited,
YuvStandardMatrix::Bt601,
)
.unwrap();
// bytes_16.fill(0);
Expand Down Expand Up @@ -257,11 +258,12 @@ fn main() {
// let rgba_stride = width as usize * 4;
// let mut rgba = vec![0u8; height as usize * rgba_stride];

gbr_to_rgb(
yuv420_to_rgb(
&fixed_planar,
&mut rgba,
rgba_stride as u32,
YuvRange::Limited,
YuvStandardMatrix::Bt601,
)
.unwrap();

Expand Down
3 changes: 1 addition & 2 deletions src/from_identity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,7 @@ impl FullRangeWideRow<u16> for WideRowGbrProcessor<u16> {
_start_cx: usize,
_width: usize,
) -> usize {
let mut _cx = 0;
_cx
0
}
}

Expand Down
4 changes: 3 additions & 1 deletion src/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ mod rgba_to_yuv;
mod rgba_to_yuv420;
mod y_p16_to_rgba16;
mod y_to_rgb;
mod y_to_rgb_alpha;
mod ycgco_to_rgb;
mod ycgco_to_rgb_alpha;
mod ycgcor_to_rgb;
Expand All @@ -61,7 +62,7 @@ mod yuy2_to_yuv;
pub(crate) use gbr_to_rgb::{
yuv_to_rgba_row_full, yuv_to_rgba_row_limited, yuv_to_rgba_row_limited_rdm,
};
pub(crate) use rgb_to_y::neon_rgb_to_y_row;
pub(crate) use rgb_to_y::{neon_rgb_to_y_rdm, neon_rgb_to_y_row};
pub(crate) use rgb_to_ycgco::neon_rgb_to_ycgco_row;
pub(crate) use rgb_to_ycgco_r::neon_rgb_to_ycgcor_row;
pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
Expand All @@ -70,6 +71,7 @@ pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
pub(crate) use y_to_rgb_alpha::{neon_y_to_rgb_alpha_row, neon_y_to_rgb_row_alpha_rdm};
pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;
pub(crate) use ycgcor_to_rgb::neon_ycgcor_to_rgb_row;
Expand Down
133 changes: 129 additions & 4 deletions src/neon/rgb_to_y.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ use crate::yuv_support::{CbCrForwardTransform, YuvChromaRange, YuvSourceChannels
use std::arch::aarch64::*;

#[target_feature(enable = "rdm")]
pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
pub(crate) unsafe fn neon_rgb_to_y_rdm<const ORIGIN_CHANNELS: u8>(
transform: &CbCrForwardTransform<i32>,
range: &YuvChromaRange,
y_plane: *mut u8,
Expand All @@ -51,7 +51,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
let v_yr = vdupq_n_s16(transform.yr as i16);
let v_yg = vdupq_n_s16(transform.yg as i16);
let v_yb = vdupq_n_s16(transform.yb as i16);
let v_zeros = vdupq_n_s16(0i16);

let i_bias_y = vdupq_n_s16(range.bias_y as i16);
let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);
Expand Down Expand Up @@ -96,7 +95,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
let mut y_high = vqrdmlahq_s16(y_bias, r_high, v_yr);
y_high = vqrdmlahq_s16(y_high, g_high, v_yg);
y_high = vqrdmlahq_s16(y_high, b_high, v_yb);
y_high = vmaxq_s16(y_high, v_zeros);

let y_high = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y_high, i_bias_y)), i_cap_y);

Expand All @@ -107,7 +105,6 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(
let mut y_low = vqrdmlahq_s16(y_bias, r_low, v_yr);
y_low = vqrdmlahq_s16(y_low, g_low, v_yg);
y_low = vqrdmlahq_s16(y_low, b_low, v_yb);
y_low = vmaxq_s16(y_low, v_zeros);

let y_low = vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y_low, i_bias_y)), i_cap_y);

Expand All @@ -119,3 +116,131 @@ pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8>(

cx
}

/// Converts packed RGB / BGR / RGBA / BGRA pixels to 8-bit luma using plain
/// NEON widening multiply-accumulates.
///
/// Sixteen pixels are converted per iteration for as long as
/// `cx + 16 < width`. Each sample is computed in 32-bit lanes as
///
/// `(r * yr + g * yg + b * yb + (bias_y << PRECISION)) >> PRECISION`
///
/// and then clamped to `[bias_y, bias_y + range_y]` before being narrowed to
/// `u8` and stored at `y_plane[cx..cx + 16]`.
///
/// * `transform` - forward coefficients; only `yr`, `yg` and `yb` affect the
///   result. They are presumably fixed-point values carrying `PRECISION`
///   fractional bits (verify against the code that builds the transform).
/// * `range` - supplies `bias_y` (luma offset) and `range_y` (luma span).
/// * `y_plane` - destination luma row.
/// * `rgba` - packed source row, layout selected by `ORIGIN_CHANNELS`.
/// * `start_cx` - index of the first pixel to convert.
/// * `width` - row width in pixels.
///
/// Returns the index of the first pixel that was NOT converted; the remaining
/// `width - returned` pixels are left untouched (presumably finished by a
/// scalar path in the caller).
///
/// # Safety
///
/// * `rgba` must hold at least `width * channels` bytes, where `channels` is
///   the channel count of `ORIGIN_CHANNELS`; the loads go through a raw
///   pointer and are not bounds-checked.
/// * `y_plane` must be valid for writes of at least `width` bytes.
/// * `PRECISION` must be a shift amount accepted by `vshrn_n_s32`.
pub(crate) unsafe fn neon_rgb_to_y_row<const ORIGIN_CHANNELS: u8, const PRECISION: i32>(
    transform: &CbCrForwardTransform<i32>,
    range: &YuvChromaRange,
    y_plane: *mut u8,
    rgba: &[u8],
    start_cx: usize,
    width: usize,
) -> usize {
    let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into();
    let channels = source_channels.get_channels_count();

    // The accumulator is shifted right by PRECISION after the multiply-adds,
    // so the offset has to be pre-scaled by the same amount. Seeding the
    // accumulator with the raw `bias_y` would shift the offset away entirely
    // and leave limited-range output too dark by `bias_y`.
    // NOTE(review): the shift truncates; if the scalar tail rounds to nearest,
    // a rounding term of `1 << (PRECISION - 1)` may be wanted here too.
    let bias_y = (range.bias_y as i32) << PRECISION;

    let y_ptr = y_plane;
    let rgba_ptr = rgba.as_ptr();

    let y_bias = vdupq_n_s32(bias_y);
    // Only lanes 0..=2 (the luma weights) are read by this kernel; the chroma
    // weights just fill the remaining lanes of the vector.
    let weights_arr: [i16; 8] = [
        transform.yr as i16,
        transform.yg as i16,
        transform.yb as i16,
        transform.cb_r as i16,
        transform.cb_g as i16,
        transform.cb_b as i16,
        transform.cr_r as i16,
        transform.cr_g as i16,
    ];
    let v_weights = vld1q_s16(weights_arr.as_ptr());

    // Clamp bounds for the final luma value: [bias_y, bias_y + range_y].
    let i_bias_y = vdupq_n_s16(range.bias_y as i16);
    let i_cap_y = vdupq_n_u16(range.range_y as u16 + range.bias_y as u16);

    let mut cx = start_cx;
    while cx + 16 < width {
        let r_values_u8: uint8x16_t;
        let g_values_u8: uint8x16_t;
        let b_values_u8: uint8x16_t;

        // De-interleave 16 pixels into one vector per colour channel.
        match source_channels {
            YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
                let rgb_values = vld3q_u8(rgba_ptr.add(cx * channels));
                if source_channels == YuvSourceChannels::Rgb {
                    r_values_u8 = rgb_values.0;
                    g_values_u8 = rgb_values.1;
                    b_values_u8 = rgb_values.2;
                } else {
                    r_values_u8 = rgb_values.2;
                    g_values_u8 = rgb_values.1;
                    b_values_u8 = rgb_values.0;
                }
            }
            YuvSourceChannels::Rgba => {
                let rgb_values = vld4q_u8(rgba_ptr.add(cx * channels));
                r_values_u8 = rgb_values.0;
                g_values_u8 = rgb_values.1;
                b_values_u8 = rgb_values.2;
            }
            YuvSourceChannels::Bgra => {
                let rgb_values = vld4q_u8(rgba_ptr.add(cx * channels));
                r_values_u8 = rgb_values.2;
                g_values_u8 = rgb_values.1;
                b_values_u8 = rgb_values.0;
            }
        }

        // Upper eight pixels (8..16): widen to 16 bit, accumulate in 32 bit.
        let r_high = vreinterpretq_s16_u16(vmovl_high_u8(r_values_u8));
        let g_high = vreinterpretq_s16_u16(vmovl_high_u8(g_values_u8));
        let b_high = vreinterpretq_s16_u16(vmovl_high_u8(b_values_u8));

        let r_h_low = vget_low_s16(r_high);
        let g_h_low = vget_low_s16(g_high);
        let b_h_low = vget_low_s16(b_high);

        let mut y_h_high = vmlal_high_laneq_s16::<0>(y_bias, r_high, v_weights);
        y_h_high = vmlal_high_laneq_s16::<1>(y_h_high, g_high, v_weights);
        y_h_high = vmlal_high_laneq_s16::<2>(y_h_high, b_high, v_weights);

        let mut y_h_low = vmlal_laneq_s16::<0>(y_bias, r_h_low, v_weights);
        y_h_low = vmlal_laneq_s16::<1>(y_h_low, g_h_low, v_weights);
        y_h_low = vmlal_laneq_s16::<2>(y_h_low, b_h_low, v_weights);

        // Drop the fractional bits, then clamp into the legal luma interval.
        let y_high = vminq_u16(
            vreinterpretq_u16_s16(vmaxq_s16(
                vcombine_s16(
                    vshrn_n_s32::<PRECISION>(y_h_low),
                    vshrn_n_s32::<PRECISION>(y_h_high),
                ),
                i_bias_y,
            )),
            i_cap_y,
        );

        // Lower eight pixels (0..8): same computation.
        let r_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_values_u8)));
        let g_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(g_values_u8)));
        let b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_values_u8)));

        let r_l_low = vget_low_s16(r_low);
        let g_l_low = vget_low_s16(g_low);
        let b_l_low = vget_low_s16(b_low);

        let mut y_l_high = vmlal_high_laneq_s16::<0>(y_bias, r_low, v_weights);
        y_l_high = vmlal_high_laneq_s16::<1>(y_l_high, g_low, v_weights);
        y_l_high = vmlal_high_laneq_s16::<2>(y_l_high, b_low, v_weights);

        let mut y_l_low = vmlal_laneq_s16::<0>(y_bias, r_l_low, v_weights);
        y_l_low = vmlal_laneq_s16::<1>(y_l_low, g_l_low, v_weights);
        y_l_low = vmlal_laneq_s16::<2>(y_l_low, b_l_low, v_weights);

        let y_low = vminq_u16(
            vreinterpretq_u16_s16(vmaxq_s16(
                vcombine_s16(
                    vshrn_n_s32::<PRECISION>(y_l_low),
                    vshrn_n_s32::<PRECISION>(y_l_high),
                ),
                i_bias_y,
            )),
            i_cap_y,
        );

        // Narrow both halves to u8 and store 16 luma samples.
        let y = vcombine_u8(vmovn_u16(y_low), vmovn_u16(y_high));
        vst1q_u8(y_ptr.add(cx), y);

        cx += 16;
    }

    cx
}
Loading

0 comments on commit 79028e6

Please sign in to comment.