Skip to content

Commit

Permalink
AVX improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed May 20, 2024
1 parent e8eaf06 commit 869d65c
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 28 deletions.
8 changes: 0 additions & 8 deletions src/intel_simd_support.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Narrows the sixteen signed 16-bit lanes of a 256-bit vector into sixteen
/// unsigned 8-bit lanes packed in a 128-bit vector.
///
/// Each lane is saturated to the `0..=255` range: negative values become `0`
/// and values above `255` become `255`. The lower 128-bit half of `data`
/// produces output bytes 0-7 and the upper half produces bytes 8-15, so the
/// original lane order is preserved.
///
/// # Safety
///
/// This function executes AVX2 instructions. The caller must make sure the
/// running CPU supports AVX2 before calling it.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
pub unsafe fn demote_to_avx256_to_128(data: __m256i) -> __m128i {
    // The cast simply reinterprets the low half of the register; only the
    // upper half needs an explicit lane extract before the saturating pack.
    _mm_packus_epi16(
        _mm256_castsi256_si128(data),
        _mm256_extracti128_si256::<1>(data),
    )
}

#[cfg(target_arch = "x86_64")]
#[inline(always)]
pub unsafe fn store_u8_rgb_avx2(ptr: *mut u8, r: __m256i, g: __m256i, b: __m256i, use_transient: bool) {
Expand Down
33 changes: 13 additions & 20 deletions src/yuv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,15 @@ unsafe fn avx2_process_row(
v_luma_coeff,
);

let r_high = _mm256_srli_epi16::<6>(_mm256_max_epi16(
let r_high = _mm256_srai_epi16::<6>(_mm256_max_epi16(
_mm256_adds_epi16(y_high, _mm256_mullo_epi16(v_high, v_cr_coeff)),
v_min_values,
));
let b_high = _mm256_srli_epi16::<6>(_mm256_max_epi16(
let b_high = _mm256_srai_epi16::<6>(_mm256_max_epi16(
_mm256_adds_epi16(y_high, _mm256_mullo_epi16(u_high, v_cb_coeff)),
v_min_values,
));
let g_high = _mm256_srli_epi16::<6>(_mm256_max_epi16(
let g_high = _mm256_srai_epi16::<6>(_mm256_max_epi16(
_mm256_adds_epi16(
y_high,
_mm256_adds_epi16(
Expand All @@ -113,22 +113,22 @@ unsafe fn avx2_process_row(
v_min_values,
));

let u_low = _mm256_sub_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr);
let v_low = _mm256_sub_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr);
let u_low = _mm256_subs_epi16(_mm256_cvtepu8_epi16(u_low_u8), uv_corr);
let v_low = _mm256_subs_epi16(_mm256_cvtepu8_epi16(v_low_u8), uv_corr);
let y_low = _mm256_mullo_epi16(
_mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_values)),
v_luma_coeff,
);

let r_low = _mm256_srli_epi16::<6>(_mm256_max_epi16(
let r_low = _mm256_srai_epi16::<6>(_mm256_max_epi16(
_mm256_adds_epi16(y_low, _mm256_mullo_epi16(v_low, v_cr_coeff)),
v_min_values,
));
let b_low = _mm256_srli_epi16::<6>(_mm256_max_epi16(
let b_low = _mm256_srai_epi16::<6>(_mm256_max_epi16(
_mm256_adds_epi16(y_low, _mm256_mullo_epi16(u_low, v_cb_coeff)),
v_min_values,
));
let g_low = _mm256_srli_epi16::<6>(_mm256_max_epi16(
let g_low = _mm256_srai_epi16::<6>(_mm256_max_epi16(
_mm256_adds_epi16(
y_low,
_mm256_adds_epi16(
Expand All @@ -139,16 +139,9 @@ unsafe fn avx2_process_row(
v_min_values,
));

let r_low_u8 = _mm256_castsi128_si256(demote_to_avx256_to_128(r_low));
let r_high_u8 = demote_to_avx256_to_128(r_high);
let g_low_u8 = _mm256_castsi128_si256(demote_to_avx256_to_128(g_low));
let g_high_u8 = demote_to_avx256_to_128(g_high);
let b_low_u8 = _mm256_castsi128_si256(demote_to_avx256_to_128(b_low));
let b_high_u8 = demote_to_avx256_to_128(b_high);

let r_values = _mm256_inserti128_si256::<1>(r_low_u8, r_high_u8);
let g_values = _mm256_inserti128_si256::<1>(g_low_u8, g_high_u8);
let b_values = _mm256_inserti128_si256::<1>(b_low_u8, b_high_u8);
let r_values = _mm256_packus_epi16(r_low, r_high);
let g_values = _mm256_packus_epi16(g_low, g_high);
let b_values = _mm256_packus_epi16(b_low, b_high);

let dst_shift = rgba_offset + cx * channels;

Expand Down Expand Up @@ -286,8 +279,8 @@ unsafe fn sse42_process_row(
v_min_values,
));

let u_low = _mm_sub_epi16(_mm_cvtepu8_epi16(u_low_u8), uv_corr);
let v_low = _mm_sub_epi16(_mm_cvtepu8_epi16(v_low_u8), uv_corr);
let u_low = _mm_subs_epi16(_mm_cvtepu8_epi16(u_low_u8), uv_corr);
let v_low = _mm_subs_epi16(_mm_cvtepu8_epi16(v_low_u8), uv_corr);
let y_low = _mm_mullo_epi16(
_mm_cvtepu8_epi16(_mm_srli_si128::<0>(y_values)),
v_luma_coeff,
Expand Down

0 comments on commit 869d65c

Please sign in to comment.