From eb3dd2f7dceaef00b00734faff1d41c2e9b301a0 Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 16:48:18 +0100
Subject: [PATCH 01/10] autotest: fix benchmark which was no longer running
 since 122cc14376d

---
 .github/workflows/benchmarks/test.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/benchmarks/test.sh b/.github/workflows/benchmarks/test.sh
index 3f5332d2e0b4..295d7b83d108 100755
--- a/.github/workflows/benchmarks/test.sh
+++ b/.github/workflows/benchmarks/test.sh
@@ -19,7 +19,10 @@ BENCHMARK_OPTIONS=(
 # Run target build and compare its results to the reference one.
 # Fail if we get results 20% slower or more.
 # Retry if that fails a first time.
+# dist=no is needed because pytest-benchmark doesn't like other values of dist
+# and in conftest.py/pytest.ini we set by default --dist=loadgroup
 BENCHMARK_COMPARE_OPTIONS=(
+  "--dist=no" \
   "--benchmark-compare-fail=min:20%" \
   "--benchmark-compare=0001_ref" \
 )

From 35225d0a7fa5d0183b1152eb282381c8c338d852 Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 16:48:28 +0100
Subject: [PATCH 02/10] CI: osx: run benchmarks

---
 ci/travis/osx/script.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/ci/travis/osx/script.sh b/ci/travis/osx/script.sh
index 7b212dfc9deb..30de03403fc2 100755
--- a/ci/travis/osx/script.sh
+++ b/ci/travis/osx/script.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash

 set -e

@@ -27,3 +27,12 @@ DYLD_LIBRARY_PATH=$PWD/build PYTHONPATH=$PWD/build/swig/python python3 -c "from

 # Run all the Python autotests
 (cd build && ctest -V -R autotest -j${NPROC})
+
+# Use time.process_time for more reliability on VMs
+# dist=no is needed because pytest-benchmark doesn't like other values of dist
+# and in conftest.py/pytest.ini we set by default --dist=loadgroup
+BENCHMARK_OPTIONS=(
+  "--dist=no" \
+  "--benchmark-timer=time.process_time" \
+)
+(cd build && python3 -m pytest autotest/benchmark "${BENCHMARK_OPTIONS[@]}" --capture=no -ra -vv)

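The "--benchmark-timer=time.process_time" choice above trades wall-clock fidelity for stability: a CPU-time clock only advances while the process is actually scheduled, so cycles stolen by other tenants of a shared CI VM do not inflate the measurement. The standalone C++ sketch below illustrates the same contrast with std::clock() against a wall clock; it is an illustration only, not part of the patch.

    #include <chrono>
    #include <cstdio>
    #include <ctime>

    int main()
    {
        const auto wall0 = std::chrono::steady_clock::now();
        const std::clock_t cpu0 = std::clock();  // CPU time, akin to time.process_time

        volatile double acc = 0.0;
        for (int i = 0; i < 100 * 1000 * 1000; ++i)
            acc = acc + i * 1e-9;  // busy work to be timed

        const double wallMs = std::chrono::duration<double, std::milli>(
                                  std::chrono::steady_clock::now() - wall0)
                                  .count();
        const double cpuMs =
            1000.0 * static_cast<double>(std::clock() - cpu0) / CLOCKS_PER_SEC;
        // On an idle machine the two agree; under contention the wall-clock
        // figure drifts upwards while the CPU-time figure stays close to the
        // work actually performed, which is what a benchmark should compare.
        std::printf("wall: %.1f ms, cpu: %.1f ms\n", wallMs, cpuMs);
        return 0;
    }
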
From 316aaf418320baffb320578ebf0feeaa72e16bac Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:16:24 +0100
Subject: [PATCH 03/10] gcore/gdalsse_priv.h: enable SSE4.1 code path for AVX
 and Neon

---
 gcore/gdalsse_priv.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcore/gdalsse_priv.h b/gcore/gdalsse_priv.h
index ade33367ee55..d470f76d8330 100644
--- a/gcore/gdalsse_priv.h
+++ b/gcore/gdalsse_priv.h
@@ -31,7 +31,7 @@
 /* Requires SSE2 */
 #include <emmintrin.h>

-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
 #include <smmintrin.h>
 #endif
 #endif
@@ -221,7 +221,7 @@ class XMMReg2Double
     inline void nsLoad2Val(const unsigned char *ptr)
     {
         __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
         xmm_i = _mm_cvtepu8_epi32(xmm_i);
 #else
         xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
@@ -233,7 +233,7 @@ class XMMReg2Double
     inline void nsLoad2Val(const short *ptr)
     {
         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
         xmm_i = _mm_cvtepi16_epi32(xmm_i);
 #else
         xmm_i = _mm_unpacklo_epi16(
@@ -247,7 +247,7 @@ class XMMReg2Double
     inline void nsLoad2Val(const unsigned short *ptr)
     {
         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
         xmm_i = _mm_cvtepu16_epi32(xmm_i);
 #else
         xmm_i = _mm_unpacklo_epi16(
@@ -261,7 +261,7 @@ class XMMReg2Double
                                  XMMReg2Double &high)
     {
         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
         xmm_i = _mm_cvtepu8_epi32(xmm_i);
 #else
         xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());

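All five rewritten guards funnel the AVX and Neon builds into the SSE4.1 branch, where a single _mm_cvtepu8_epi32 (or _mm_cvtepu16_epi32) replaces the SSE2 two-step unpack against a zero register; on ARM, sse2neon supplies the same intrinsics. A self-contained check that the two byte-widening paths agree (illustration only, not GDAL code):

    #include <emmintrin.h>
    #ifdef __SSE4_1__
    #include <smmintrin.h>
    #endif
    #include <cassert>
    #include <cstring>

    int main()
    {
        const unsigned char src[4] = {1, 2, 254, 255};
        int i;
        std::memcpy(&i, src, 4);  // safe unaligned load into an int
        const __m128i v = _mm_cvtsi32_si128(i);

        // SSE2 fallback: zero-extend in two steps (u8 -> u16 -> u32).
        const __m128i lo16 = _mm_unpacklo_epi8(v, _mm_setzero_si128());
        const __m128i sse2 = _mm_unpacklo_epi16(lo16, _mm_setzero_si128());

    #ifdef __SSE4_1__
        const __m128i sse41 = _mm_cvtepu8_epi32(v);  // one-instruction path
        int a[4], b[4];
        _mm_storeu_si128(reinterpret_cast<__m128i *>(a), sse2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(b), sse41);
        for (int k = 0; k < 4; ++k)
            assert(a[k] == b[k]);  // identical lanes, fewer instructions
    #endif
        (void)sse2;
        return 0;
    }
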
From d309a2487a8d7cc9956a2de4e5f20784e5200aca Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:06:13 +0100
Subject: [PATCH 04/10] alg/: enable ARM Neon optimizations for warping,
 pansharpening, gridding, dithering, RPC

---
 alg/CMakeLists.txt     |  5 +++++
 alg/gdal_rpc.cpp       | 12 ++++++++++--
 alg/gdaldither.cpp     |  7 +++++--
 alg/gdalgrid.cpp       |  8 +++++---
 alg/gdalgridsse.cpp    |  7 ++++++-
 alg/gdalpansharpen.cpp |  3 ++-
 alg/gdalwarpkernel.cpp | 40 ++++++++++++++++++++++++----------------
 7 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/alg/CMakeLists.txt b/alg/CMakeLists.txt
index 98a86899f765..2d5971ca70f9 100644
--- a/alg/CMakeLists.txt
+++ b/alg/CMakeLists.txt
@@ -95,6 +95,11 @@ elseif (GDAL_USE_QHULL)
   gdal_target_link_libraries(alg PRIVATE ${QHULL_LIBRARY})
 endif ()

+if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+  target_compile_definitions(alg PRIVATE -DHAVE_SSE_AT_COMPILE_TIME -DUSE_NEON_OPTIMIZATIONS)
+  target_sources(alg PRIVATE gdalgridsse.cpp)
+endif()
+
 if (HAVE_SSE_AT_COMPILE_TIME)
   target_sources(alg PRIVATE gdalgridsse.cpp)
   target_compile_definitions(alg PRIVATE -DHAVE_SSE_AT_COMPILE_TIME)

diff --git a/alg/gdal_rpc.cpp b/alg/gdal_rpc.cpp
index fb7e415f225f..27737ff345df 100644
--- a/alg/gdal_rpc.cpp
+++ b/alg/gdal_rpc.cpp
@@ -34,10 +34,18 @@
 #include "gdal_mdreader.h"
 #include "gdal_alg_priv.h"
 #include "gdal_priv.h"
-#if defined(__x86_64) || defined(_M_X64)
-#define USE_SSE2_OPTIM
+
+#ifdef USE_NEON_OPTIMIZATIONS
+#define USE_SSE2
+#elif defined(__x86_64) || defined(_M_X64)
+#define USE_SSE2
+#endif
+
+#ifdef USE_SSE2
 #include "gdalsse_priv.h"
+#define USE_SSE2_OPTIM
 #endif
+
 #include "ogr_api.h"
 #include "ogr_geometry.h"
 #include "ogr_spatialref.h"

diff --git a/alg/gdaldither.cpp b/alg/gdaldither.cpp
index 46353ab6ec8e..9e1a5e48b6ef 100644
--- a/alg/gdaldither.cpp
+++ b/alg/gdaldither.cpp
@@ -45,13 +45,16 @@
 #include "gdal.h"
 #include "gdal_priv.h"

-#if defined(__x86_64) || defined(_M_X64)
+#ifdef USE_NEON_OPTIMIZATIONS
 #define USE_SSE2
+#include "include_sse2neon.h"
+#elif defined(__x86_64) || defined(_M_X64)
+#define USE_SSE2
+#include <emmintrin.h>
 #endif

 #ifdef USE_SSE2

-#include <emmintrin.h>
 #define CAST_PCT(x) reinterpret_cast<GByte *>(x)
 #define ALIGN_INT_ARRAY_ON_16_BYTE(x)                                         \
     (((reinterpret_cast<GUIntBig>(x) % 16) != 0)                              \

diff --git a/alg/gdalgrid.cpp b/alg/gdalgrid.cpp
index 044a217e9588..23341513ffff 100644
--- a/alg/gdalgrid.cpp
+++ b/alg/gdalgrid.cpp
@@ -2842,9 +2842,11 @@ GDALGridContext *GDALGridContextCreate(GDALGridAlgorithm eAlgorithm,
 #ifdef HAVE_SSE_AT_COMPILE_TIME

                 if (pafXAligned == nullptr &&
-                    CPLTestBool(
-                        CPLGetConfigOption("GDAL_USE_SSE", "YES")) &&
-                    CPLHaveRuntimeSSE())
+                    CPLTestBool(CPLGetConfigOption("GDAL_USE_SSE", "YES"))
+#if !defined(USE_NEON_OPTIMIZATIONS)
+                    && CPLHaveRuntimeSSE()
+#endif
+                )
                 {
                     pafXAligned = static_cast<float *>(
                         VSI_MALLOC_ALIGNED_AUTO_VERBOSE(sizeof(float) *

diff --git a/alg/gdalgridsse.cpp b/alg/gdalgridsse.cpp
index 0d5f71f84ce1..02d2889cfdf5 100644
--- a/alg/gdalgridsse.cpp
+++ b/alg/gdalgridsse.cpp
@@ -14,7 +14,12 @@
 #include "gdalgrid_priv.h"

 #ifdef HAVE_SSE_AT_COMPILE_TIME
+
+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#else
 #include <xmmintrin.h>
+#endif

 /************************************************************************/
 /*       GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE()       */
@@ -44,7 +49,7 @@ CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
     __m128 xmm_denominator = _mm_setzero_ps();
     int mask = 0;

-#if defined(__x86_64) || defined(_M_X64)
+#if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)
     // This would also work in 32bit mode, but there are only 8 XMM registers
     // whereas we have 16 for 64bit.
     const size_t LOOP_SIZE = 8;

diff --git a/alg/gdalpansharpen.cpp b/alg/gdalpansharpen.cpp
index 53f11b7d6cf0..0dc8f449e334 100644
--- a/alg/gdalpansharpen.cpp
+++ b/alg/gdalpansharpen.cpp
@@ -650,8 +650,9 @@ void GDALPansharpenOperation::WeightedBrovey3(

 /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
 /* Could possibly be used too on 32bit, but we would need to check at runtime */
-#if defined(__x86_64) || defined(_M_X64)
+#if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)
+#define USE_SSE2

 #include "gdalsse_priv.h"

 template <class T>

diff --git a/alg/gdalwarpkernel.cpp b/alg/gdalwarpkernel.cpp
index 9d475d0dd79d..8cd036e0f0bd 100644
--- a/alg/gdalwarpkernel.cpp
+++ b/alg/gdalwarpkernel.cpp
@@ -52,9 +52,17 @@
 #include "ogr_geos.h"
 #endif

+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#define USE_SSE2
+
+#include "gdalsse_priv.h"
+
 // We restrict to 64bit processors because they are guaranteed to have SSE2.
 // Could possibly be used too on 32bit, but we would need to check at runtime.
-#if defined(__x86_64) || defined(_M_X64)
+#elif defined(__x86_64) || defined(_M_X64)
+#define USE_SSE2
+
 #include "gdalsse_priv.h"

 #if __SSE4_1__
@@ -2971,7 +2979,7 @@ static bool GWKCubicResample4Sample(const GDALWarpKernel *poWK, int iBand,
     return true;
 }

-#if defined(__x86_64) || defined(_M_X64)
+#ifdef USE_SSE2

 /************************************************************************/
 /*                          XMMLoad4Values()                            */
@@ -2987,7 +2995,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
     __m128i xmm_i = _mm_cvtsi32_si128(i);
     // Zero extend 4 packed unsigned 8-bit integers in a to packed
     // 32-bit integers.
-#if __SSE4_1__
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
     xmm_i = _mm_cvtepu8_epi32(xmm_i);
 #else
     xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
@@ -3003,7 +3011,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
     __m128i xmm_i = _mm_cvtsi64_si128(i);
     // Zero extend 4 packed unsigned 16-bit integers in a to packed
     // 32-bit integers.
-#if __SSE4_1__
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
     xmm_i = _mm_cvtepu16_epi32(xmm_i);
 #else
     xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
@@ -3017,7 +3025,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
 /*     Return the sum of the 4 floating points of the register.        */
 /************************************************************************/

-#if __SSE3__
+#if defined(__SSE3__) || defined(USE_NEON_OPTIMIZATIONS)
 static CPL_INLINE float XMMHorizontalAdd(__m128 v)
 {
     __m128 shuf = _mm_movehdup_ps(v);  // (v3, v3, v1, v1)
@@ -3037,7 +3045,7 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
 }
 #endif

-#endif // (defined(__x86_64) || defined(_M_X64))
+#endif // define USE_SSE2

 /************************************************************************/
 /*            GWKCubicResampleSrcMaskIsDensity4SampleRealT()            */
@@ -3067,7 +3075,7 @@ static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
             pdfDensity, pdfReal, adfImagIgnored);
     }

-#if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
+#if defined(USE_SSE_CUBIC_IMPL) && defined(USE_SSE2)

     const float fDeltaX = static_cast<float>(dfSrcX) - 0.5f - iSrcX;
     const float fDeltaY = static_cast<float>(dfSrcY) - 0.5f - iSrcY;
@@ -3137,7 +3145,7 @@ static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
     if (fabs(*pdfReal - static_cast<int>(*pdfReal) - 0.5) > .007)
         return true;

-#endif // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
+#endif // defined(USE_SSE_CUBIC_IMPL) && defined(USE_SSE2)

     const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
     const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
@@ -3154,7 +3162,7 @@ static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
     for (GPtrDiff_t i = -1; i < 3; i++)
     {
         const GPtrDiff_t iOffset = iSrcOffset + i * poWK->nSrcXSize - 1;
-#if !(defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64)))
+#if !(defined(USE_SSE_CUBIC_IMPL) && defined(USE_SSE2))
         if (poWK->pafUnifiedSrcDensity[iOffset + 0] < SRC_DENSITY_THRESHOLD ||
             poWK->pafUnifiedSrcDensity[iOffset + 1] < SRC_DENSITY_THRESHOLD ||
             poWK->pafUnifiedSrcDensity[iOffset + 2] < SRC_DENSITY_THRESHOLD ||
@@ -4167,7 +4175,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
         reinterpret_cast<const float *>(poWK->papabySrcImage[iBand]);
     pSrc += iSrcOffset + static_cast<GPtrDiff_t>(jMin) * nSrcXSize;

-#if defined(__x86_64) || defined(_M_X64)
+#if defined(USE_SSE2)
     if (iMax - iMin + 1 == 6)
     {
         // This is just an optimized version of the general case in
@@ -4530,7 +4538,7 @@ GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand, double dfSrcX,

 /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
 /* Could possibly be used too on 32bit, but we would need to check at runtime */
-#if defined(__x86_64) || defined(_M_X64)
+#if defined(USE_SSE2)

 /************************************************************************/
 /*                    GWKResampleNoMasks_SSE2_T()                       */
@@ -4800,7 +4808,7 @@ bool GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand,

 #endif /* INSTANTIATE_FLOAT64_SSE2_IMPL */

-#endif /* defined(__x86_64) || defined(_M_X64) */
+#endif /* defined(USE_SSE2) */

 /************************************************************************/
 /*                     GWKRoundSourceCoordinates()                      */
@@ -6120,7 +6128,7 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)

 /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
 /* and enough SSE registries */
-#if defined(__x86_64) || defined(_M_X64)
+#if defined(USE_SSE2)

 static inline float Convolute4x4(const __m128 row0, const __m128 row1,
                                  const __m128 row2, const __m128 row3,
@@ -6247,7 +6255,7 @@ static void GWKCubicResampleNoMasks4MultiBandT(const GDALWarpKernel *poWK,
         poWK->pafDstDensity[iDstOffset] = 1.0f;
 }

-#endif // defined(__x86_64) || defined(_M_X64)
+#endif // defined(USE_SSE2)

 /************************************************************************/
 /*         GWKResampleNoMasksOrDstDensityOnlyThreadInternal()           */
@@ -6353,7 +6361,7 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
             const GPtrDiff_t iDstOffset =
                 iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;

-#if defined(__x86_64) || defined(_M_X64)
+#if defined(USE_SSE2)
             if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
                           (std::is_same<T, GByte>::value ||
                            std::is_same<T, GUInt16>::value))
@@ -6367,7 +6375,7 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
                 continue;
             }
         }
-#endif // defined(__x86_64) || defined(_M_X64)
+#endif // defined(USE_SSE2)

         [[maybe_unused]] double dfInvWeights = 0;
         for (int iBand = 0; iBand < poWK->nBands; iBand++)

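Two details in this patch deserve a note. First, gdalgrid.cpp compiles the CPLHaveRuntimeSSE() probe out under USE_NEON_OPTIMIZATIONS: the SSE intrinsics are translated to Neon at build time, so there is no x86 CPUID to query at run time. Second, the "include_sse2neon.h" header that the dither, grid and warp files now include is not shown in this series; a minimal wrapper in the same spirit (a sketch under that assumption, not GDAL's actual file) would pull in the sse2neon project's header, which re-implements the _mm_* intrinsics with Neon instructions:

    // Hypothetical include_sse2neon.h, for illustration: route SSE intrinsics
    // to Neon via https://github.com/DLTcollab/sse2neon, muting the warnings
    // that header tends to trigger at high warning levels.
    #ifndef INCLUDE_SSE2NEON_H
    #define INCLUDE_SSE2NEON_H

    #if defined(__GNUC__)
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-parameter"
    #endif

    #include "sse2neon.h"

    #if defined(__GNUC__)
    #pragma GCC diagnostic pop
    #endif

    #endif  // INCLUDE_SSE2NEON_H

With such a shim in place the same translation units compile unchanged on x86 (native <emmintrin.h>) and on ARM, which is why the patch mostly widens existing #if guards instead of adding separate Neon code paths.
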
From eef42b55fca9ac1c810f3988f1928335499b92bb Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:11:58 +0100
Subject: [PATCH 05/10] GTI: use SSE2 code path for ARM Neon optimizations

---
 frmts/gti/CMakeLists.txt           | 4 ++++
 frmts/gti/gdaltileindexdataset.cpp | 6 +++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/frmts/gti/CMakeLists.txt b/frmts/gti/CMakeLists.txt
index 91bc928ea953..aaa366d4d5ab 100644
--- a/frmts/gti/CMakeLists.txt
+++ b/frmts/gti/CMakeLists.txt
@@ -24,3 +24,7 @@ endif ()
 if (GDAL_ENABLE_DRIVER_GTI_PLUGIN)
   target_compile_definitions(gdal_GTI PRIVATE -DBUILT_AS_PLUGIN)
 endif()
+
+if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+  target_compile_definitions(gdal_GTI PRIVATE -DUSE_NEON_OPTIMIZATIONS)
+endif()

diff --git a/frmts/gti/gdaltileindexdataset.cpp b/frmts/gti/gdaltileindexdataset.cpp
index 52b7618537ac..dbd45c1d9c0e 100644
--- a/frmts/gti/gdaltileindexdataset.cpp
+++ b/frmts/gti/gdaltileindexdataset.cpp
@@ -37,7 +37,11 @@
 #include "gdal_thread_pool.h"
 #include "gdal_utils.h"

-#if defined(__SSE2__) || defined(_M_X64)
+#ifdef USE_NEON_OPTIMIZATIONS
+#define USE_SSE2_OPTIM
+#define USE_SSE41_OPTIM
+#include "include_sse2neon.h"
+#elif defined(__SSE2__) || defined(_M_X64)
 #define USE_SSE2_OPTIM
 #include <emmintrin.h>
 // MSVC doesn't define __SSE4_1__, but if -arch:AVX2 is enabled, we do have SSE4.1

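The comment kept in gdaltileindexdataset.cpp explains why __AVX__ keeps appearing in these guards: MSVC never defines __SSE4_1__, but building with /arch:AVX or /arch:AVX2 defines __AVX__ and guarantees SSE4.1. The recurring gate of this series, distilled into one hypothetical macro (a sketch of the pattern; the real code tests the conditions inline at each #if site rather than centralizing them):

    #if defined(USE_NEON_OPTIMIZATIONS)
    // ARM build: sse2neon translates SSE4.1 intrinsics to Neon.
    #define GDAL_HAS_SSE41 1
    #elif defined(__SSE4_1__) || defined(__AVX__)
    // Native SSE4.1, or implied by AVX (the only signal MSVC gives).
    #define GDAL_HAS_SSE41 1
    #else
    // Plain SSE2: keep the unpack/pack fallback sequences.
    #define GDAL_HAS_SSE41 0
    #endif
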
From 1a9ffb608d923d84e30d85acb21d5ce4a7664426 Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:20:38 +0100
Subject: [PATCH 06/10] PNG: use SSE2 code path for ARM Neon optimizations

---
 frmts/png/CMakeLists.txt           | 4 ++++
 frmts/png/filter_sse2_intrinsics.c | 8 +++++++-
 frmts/png/pngdataset.cpp           | 6 +++---
 frmts/png/pngdataset.h             | 5 ++++-
 4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/frmts/png/CMakeLists.txt b/frmts/png/CMakeLists.txt
index 42ae7e8415ec..51a14528776d 100644
--- a/frmts/png/CMakeLists.txt
+++ b/frmts/png/CMakeLists.txt
@@ -21,3 +21,7 @@ if (GDAL_USE_ZLIB_INTERNAL)
 else ()
   gdal_target_link_libraries(gdal_PNG PRIVATE ZLIB::ZLIB)
 endif ()
+
+if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+  target_compile_definitions(gdal_PNG PRIVATE -DUSE_NEON_OPTIMIZATIONS)
+endif()

diff --git a/frmts/png/filter_sse2_intrinsics.c b/frmts/png/filter_sse2_intrinsics.c
index f4c35ff2f0c8..43007b1f58f8 100644
--- a/frmts/png/filter_sse2_intrinsics.c
+++ b/frmts/png/filter_sse2_intrinsics.c
@@ -15,7 +15,9 @@
 #endif

 #ifndef PNG_INTEL_SSE_IMPLEMENTATION
-#if defined(__SSE4_1__) || defined(__AVX__)
+#if defined(USE_NEON_OPTIMIZATIONS)
+#define PNG_INTEL_SSE_IMPLEMENTATION 3
+#elif defined(__SSE4_1__) || defined(__AVX__)
 /* We are not actually using AVX, but checking for AVX is the best
    way we can detect SSE4.1 and SSSE3 on MSVC.
 */
@@ -30,7 +32,11 @@
 #endif
 #endif

+#if defined(USE_NEON_OPTIMIZATIONS)
+#include "include_sse2neon.h"
+#else
 #include <emmintrin.h>
+#endif

 /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
  * They're positioned like this:

diff --git a/frmts/png/pngdataset.cpp b/frmts/png/pngdataset.cpp
index 5d49a9026392..1f1fe174730b 100644
--- a/frmts/png/pngdataset.cpp
+++ b/frmts/png/pngdataset.cpp
@@ -328,7 +328,7 @@ PNGDataset::~PNGDataset()
 #include "filter_sse2_intrinsics.c"
 #endif

-#if defined(__GNUC__) && !defined(__SSE2__)
+#if defined(__GNUC__) && !defined(__SSE2__) && !defined(USE_NEON_OPTIMIZATIONS)
 __attribute__((optimize("tree-vectorize"))) static inline void
 AddVectors(const GByte *CPL_RESTRICT pabyInputLine,
            GByte *CPL_RESTRICT pabyOutputLine, int nSize)
@@ -677,7 +677,7 @@ CPLErr PNGDataset::LoadWholeImage(void *pSingleBuffer, GSpacing nPixelSpace,
                 const GByte *CPL_RESTRICT pabyOutputLineUp =
                     pabyOutputBuffer +
                     (static_cast<size_t>(iY) - 1) * nSamplesPerLine;
-#if defined(__GNUC__) && !defined(__SSE2__)
+#if defined(__GNUC__) && !defined(__SSE2__) && !defined(USE_NEON_OPTIMIZATIONS)
                 AddVectors(pabyInputLine, pabyOutputLineUp, pabyOutputLine,
                            nSamplesPerLine);
 #else
@@ -707,7 +707,7 @@ CPLErr PNGDataset::LoadWholeImage(void *pSingleBuffer, GSpacing nPixelSpace,
             }
             else
             {
-#if defined(__GNUC__) && !defined(__SSE2__)
+#if defined(__GNUC__) && !defined(__SSE2__) && !defined(USE_NEON_OPTIMIZATIONS)
                 AddVectors(pabyInputLine, pabyOutputLine, nSamplesPerLine);
 #else
                 int iX;

diff --git a/frmts/png/pngdataset.h b/frmts/png/pngdataset.h
index 19b726011b53..a30371276cb7 100644
--- a/frmts/png/pngdataset.h
+++ b/frmts/png/pngdataset.h
@@ -49,7 +49,10 @@
 #pragma warning(disable : 4611)
 #endif

-#if defined(__SSE2__) || defined(_M_X64) || \
+#ifdef USE_NEON_OPTIMIZATIONS
+#define HAVE_SSE2
+#include "include_sse2neon.h"
+#elif defined(__SSE2__) || defined(_M_X64) || \
     (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
 #define HAVE_SSE2
 #include <emmintrin.h>

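For context on the AddVectors guards in pngdataset.cpp: the PNG "up" filter reconstructs a row by adding the previous row byte-wise modulo 256. Restated as a plain scalar loop (our restatement, illustration only), this is exactly the shape that GCC's tree-vectorizer, or the intrinsics branches on the other sides of the #if, turn into wide adds; with USE_NEON_OPTIMIZATIONS the SSE2 branch is preferred since sse2neon maps it directly onto Neon:

    #include <cstddef>
    #include <cstdint>

    // out[i] += in[i] for every byte, wrapping modulo 256, as the PNG "up"
    // filter requires. __restrict tells the compiler the rows do not alias,
    // which is what makes auto-vectorization legal here.
    void AddVectorsScalar(const uint8_t *__restrict in,
                          uint8_t *__restrict out, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
            out[i] = static_cast<uint8_t>(out[i] + in[i]);
    }
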
From 504c8c91dabfcf6be56988b4d26b224509ce277f Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:36:24 +0100
Subject: [PATCH 07/10] gcore/gdal_minmax_element.hpp: use SSE4.1 code path
 with AVX and Neon

---
 gcore/gdal_minmax_element.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp
index 9ceb304553b3..d05a255581da 100644
--- a/gcore/gdal_minmax_element.hpp
+++ b/gcore/gdal_minmax_element.hpp
@@ -43,7 +43,7 @@
 #ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
 // SSE2 header
 #include <emmintrin.h>
-#ifdef __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__)
 #include <smmintrin.h>
 #endif
 #endif
@@ -320,7 +320,7 @@ template <class T> static inline T blendv(T a, T b, T mask);

 template <> __m128i blendv(__m128i a, __m128i b, __m128i mask)
 {
-#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     return _mm_blendv_epi8(a, b, mask);
 #else
     return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
@@ -329,7 +329,7 @@ template <> __m128i blendv(__m128i a, __m128i b, __m128i mask)

 template <> __m128 blendv(__m128 a, __m128 b, __m128 mask)
 {
-#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     return _mm_blendv_ps(a, b, mask);
 #else
     return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
@@ -338,7 +338,7 @@ template <> __m128 blendv(__m128 a, __m128 b, __m128 mask)

 template <> __m128d blendv(__m128d a, __m128d b, __m128d mask)
 {
-#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     return _mm_blendv_pd(a, b, mask);
 #else
     return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, b));

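The #else branches kept above are the classic bitwise select, (mask AND b) OR (NOT mask AND a), which matches _mm_blendv_ps exactly when every mask lane is all-ones or all-zero, as comparison results always are. A standalone check (not from the patch):

    #include <emmintrin.h>
    #include <cassert>

    static inline __m128 blendv_sse2(__m128 a, __m128 b, __m128 mask)
    {
        // Where a mask bit is set take b, elsewhere take a.
        return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
    }

    int main()
    {
        const __m128 a = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);  // lanes 3..0
        const __m128 b = _mm_set_ps(5.0f, 6.0f, 7.0f, 8.0f);
        // Lane mask as produced by a comparison: all-ones where a > 2.5.
        const __m128 mask = _mm_cmpgt_ps(a, _mm_set1_ps(2.5f));
        float r[4];
        _mm_storeu_ps(r, blendv_sse2(a, b, mask));
        // Lanes 0 and 1 (values 4 and 3) pass the test and pick b.
        assert(r[0] == 8.0f && r[1] == 7.0f && r[2] == 2.0f && r[3] == 1.0f);
        return 0;
    }

Note that _mm_blendv_epi8 selects per byte while the and/or fallback selects per bit; for comparison-generated masks, where each lane is homogeneous, the two are indistinguishable.
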
From ef167eafe68e8eeb806c0065eb74ef3d07144375 Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:38:16 +0100
Subject: [PATCH 08/10] gcore/gdal_priv_templates.hpp: use SSE4.1 code path
 with AVX and Neon

---
 gcore/gdal_priv_templates.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcore/gdal_priv_templates.hpp b/gcore/gdal_priv_templates.hpp
index cb1631485cf1..c26bac5762b6 100644
--- a/gcore/gdal_priv_templates.hpp
+++ b/gcore/gdal_priv_templates.hpp
@@ -609,7 +609,7 @@ static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
 #include <tmmintrin.h>
 #endif

-#if __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__)
 #include <smmintrin.h>
 #endif

@@ -627,7 +627,7 @@ inline void GDALCopy4Words(const float *pValueIn, GByte *const pValueOut)

     __m128i xmm_i = _mm_cvttps_epi32(xmm);

-#if __SSSE3__
+#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     xmm_i = _mm_shuffle_epi8(
         xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
 #else
@@ -671,7 +671,7 @@ inline void GDALCopy4Words(const float *pValueIn, GUInt16 *const pValueOut)

     __m128i xmm_i = _mm_cvttps_epi32(xmm);

-#if __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack int32 to uint16
 #else
     // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
@@ -742,7 +742,7 @@ inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
     __m128i xmm_i = _mm_cvttps_epi32(xmm);
     __m128i xmm1_i = _mm_cvttps_epi32(xmm1);

-#if __SSE4_1__
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     xmm_i = _mm_packus_epi32(xmm_i, xmm1_i);  // Pack int32 to uint16
 #else
     // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only

From db73dcc4abfbe55a2f96bf57522e48ab8c87750a Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:41:43 +0100
Subject: [PATCH 09/10] overview.cpp: use SSE4.1 optim with AVX

---
 gcore/overview.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcore/overview.cpp b/gcore/overview.cpp
index 846c89a91e4e..84277ffddbd5 100644
--- a/gcore/overview.cpp
+++ b/gcore/overview.cpp
@@ -341,7 +341,7 @@ inline GUInt16 ComputeIntegerRMS_4values<GUInt16>(double sumSquares)
 /*                   QuadraticMeanByteSSE2OrAVX2()                      */
 /************************************************************************/

-#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
 #define sse2_packus_epi32 _mm_packus_epi32
 #else
 inline __m128i sse2_packus_epi32(__m128i a, __m128i b)

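Both files work around the same gap: SSE2 offers only the signed saturating pack _mm_packs_epi32, so packing unsigned 16-bit values needs the "translate to int16 range" step the comments mention, that is, bias the 32-bit inputs down by 32768 into signed range, pack, then undo the bias on the 16-bit side. A sketch of the trick under our own helper name (the two files' actual fallback bodies may differ in detail):

    #include <emmintrin.h>

    static inline __m128i packus_epi32_sse2(__m128i a, __m128i b)
    {
        const __m128i bias32 = _mm_set1_epi32(-32768);
        const __m128i bias16 = _mm_set1_epi16(-32768);
        a = _mm_add_epi32(a, bias32);  // [0, 65535] -> [-32768, 32767]
        b = _mm_add_epi32(b, bias32);
        const __m128i packed = _mm_packs_epi32(a, b);  // signed saturating pack
        return _mm_sub_epi16(packed, bias16);  // shift back to unsigned range
    }

Out-of-range values saturate the same way _mm_packus_epi32 would: anything above 32767 after biasing clamps to 32767 and maps back to 65535, and negative inputs clamp through -32768 back to 0.
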
From 0642738069153f1f0870fdddb3754c5c817106e1 Mon Sep 17 00:00:00 2001
From: Even Rouault <even.rouault@spatialys.com>
Date: Sun, 10 Nov 2024 17:42:28 +0100
Subject: [PATCH 10/10] warp: use SSE4.1 code path with AVX

---
 alg/gdalwarpkernel.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/alg/gdalwarpkernel.cpp b/alg/gdalwarpkernel.cpp
index 8cd036e0f0bd..b9a18dcb4102 100644
--- a/alg/gdalwarpkernel.cpp
+++ b/alg/gdalwarpkernel.cpp
@@ -2995,7 +2995,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
     __m128i xmm_i = _mm_cvtsi32_si128(i);
     // Zero extend 4 packed unsigned 8-bit integers in a to packed
     // 32-bit integers.
-#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     xmm_i = _mm_cvtepu8_epi32(xmm_i);
 #else
     xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
@@ -3011,7 +3011,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
     __m128i xmm_i = _mm_cvtsi64_si128(i);
     // Zero extend 4 packed unsigned 16-bit integers in a to packed
     // 32-bit integers.
-#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     xmm_i = _mm_cvtepu16_epi32(xmm_i);
 #else
     xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());

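These final hunks sit beside the XMMHorizontalAdd helper quoted in patch 04, which applies the same idea at the SSE3 level: _mm_movehdup_ps saves a shuffle that plain SSE2 has to spell out, and sse2neon provides it on ARM. Both variants, restated as a self-contained sketch with our own helper names (illustration only, not GDAL's exact code):

    #include <emmintrin.h>
    #ifdef __SSE3__
    #include <pmmintrin.h>
    #endif

    // Sum the four float lanes of v using only SSE2 shuffles.
    static inline float HorizontalAddSSE2(__m128 v)
    {
        __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2));
        __m128 sums = _mm_add_ps(v, shuf);  // pairwise partial sums
        shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1));
        return _mm_cvtss_f32(_mm_add_ps(sums, shuf));
    }

    #ifdef __SSE3__
    // SSE3: duplicate the odd lanes in one instruction instead of a shuffle.
    static inline float HorizontalAddSSE3(__m128 v)
    {
        __m128 shuf = _mm_movehdup_ps(v);  // lanes (v1, v1, v3, v3)
        __m128 sums = _mm_add_ps(v, shuf); // lane0 = v0+v1, lane2 = v2+v3
        shuf = _mm_movehl_ps(shuf, sums);  // bring v2+v3 down to lane0
        return _mm_cvtss_f32(_mm_add_ss(sums, shuf));
    }
    #endif
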