From 669c278377c14b41786331e5b1f1c31f5524eeb2 Mon Sep 17 00:00:00 2001
From: Johnny Chen <johnnychen94@hotmail.com>
Date: Mon, 30 May 2022 17:06:45 +0800
Subject: [PATCH] generalize implementation for generic arrays

This supports OffsetArray, it also supports CuArray (but requires
ArrayInterfaceGPUArrays loaded)
---
 src/extreme_filter.jl    | 78 +++++++++++++++++-----------------------
 src/loopvectorization.jl |  2 ++
 test/extreme_filter.jl   |  5 +++
 3 files changed, 39 insertions(+), 46 deletions(-)

diff --git a/src/extreme_filter.jl b/src/extreme_filter.jl
index 46b21a6b..155d14bd 100644
--- a/src/extreme_filter.jl
+++ b/src/extreme_filter.jl
@@ -278,34 +278,22 @@ Base.@propagate_inbounds function _minimum_fast(A::AbstractArray{Bool}, p, Ω)
     return rst
 end
 
-function _shift_up_and_padd!(
-    out::AbstractArray{T,1}, A::AbstractArray{T,1}, shift, padd
-) where {T}
-    isempty(out) && return A
-    sizeVector = length(A)
-    idxt = 1
-    @inbounds for idxs in (shift + 1):(sizeVector) #memcopy in julia ?
-        out[idxt] = A[idxs]
-        idxt += 1
-    end
-    @inbounds for idxt in (sizeVector - shift + 1):(sizeVector) #memset in julia ?
-        out[idxt] = padd
-    end
-end
-
-function _shift_down_and_padd!(
-    out::AbstractArray{T,1}, A::AbstractArray{T,1}, shift, padd
-) where {T}
-    isempty(out) && return A
-    sizeVector = length(A)
-    @inbounds for idxt in (1):(shift)
-        out[idxt] = padd
-    end
-    idxt = shift + 1
-    @inbounds for idxs in (1):(sizeVector - shift)
-        out[idxt] = A[idxs]
-        idxt += 1
+function _padded_copyto!(dest::AbstractVector, src::AbstractVector, shift, v)
+    Base.require_one_based_indexing(dest, src)
+    if shift >= 0
+        o = shift
+        vsrc = @view src[(1 + o):end]
+        N = length(vsrc)
+        copyto!(dest, 1, vsrc, 1, N)
+        vdest = @view dest[(N + 1):end]
+        fill!(vdest, v)
+    else
+        o = -shift
+        vdest = @view dest[1:o]
+        fill!(vdest, v)
+        copyto!(dest, 1 + o, src, 1, length(dest) - o)
     end
+    return dest
 end
 
 function _shift_arith!(
@@ -313,18 +301,18 @@ function _shift_arith!(
     out::AbstractArray{T,1},
     tmp::AbstractArray{T,1},
     A::AbstractArray{T,1},
-    shiftup,
-    shiftdown,
+    shift,
 ) where {T} #tmp external to reuse external allocation
     if f === min
         padd = typemax(T)
     else
         padd = typemin(T)
     end
-    _shift_up_and_padd!(tmp, A, shiftup, padd)
+    _padded_copyto!(tmp, A, shift, padd)
     _mapf!(f, out, A, tmp)
-    _shift_down_and_padd!(tmp, A, shiftdown, padd)
-    return _mapf!(f, out, out, tmp)
+    _padded_copyto!(tmp, A, -shift, padd)
+    _mapf!(f, out, out, tmp)
+    return out
 end
 
 function _extreme_filter_C4_2D!(
@@ -332,16 +320,13 @@ function _extreme_filter_C4_2D!(
 ) where {T}
     @debug "call the optimized `extreme_filter` implementation for SEDiamond SE and 2D images" fname =
         _extreme_filter_C4_2D!
-    if size(out) != size(A)
-        throw(
-            ArgumentError(
-                "source and destination must have same size (got $(size(out)) and $(size(A)))",
-            ),
-        )
-    end
-    if ndims(out) != 2
-        throw(ArgumentError("source and destination must have to be 2D (got $(size(out))"))
-    end
+    axes(out) == axes(A) || throw(DimensionMismatch("axes(out) must match axes(A)"))
+
+    # We're almost there to support generic offset arrays except that SubArray doesn't work
+    # very nicely https://github.com/JuliaSIMD/LoopVectorization.jl/issues/406
+    out_actual = out
+    out = OffsetArrays.no_offset_view(out)
+    A = OffsetArrays.no_offset_view(A)
 
     # To avoid the result affected by loop order, we need two arrays
     src = (out === A) || (iter > 1) ? copy(A) : A
@@ -361,7 +346,7 @@ function _extreme_filter_C4_2D!(
         #x ?
         viewprevious = view(src, :, 1)
         # dilate/erode col 1
-        _shift_arith!(f, tmp2, tmp, viewprevious, 1, 1)
+        _shift_arith!(f, tmp2, tmp, viewprevious, 1)
         viewnext = view(src, :, 2)
         viewout = view(out, :, 1)
         # inf/sup between dilate/erode col 1 0 and col 2
@@ -375,7 +360,7 @@ function _extreme_filter_C4_2D!(
             viewout = view(out, :, c - 1)
             viewnext = view(src, :, c)
             # dilate(x-1)/erode(x-1)
-            _shift_arith!(f, tmp2, tmp, viewcurrent, 1, 1)
+            _shift_arith!(f, tmp2, tmp, viewcurrent, 1)
             #sup(x-2,dilate(x-1)),inf(x-2,erode(x-1))
             _mapf!(f, tmp, viewprevious, tmp2)
             #sup(sup(x-2,dilate(x-1),x) || inf(inf(x-2,erode(x-1),x)
@@ -392,11 +377,12 @@ function _extreme_filter_C4_2D!(
         #? x
         # dilate/erode col x
         viewout = view(out, :, xSize)
-        _shift_arith!(f, tmp2, tmp, viewcurrent, 1, 1)
+        _shift_arith!(f, tmp2, tmp, viewcurrent, 1)
         _mapf!(f, viewout, tmp2, viewprevious)
         if iter > 1 && i < iter
             src .= out
         end
     end
-    return out
+
+    return out_actual
 end
\ No newline at end of file
diff --git a/src/loopvectorization.jl b/src/loopvectorization.jl
index ef366bb5..0367a4fa 100644
--- a/src/loopvectorization.jl
+++ b/src/loopvectorization.jl
@@ -6,6 +6,8 @@ Optimized version of `@. out = f.(A, B)` for a few things:
 - Apply LoopVectorization when possible
 =#
 function _mapf!(f, out, A, B)
+    # CUDA support requires ArrayInterfaceGPUArrays being loaded -- but the current GPU
+    # implementation is so slow (much worse than CPU) so we don't load it by default.
     use_broadcast = !all(ArrayInterface.fast_scalar_indexing, (out, A, B))
     if use_broadcast
         @. out = f(A, B)
diff --git a/test/extreme_filter.jl b/test/extreme_filter.jl
index e68988e9..dfcee7ad 100644
--- a/test/extreme_filter.jl
+++ b/test/extreme_filter.jl
@@ -94,5 +94,10 @@
         out = ImageMorphology._extreme_filter_C4_2D!(max, similar(img_gray), img_gray, 1)
         @test eltype(out) == Gray{Float32}
         @test out == ref_iter1 ./ 5
+
+        imgc = centered(img)
+        out = ImageMorphology._extreme_filter_C4_2D!(max, similar(imgc), imgc, 1)
+        @test axes(out) == (-2:2, -2:2)
+        @test collect(out) == ref_iter1
     end
 end