From 669c278377c14b41786331e5b1f1c31f5524eeb2 Mon Sep 17 00:00:00 2001 From: Johnny Chen Date: Mon, 30 May 2022 17:06:45 +0800 Subject: [PATCH] generalize implementation for generic arrays This supports OffsetArray, it also supports CuArray (but requires ArrayInterfaceGPUArrays loaded) --- src/extreme_filter.jl | 78 +++++++++++++++++----------------------- src/loopvectorization.jl | 2 ++ test/extreme_filter.jl | 5 +++ 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/src/extreme_filter.jl b/src/extreme_filter.jl index 46b21a6b..155d14bd 100644 --- a/src/extreme_filter.jl +++ b/src/extreme_filter.jl @@ -278,34 +278,22 @@ Base.@propagate_inbounds function _minimum_fast(A::AbstractArray{Bool}, p, Ω) return rst end -function _shift_up_and_padd!( - out::AbstractArray{T,1}, A::AbstractArray{T,1}, shift, padd -) where {T} - isempty(out) && return A - sizeVector = length(A) - idxt = 1 - @inbounds for idxs in (shift + 1):(sizeVector) #memcopy in julia ? - out[idxt] = A[idxs] - idxt += 1 - end - @inbounds for idxt in (sizeVector - shift + 1):(sizeVector) #memset in julia ? - out[idxt] = padd - end -end - -function _shift_down_and_padd!( - out::AbstractArray{T,1}, A::AbstractArray{T,1}, shift, padd -) where {T} - isempty(out) && return A - sizeVector = length(A) - @inbounds for idxt in (1):(shift) - out[idxt] = padd - end - idxt = shift + 1 - @inbounds for idxs in (1):(sizeVector - shift) - out[idxt] = A[idxs] - idxt += 1 +function _padded_copyto!(dest::AbstractVector, src::AbstractVector, shift, v) + Base.require_one_based_indexing(dest, src) + if shift >= 0 + o = shift + vsrc = @view src[(1 + o):end] + N = length(vsrc) + copyto!(dest, 1, vsrc, 1, N) + vdest = @view dest[(N + 1):end] + fill!(vdest, v) + else + o = -shift + vdest = @view dest[1:o] + fill!(vdest, v) + copyto!(dest, 1 + o, src, 1, length(dest) - o) end + return dest end function _shift_arith!( @@ -313,18 +301,18 @@ function _shift_arith!( out::AbstractArray{T,1}, tmp::AbstractArray{T,1}, A::AbstractArray{T,1}, - shiftup, - shiftdown, + shift, ) where {T} #tmp external to reuse external allocation if f === min padd = typemax(T) else padd = typemin(T) end - _shift_up_and_padd!(tmp, A, shiftup, padd) + _padded_copyto!(tmp, A, shift, padd) _mapf!(f, out, A, tmp) - _shift_down_and_padd!(tmp, A, shiftdown, padd) - return _mapf!(f, out, out, tmp) + _padded_copyto!(tmp, A, -shift, padd) + _mapf!(f, out, out, tmp) + return out end function _extreme_filter_C4_2D!( @@ -332,16 +320,13 @@ function _extreme_filter_C4_2D!( ) where {T} @debug "call the optimized `extreme_filter` implementation for SEDiamond SE and 2D images" fname = _extreme_filter_C4_2D! - if size(out) != size(A) - throw( - ArgumentError( - "source and destination must have same size (got $(size(out)) and $(size(A)))", - ), - ) - end - if ndims(out) != 2 - throw(ArgumentError("source and destination must have to be 2D (got $(size(out))")) - end + axes(out) == axes(A) || throw(DimensionMismatch("axes(out) must match axes(A)")) + + # We're almost there to support generic offset arrays except that SubArray doesn't work + # very nicely https://github.com/JuliaSIMD/LoopVectorization.jl/issues/406 + out_actual = out + out = OffsetArrays.no_offset_view(out) + A = OffsetArrays.no_offset_view(A) # To avoid the result affected by loop order, we need two arrays src = (out === A) || (iter > 1) ? copy(A) : A @@ -361,7 +346,7 @@ function _extreme_filter_C4_2D!( #x ? viewprevious = view(src, :, 1) # dilate/erode col 1 - _shift_arith!(f, tmp2, tmp, viewprevious, 1, 1) + _shift_arith!(f, tmp2, tmp, viewprevious, 1) viewnext = view(src, :, 2) viewout = view(out, :, 1) # inf/sup between dilate/erode col 1 0 and col 2 @@ -375,7 +360,7 @@ function _extreme_filter_C4_2D!( viewout = view(out, :, c - 1) viewnext = view(src, :, c) # dilate(x-1)/erode(x-1) - _shift_arith!(f, tmp2, tmp, viewcurrent, 1, 1) + _shift_arith!(f, tmp2, tmp, viewcurrent, 1) #sup(x-2,dilate(x-1)),inf(x-2,erode(x-1)) _mapf!(f, tmp, viewprevious, tmp2) #sup(sup(x-2,dilate(x-1),x) || inf(inf(x-2,erode(x-1),x) @@ -392,11 +377,12 @@ function _extreme_filter_C4_2D!( #? x # dilate/erode col x viewout = view(out, :, xSize) - _shift_arith!(f, tmp2, tmp, viewcurrent, 1, 1) + _shift_arith!(f, tmp2, tmp, viewcurrent, 1) _mapf!(f, viewout, tmp2, viewprevious) if iter > 1 && i < iter src .= out end end - return out + + return out_actual end \ No newline at end of file diff --git a/src/loopvectorization.jl b/src/loopvectorization.jl index ef366bb5..0367a4fa 100644 --- a/src/loopvectorization.jl +++ b/src/loopvectorization.jl @@ -6,6 +6,8 @@ Optimized version of `@. out = f.(A, B)` for a few things: - Apply LoopVectorization when possible =# function _mapf!(f, out, A, B) + # CUDA support requires ArrayInterfaceGPUArrays being loaded -- but the current GPU + # implementation is so slow (much worse than CPU) so we don't load it by default. use_broadcast = !all(ArrayInterface.fast_scalar_indexing, (out, A, B)) if use_broadcast @. out = f(A, B) diff --git a/test/extreme_filter.jl b/test/extreme_filter.jl index e68988e9..dfcee7ad 100644 --- a/test/extreme_filter.jl +++ b/test/extreme_filter.jl @@ -94,5 +94,10 @@ out = ImageMorphology._extreme_filter_C4_2D!(max, similar(img_gray), img_gray, 1) @test eltype(out) == Gray{Float32} @test out == ref_iter1 ./ 5 + + imgc = centered(img) + out = ImageMorphology._extreme_filter_C4_2D!(max, similar(imgc), imgc, 1) + @test axes(out) == (-2:2, -2:2) + @test collect(out) == ref_iter1 end end