Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NO MERGE Speedup extrem filter #90

Closed
7 changes: 5 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@ uuid = "787d08f9-d448-5407-9aad-5290dd7ab264"
version = "0.3.2"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
TiledIteration = "06e1c1a7-607b-532d-9fad-de7d9aa2abac"

[compat]
Documenter = "0.24, 0.25, 0.26, 0.27"
ArrayInterface = "4, 5, 6"
Documenter = "0.24, 0.25"
ImageCore = "0.9"
OffsetArrays = "1.9"
LoopVectorization = "0.12"
Requires = "1"
TiledIteration = "0.3.1"
julia = "1.6"
Expand Down
16 changes: 10 additions & 6 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ using TestImages

on_CI = haskey(ENV, "GITHUB_ACTIONS")

cameraman = Gray{N0f8}.(testimage("cameraman"))
blobs = binarize(Gray.(testimage("blobs")), Otsu()) .> 0.5
const cameraman = Gray{N0f8}.(testimage("cameraman"))
const blobs = binarize(Gray.(testimage("blobs")), Otsu()) .> 0.5

tst_sizes = on_CI ? (64, ) : (256, 512)
tst_types = (Gray{N0f8}, Gray{Float32})
const tst_sizes = on_CI ? (64,) : (256, 512, 1024, 2048)
const tst_types = (Gray{N0f8}, Gray{Float32})

const SUITE = BenchmarkGroup()

Expand All @@ -33,6 +33,10 @@ let grp = SUITE["extreme_filter"]
grp[T]["$sz×$sz"]["r5_diamond"] = @benchmarkable extreme_filter(max, $tst_img, $se)
se = centered(collect(se))
grp[T]["$sz×$sz"]["r5_generic"] = @benchmarkable extreme_filter(max, $tst_img, $se)
grp[T]["$sz×$sz"]["r1_diamond_2D_optim"] =
@benchmarkable ImageMorphology._unsafe_extreme_filter_C4_2D!(max, similar($tst_img), $tst_img, 1)
grp[T]["$sz×$sz"]["r1_box_2D_optim"] =
@benchmarkable ImageMorphology._unsafe_extreme_filter_C8_2D!(max, similar($tst_img), $tst_img, 1)
end
end
end
Expand All @@ -43,7 +47,7 @@ let grp = SUITE["extreme_filter"]
grp[T]["$sz×$sz"] = BenchmarkGroup()
for (cname, cr) in [("worst", 0.95), ("best", 0.05), ("random", 0.5)]
tst_img = fill(zero(T), sz, sz)
tst_img[rand(sz, sz) .>= cr] .= true
tst_img[rand(sz, sz).>=cr] .= true

se = strel_diamond((3, 3))
grp[T]["$sz×$sz"]["r1_diamond_$cname"] = @benchmarkable extreme_filter(max, $tst_img, $se)
Expand Down Expand Up @@ -84,7 +88,7 @@ let grp = SUITE["Maxtree"]
for sz in tst_sizes
tst_img = (imresize((cameraman), (sz, sz)))
B = similar(tst_img)
grp["area_opening"]["$sz×$sz"] = @benchmarkable area_opening($B, min_area=50)
grp["area_opening"]["$sz×$sz"] = @benchmarkable area_opening($B, min_area = 50)
end
end

Expand Down
5 changes: 4 additions & 1 deletion src/ImageMorphology.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ using OffsetArrays: centered
using LinearAlgebra
using TiledIteration: EdgeIterator, SplitAxis, SplitAxes
using Requires
using LoopVectorization
using ArrayInterface

const _docstring_se = """
`se` is the structuring element that defines the neighborhood of the image. See
Expand Down Expand Up @@ -35,8 +37,9 @@ include("imfill.jl")
include("maxtree.jl")

include("feature_transform.jl")
include("utils.jl")
using .FeatureTransform
include("utils.jl")
include("loopvectorization.jl")

include("deprecations.jl")

Expand Down
229 changes: 229 additions & 0 deletions src/extreme_filter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -269,3 +269,232 @@ Base.@propagate_inbounds function _minimum_fast(A::AbstractArray{Bool}, p, Ω)
end
return rst
end

function _padded_copyto!(dest::AbstractVector, src::AbstractVector, shift, v)
Base.require_one_based_indexing(dest, src)
if shift >= 0
o = shift
vsrc = @view src[(1+o):end]
N = length(vsrc)
copyto!(dest, 1, vsrc, 1, N)
vdest = @view dest[(N+1):end]
fill!(vdest, v)
else
o = -shift
vdest = @view dest[1:o]
fill!(vdest, v)
copyto!(dest, 1 + o, src, 1, length(dest) - o)
end
return dest
end
johnnychen94 marked this conversation as resolved.
Show resolved Hide resolved

# Shift vector of size N up or down by 1 accorded to dir, pad with v
# ptr level optimized implementation for Real types
# short-circuit all check so unsafe
function _unsafe_padded_copyto!(dest::AbstractVector{T}, src::AbstractVector{T}, dir, N, v) where {T}
if dir
#in src = [1,2,3,4], v, N=4
#out dest = [V,1,2,3]
unsafe_store!(pointer(dest), v)
unsafe_copyto!(pointer(dest, 2), pointer(src, 1), N - 1)
else
#in src = [1,2,3,4], v, N=4
#out dest = [2,3,4,v]
unsafe_copyto!(pointer(dest, 1), pointer(src, 2), N - 1)
unsafe_store!(pointer(dest), v, N)
end
return dest
end

# ptr level optimized implementation for Real types
# short-circuit all check
# call LoopVectorization directly
function _unsafe_shift_arith!(
f::MAX_OR_MIN,
out::AbstractArray{T,1},
tmp::AbstractArray{T,1},
tmp2::AbstractArray{T,1},
A::AbstractArray{T,1}
) where {T} #tmp external to reuse external allocation
if f === min
padd = typemax(T)
else
padd = typemin(T)
end
N = length(out)
#in src = [1,2,3,4], padd, N=4
#out tmp = [padd,1,2,3]
#out tmp2 = [1,2,3,padd]
_unsafe_padded_copyto!(tmp, A, true, N, padd)
_unsafe_padded_copyto!(tmp2, A, false, N, padd)
@turbo for i in eachindex(out, A, tmp, tmp2)
out[i] = f(f(A[i], tmp[i]), tmp2[i])
end
return out
end

# optimized `extreme_filter` implementation for SEDiamond SE and 2D images
# Similar approach could be found in
# Žlaus, Danijel & Mongus, Domen. (2018). In-place SIMD Accelerated Mathematical Morphology. 76-79. 10.1145/3206157.3206176.
# see https://www.researchgate.net/publication/325480366_In-place_SIMD_Accelerated_Mathematical_Morphology
function _unsafe_extreme_filter_C4_2D!(f::MAX_OR_MIN, out::AbstractArray{T,2}, A::AbstractArray{T,2}, iter) where {T}
@debug "call the optimized `extreme_filter` implementation for SEDiamond SE and 2D images" fname =
_unsafe_extreme_filter_C4_2D!
axes(out) == axes(A) || throw(DimensionMismatch("axes(out) must match axes(A)"))
# We're almost there to support generic offset arrays except that SubArray doesn't work
# very nicely https://github.com/JuliaSIMD/LoopVectorization.jl/issues/406
out_actual = out
out = OffsetArrays.no_offset_view(out)
A = OffsetArrays.no_offset_view(A)

# To avoid the result affected by loop order, we need two arrays
src = (out === A) || (iter > 1) ? copy(A) : A
# NOTE(johnnychen94): we don't need to do `out .= src` here because it is write-only;
# all values are generated from the read-only `src`.

#creating temporaries
ySize, xSize = size(A)
tmp = similar(out, eltype(out), ySize)
tmp2 = similar(tmp)
tmp3 = similar(tmp)

# applying radius=r filter is equivalent to applying radius=1 filter r times
for i = 1:iter
#compute first edge column
#translate to clipped connection, x neighborhood of interest, ? neighborhood we don't care, . center
# x ?
# . x
# x ?
viewprevious = view(src, :, 1)
# dilate/erode col 1
_unsafe_shift_arith!(f, tmp2, tmp, tmp3, viewprevious)
viewnext = view(src, :, 2)
viewout = view(out, :, 1)
# inf/sup between dilate/erode col 1 0 and col 2
LoopVectorization.vmap!(f, viewout, viewnext, tmp2)
#next->current
viewcurrent = view(src, :, 2)
for c = 3:xSize
# viewprevious c-2
# viewcurrent c-1
# viewnext c
# ? x ?
# x . x
# ? x ?
viewout = view(out, :, c - 1)
viewnext = view(src, :, c)
# dilate(x-1)/erode(x-1)
_unsafe_shift_arith!(f, tmp2, tmp, tmp3, viewcurrent)
@turbo for i in eachindex(viewout, viewprevious, tmp2)
#sup(x-2,dilate(x-1)),inf(x-2,erode(x-1))
#sup(sup(x-2,dilate(x-1),x) || inf(inf(x-2,erode(x-1),x)
viewout[i] = f(f(viewprevious[i], tmp2[i]), viewnext[i])
end
#current->previous
viewprevious = view(src, :, c - 1)
#next->current
viewcurrent = view(src, :, c)
end
#end last column
#translate to clipped connection
# ? x
# x .
# ? x
# dilate/erode col x
viewout = view(out, :, xSize)
_unsafe_shift_arith!(f, tmp2, tmp, tmp3, viewcurrent)
LoopVectorization.vmap!(f, viewout, tmp2, viewprevious)
if iter > 1 && i < iter
src .= out
end
end

return out_actual
end

# optimized `extreme_filter` implementation for SEBox SE and 2D images
# Similar approach could be found in
# Žlaus, Danijel & Mongus, Domen. (2018). In-place SIMD Accelerated Mathematical Morphology. 76-79. 10.1145/3206157.3206176.
# see https://www.researchgate.net/publication/325480366_In-place_SIMD_Accelerated_Mathematical_Morphology
function _unsafe_extreme_filter_C8_2D!(f, out::AbstractArray{T,2}, A::AbstractArray{T,2}, iter) where {T}
@debug "call the optimized `extreme_filter` implementation for SEBox SE and 2D images" fname =
_unsafe_extreme_filter_C8_2D!
axes(out) == axes(A) || throw(DimensionMismatch("axes(out) must match axes(A)"))
# We're almost there to support generic offset arrays except that SubArray doesn't work
# very nicely https://github.com/JuliaSIMD/LoopVectorization.jl/issues/406
out_actual = out
out = OffsetArrays.no_offset_view(out)
A = OffsetArrays.no_offset_view(A)

# To avoid the result affected by loop order, we need two arrays
src = (out === A) || (iter > 1) ? copy(A) : A
# NOTE(johnnychen94): we don't need to do `out .= src` here because it is write-only;
# all values are generated from the read-only `src`.

#creating temporaries
ySize, xSize = size(A)
tmp = similar(out, eltype(out), ySize)
tmp2 = similar(tmp)
tmp3 = similar(tmp)
tmp4 = similar(tmp)
tmp5 = similar(tmp)

# applying radius=r filter is equivalent to applying radius=1 filter r times
for i = 1:iter
#compute first edge column
#translate to clipped connection, x neighborhood of interest, ? neighborhood we don't care, . center
# x x
# . x
# x x
viewprevious = view(src, :, 1)
# dilate/erode col 1
_unsafe_shift_arith!(f, tmp2, tmp, tmp5, viewprevious)
viewnext = view(src, :, 2)
viewout = view(out, :, 1)
# dilate/erode col 2
_unsafe_shift_arith!(f, tmp3, tmp, tmp5, viewnext)
#sup(dilate col 1,dilate col 0) or inf(erode col 1,erode col 0)
LoopVectorization.vmap!(f, viewout, tmp3, tmp2)
#next->current
viewcurrent = view(src, :, 2)
for c = 3:xSize
# Invariant of the loop
# temp2 contains dilation/erosion of previous col x-2
# temp3 contains dilation/erosion of current col x-1
# temp4 contains dilation/erosion of next col ie x
# viewprevious c-2
# viewcurrent c-1
# viewnext c
# x x x
# x . x
# x x x
viewout = view(out, :, c - 1)
viewnext = view(src, :, c)
# dilate(x)/erode(x)
_unsafe_shift_arith!(f, tmp4, tmp, tmp5, viewnext)
@turbo for i in eachindex(viewout, tmp4, tmp3, tmp2)
#sup(x-2,dilate(x-1)),inf(x-2,erode(x-1))
#sup(dilate(x-2),dilate(x-1),dilate(y)) or inf(erode(x-2),erode(x-1),erode(x))
viewout[i] = f(f(tmp2[i], tmp3[i]), tmp4[i])
end
#swap
tmp4, tmp3 = tmp3, tmp4
tmp4, tmp2 = tmp2, tmp4
end
#end last column
#translate to clipped connection
# x x
# x .
# x x
# dilate/erode col x
viewout = view(out, :, xSize)
_unsafe_shift_arith!(f, tmp2, tmp, tmp5, viewcurrent)
LoopVectorization.vmap!(f, viewout, tmp3, tmp2)
if iter > 1 && i < iter
src .= out
end
end

return out_actual
end

51 changes: 51 additions & 0 deletions src/loopvectorization.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#=
Optimized version of `@. out = f.(A, B)` for a few things:

- there are some overhead when applying broadcasting on small subarray
(https://github.com/JuliaLang/julia/issues/28126)
- Apply LoopVectorization when possible
=#
function _mapf!(f, out, A, B)
# CUDA support requires ArrayInterfaceGPUArrays being loaded -- but the current GPU
# implementation is so slow (much worse than CPU) so we don't load it by default.
use_broadcast = !all(ArrayInterface.fast_scalar_indexing, (out, A, B))
if use_broadcast
@. out = f(A, B)
else
__mapf_loop!(f, out, A, B)
end
return out
end

function __mapf_loop!(f, out, A, B)
# LoopVectorization currently does not support Colorant
if ArrayInterface.can_avx(f) && _is_gray(out, A, B)
__mapf_loop_avx!(f, _numeric_array(out), _numeric_array(A), _numeric_array(B))
else
__mapf_loop_simd!(f, out, A, B)
end
return out
end

function __mapf_loop_avx!(f, out, A, B)
@tturbo for i in eachindex(out, A, B)
out[i] = f(A[i], B[i])
end
return out
end

# TODO(johnnychen94): remove these when VectorizationBase lower bound >= v0.21.35?
# https://github.com/JuliaSIMD/VectorizationBase.jl/pull/85
__mapf_loop_avx!(::typeof(max), out::AbstractArray{Bool}, A, B) = __mapf_loop_simd!(|, out, A, B)
__mapf_loop_avx!(::typeof(min), out::AbstractArray{Bool}, A, B) = __mapf_loop_simd!(&, out, A, B)

function __mapf_loop_simd!(f, out, A, B)
@inbounds @simd for i in eachindex(out, A, B)
out[i] = f(A[i], B[i])
end
return out
end

_numeric_array(A::AbstractArray{T}) where {T<:Real} = A
_numeric_array(A::AbstractArray{AT}) where {AT<:Gray} = reinterpret(eltype(AT), A)
_is_gray(As::AbstractArray...) = all(A -> eltype(A) <: Union{Real,Gray}, As)
Loading