diff --git a/.gitignore b/.gitignore index efcf6e5..48c374b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ actual/ /docs/site/ /deps/aes-ni Manifest.toml +.DS_Store diff --git a/src/Random123.jl b/src/Random123.jl index 98f3aad..53c5dd1 100644 --- a/src/Random123.jl +++ b/src/Random123.jl @@ -29,10 +29,8 @@ include("philox.jl") export R123_USE_AESNI -"True when AES-NI has been enabled." -const R123_USE_AESNI = @static if Sys.isapple() && Sys.ARCH ≡ :aarch64 - false -else +"True when x86 AES-NI instructions have been detected." +const R123_USE_X86_AES_NI::Bool = @static if Sys.ARCH ≡ :x86_64 || Sys.ARCH ≡ :i686 try cmd = Base.julia_cmd() push!( @@ -47,16 +45,51 @@ else catch e false end +else + false end +"True when AArch64 FEAT_AES instructions have been detected." +const R123_USE_AARCH64_FEAT_AES::Bool = if Sys.ARCH ≡ :aarch64 + try + cmd = Base.julia_cmd() + push!( + cmd.exec, + "-e", + "const uint8x16 = NTuple{16, VecElement{UInt8}};" * + "@assert ccall(\"llvm.aarch64.crypto.aesmc\", " * + "llvmcall, uint8x16, (uint8x16,), " * + "uint8x16((0x4a, 0x68, 0xbd, 0xe1, 0xfe, 0x16, 0x3d, " * + "0xec, 0xde, 0x06, 0x72, 0x86, 0xe3, 0x8c, 0x14, 0xd9))) ≡ " * + "uint8x16((0x70, 0xa7, 0x7b, 0xd2, 0x0c, 0x79, 0xbd, " * + "0xf1, 0x59, 0xc2, 0xad, 0x1a, 0x9f, 0x05, 0x37, 0x0f))", + ) + success(cmd) + catch e + false + end +else + false +end + +"True when AES-acceleration instructions have been detected." +const R123_USE_AESNI::Bool = R123_USE_X86_AES_NI || R123_USE_AARCH64_FEAT_AES + @static if R123_USE_AESNI export AESNI1x, AESNI4x, aesni export ARS1x, ARS4x, ars - include("./aesni_common.jl") - include("./aesni.jl") - include("./ars.jl") else - @warn "AES-NI instruction set is not enabled, so the related RNGs (AESNI and ARS) are not available." + @warn "AES-acceleration instructions have not been detected, so the related RNGs (AESNI and ARS) are not available." 
+end + +@static if R123_USE_X86_AES_NI + include("./x86/aesni_common.jl") + include("./x86/aesni.jl") + include("./x86/ars.jl") +elseif R123_USE_AARCH64_FEAT_AES + include("./aarch64/aesni_common.jl") + include("./aarch64/aesni.jl") + include("./aarch64/ars.jl") end end diff --git a/src/aarch64/aesni.jl b/src/aarch64/aesni.jl new file mode 100644 index 0000000..fa3c4fb --- /dev/null +++ b/src/aarch64/aesni.jl @@ -0,0 +1,238 @@ +import Base: copy, copyto!, ==, llvmcall +import Random: rand, seed! +import RandomNumbers: gen_seed, union_uint, seed_type, unsafe_copyto!, unsafe_compare + + +"The key for AESNI." +mutable struct AESNIKey + key1::uint64x2 + key2::uint64x2 + key3::uint64x2 + key4::uint64x2 + key5::uint64x2 + key6::uint64x2 + key7::uint64x2 + key8::uint64x2 + key9::uint64x2 + key10::uint64x2 + key11::uint64x2 + AESNIKey() = new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) +end + +copyto!(dest::AESNIKey, src::AESNIKey) = unsafe_copyto!(dest, src, UInt128, 11) + +copy(src::AESNIKey) = copyto!(AESNIKey(), src) + +==(key1::AESNIKey, key2::AESNIKey) = unsafe_compare(key1, key2, UInt128, 11) + +""" +Assistant function for AES128. Originally compiled for x86 from the C++ source code: +```cpp +R123_STATIC_INLINE __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2) { + __m128i temp3; + temp2 = _mm_shuffle_epi32 (temp2 ,0xff); + temp3 = _mm_slli_si128 (temp1, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp3 = _mm_slli_si128 (temp3, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp3 = _mm_slli_si128 (temp3, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp1 = _mm_xor_si128 (temp1, temp2); + return temp1; +} +``` +Then made architecture-agnostic as LLVM IR. 
+""" +_aes_128_assist(a::uint64x2, b::uint64x2) = llvmcall( + """%3 = bitcast <2 x i64> %1 to <4 x i32> + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> + %5 = bitcast <4 x i32> %4 to <2 x i64> + %6 = bitcast <2 x i64> %0 to <16 x i8> + %7 = shufflevector <16 x i8> , <16 x i8> %6, <16 x i32> + %8 = bitcast <16 x i8> %7 to <2 x i64> + %9 = xor <2 x i64> %8, %0 + %10 = shufflevector <16 x i8> , <16 x i8> %7, <16 x i32> + %11 = bitcast <16 x i8> %10 to <2 x i64> + %12 = xor <2 x i64> %9, %11 + %13 = shufflevector <16 x i8> , <16 x i8> %10, <16 x i32> + %14 = bitcast <16 x i8> %13 to <2 x i64> + %15 = xor <2 x i64> %12, %5 + %16 = xor <2 x i64> %15, %14 + ret <2 x i64> %16""", + uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, + a.data, b.data +) |> uint64x2 + +function _aesni_expand!(k::AESNIKey, rkey::uint64x2) + k.key1 = rkey + tmp = _aes_key_gen_assist(rkey, Val(0x1)) + rkey = _aes_128_assist(rkey, tmp) + k.key2 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x2)) + rkey = _aes_128_assist(rkey, tmp) + k.key3 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x4)) + rkey = _aes_128_assist(rkey, tmp) + k.key4 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x8)) + rkey = _aes_128_assist(rkey, tmp) + k.key5 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x10)) + rkey = _aes_128_assist(rkey, tmp) + k.key6 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x20)) + rkey = _aes_128_assist(rkey, tmp) + k.key7 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x40)) + rkey = _aes_128_assist(rkey, tmp) + k.key8 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x80)) + rkey = _aes_128_assist(rkey, tmp) + k.key9 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x1b)) + rkey = _aes_128_assist(rkey, tmp) + k.key10 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x36)) + rkey = _aes_128_assist(rkey, tmp) + k.key11 = rkey + + k +end + +AESNIKey(key::UInt128) = _aesni_expand!(AESNIKey(), uint64x2(key)) + +""" +```julia +AESNI1x <: AbstractAESNI1x +AESNI1x([seed]) +``` + +AESNI1x 
is one kind of AESNI Counter-Based RNGs. It generates one `UInt128` number at a time. + +`seed` is an `Integer` which will be automatically converted to `UInt128`. + +Only available when [`R123_USE_AESNI`](@ref). +""" +mutable struct AESNI1x <: AbstractAESNI1x + x::uint64x2 + ctr::uint64x2 + key::AESNIKey +end + +function AESNI1x(seed::Integer=gen_seed(UInt128)) + r = AESNI1x(0, 0, AESNIKey()) + seed!(r, seed) + r +end + +function seed!(r::AESNI1x, seed::Integer=gen_seed(UInt128)) + r.x = zero(uint64x2) + r.ctr = zero(uint64x2) + _aesni_expand!(r.key, uint64x2(seed % UInt128)) + random123_r(r) + r +end + +seed_type(::Type{AESNI1x}) = UInt128 + +function copyto!(dest::AESNI1x, src::AESNI1x) + dest.x = src.x + dest.ctr = src.ctr + copyto!(dest.key, src.key) + dest +end + +copy(src::AESNI1x) = copyto!(AESNI1x(), src) + +==(r1::AESNI1x, r2::AESNI1x) = r1.x == r2.x && r1.key == r2.key && r1.ctr == r2.ctr + +""" +```julia +AESNI4x <: AbstractAESNI4x +AESNI4x([seed]) +``` + +AESNI4x is one kind of AESNI Counter-Based RNGs. It generates four `UInt32` numbers at a time. + +`seed` is a `Tuple` of four `Integer`s which will all be automatically converted to `UInt32`. + +Only available when [`R123_USE_AESNI`](@ref). 
+""" +mutable struct AESNI4x <: AbstractAESNI4x + x::uint64x2 + ctr1::uint64x2 + key::AESNIKey + p::Int +end + +function AESNI4x(seed::NTuple{4, Integer}=gen_seed(UInt32, 4)) + r = AESNI4x(zero(uint64x2), zero(uint64x2), AESNIKey(), 0) + seed!(r, seed) + r +end + +function seed!(r::AESNI4x, seed::NTuple{4, Integer}=gen_seed(UInt32, 4)) + key = union_uint(Tuple(x % UInt32 for x in seed)) + r.ctr1 = 0 + _aesni_expand!(r.key, uint64x2(key)) + r.p = 0 + random123_r(r) + r +end + +seed_type(::Type{AESNI4x}) = NTuple{4, UInt32} + +function copyto!(dest::AESNI4x, src::AESNI4x) + unsafe_copyto!(dest, src, UInt128, 2) + copyto!(dest.key, src.key) + dest.p = src.p + dest +end + +copy(src::AESNI4x) = copyto!(AESNI4x(), src) +==(r1::AESNI4x, r2::AESNI4x) = unsafe_compare(r1, r2, UInt128, 2) && + r1.key == r2.key && r1.p == r2.p + +function get_key_uint64x2(o::Union{AESNI1x, AESNI4x})::NTuple{11, uint64x2} + k = o.key + (k.key1,k.key2,k.key3,k.key4,k.key5,k.key6,k.key7,k.key8,k.key9,k.key10,k.key11) +end +get_ctr_uint64x2(o::AESNI4x)::Tuple{uint64x2} = (o.ctr1,) +get_ctr_uint64x2(o::AESNI1x)::Tuple{uint64x2} = (o.ctr,) +get_key(o::Union{AESNI1x, AESNI4x})::NTuple{11,UInt128} = map(UInt128, get_key_uint64x2(o)) +get_ctr(o::Union{AESNI1x, AESNI4x})::Tuple{UInt128} = map(UInt128, get_ctr_uint64x2(o)) + +@inline aesni(key::NTuple{11,uint64x2}, ctr::Tuple{uint64x2})::Tuple{uint64x2} = + (_aes_enc_full(only(ctr), key),) + +""" + aesni(key::NTuple{11,UInt128}, ctr::Tuple{UInt128})::Tuple{UInt128} + +Functional variant of [`AESNI1x`](@ref) and [`AESNI4x`](@ref). +This function is free of mutability and side effects. 
+""" +@inline function aesni(key::NTuple{11,UInt128}, ctr::Tuple{UInt128})::Tuple{UInt128} + k = map(uint64x2, key) + c = map(uint64x2, ctr) + map(UInt128,aesni(k,c)) +end + + +@inline function random123_r(r::AESNI1x) + r.x = only(aesni(get_key_uint64x2(r), get_ctr_uint64x2(r))) + (UInt128(r.x),) +end + +@inline function random123_r(r::AESNI4x) + r.x = only(aesni(get_key_uint64x2(r), get_ctr_uint64x2(r))) + split_uint(UInt128(r.x), UInt32) +end diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl new file mode 100644 index 0000000..020cee6 --- /dev/null +++ b/src/aarch64/aesni_common.jl @@ -0,0 +1,205 @@ +using Base: llvmcall +import Base.(+) + +using ..Random123: R123Generator1x, R123Generator4x +import ..Random123: random123_r, set_counter! + +const LITTLE_ENDIAN::Bool = ENDIAN_BOM ≡ 0x04030201 + +const uint64x2_lvec = NTuple{2, VecElement{UInt64}} +struct uint64x2 + data::uint64x2_lvec +end +@inline Base.convert(::Type{uint64x2}, x::UInt128) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{UInt128}, x::uint64x2) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +@inline UInt128(x::uint64x2) = convert(UInt128, x) +@inline uint64x2(x::UInt128) = convert(uint64x2, x) +@inline Base.convert(::Type{uint64x2}, x::Union{Signed, Unsigned}) = convert(uint64x2, UInt128(x)) +@inline Base.convert(::Type{T}, x::uint64x2) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +@inline uint64x2(hi::UInt64, lo::UInt64) = @static if LITTLE_ENDIAN + uint64x2((VecElement(lo), VecElement(hi))) +else + uint64x2((VecElement(hi), VecElement(lo))) +end + +@inline Base.zero(::Type{uint64x2}) = convert(uint64x2, zero(UInt128)) +@inline Base.one(::Type{uint64x2}) = uint64x2(zero(UInt64), one(UInt64)) +@inline Base.xor(a::uint64x2, b::uint64x2) = llvmcall( + """%3 = xor <2 x i64> %1, %0 + ret <2 x i64> %3""", + uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, + a.data, b.data, +) |> uint64x2 +@inline (+)(a::uint64x2, 
b::uint64x2) = llvmcall( + """%3 = add <2 x i64> %1, %0 + ret <2 x i64> %3""", + uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, + a.data, b.data, +) |> uint64x2 +@inline (+)(a::uint64x2, b::Integer) = a + uint64x2(UInt128(b)) + +const uint8x16_lvec = NTuple{16, VecElement{UInt8}} +struct uint8x16 + data::uint8x16_lvec +end +@inline Base.convert(::Type{uint64x2}, x::uint8x16) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{uint8x16}, x::uint64x2) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +@inline uint8x16(x::uint64x2) = convert(uint8x16, x) +@inline uint64x2(x::uint8x16) = convert(uint64x2, x) +@inline Base.convert(::Type{uint8x16}, x::UInt128) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{UInt128}, x::uint8x16) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +@inline UInt128(x::uint8x16) = convert(UInt128, x) +@inline uint8x16(x::UInt128) = convert(uint8x16, x) +@inline Base.convert(::Type{uint8x16}, x::Union{Signed, Unsigned}) = convert(uint8x16, UInt128(x)) +@inline Base.convert(::Type{T}, x::uint8x16) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +@inline function uint8x16(bytes::Vararg{UInt8, 16}) + bytes_prepped = bytes + @static if LITTLE_ENDIAN + bytes_prepped = reverse(bytes_prepped) + end + bytes_vec::uint8x16_lvec = VecElement.(bytes_prepped) + return uint8x16(bytes_vec) +end + +@inline Base.zero(::Type{uint8x16}) = convert(uint8x16, zero(UInt128)) +@inline Base.xor(a::uint8x16, b::uint8x16) = llvmcall( + """%3 = xor <16 x i8> %1, %0 + ret <16 x i8> %3""", + uint8x16_lvec, Tuple{uint8x16_lvec, uint8x16_lvec}, + a.data, b.data, +) |> uint8x16 + +const uint32x4_lvec = NTuple{4, VecElement{UInt32}} +struct uint32x4 + data::uint32x4_lvec +end +@inline Base.convert(::Type{uint64x2}, x::uint32x4) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{uint32x4}, x::uint64x2) = 
unsafe_load(Ptr{uint32x4}(pointer_from_objref(Ref(x)))) +@inline uint32x4(x::uint64x2) = convert(uint32x4, x) +@inline uint64x2(x::uint32x4) = convert(uint64x2, x) +@inline Base.convert(::Type{uint8x16}, x::uint32x4) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{uint32x4}, x::uint8x16) = unsafe_load(Ptr{uint32x4}(pointer_from_objref(Ref(x)))) +@inline uint32x4(x::uint8x16) = convert(uint32x4, x) +@inline uint8x16(x::uint32x4) = convert(uint8x16, x) +@inline Base.convert(::Type{uint32x4}, x::UInt128) = unsafe_load(Ptr{uint32x4}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{UInt128}, x::uint32x4) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +@inline UInt128(x::uint32x4) = convert(UInt128, x) +@inline uint32x4(x::UInt128) = convert(uint32x4, x) +@inline Base.convert(::Type{uint32x4}, x::Union{Signed, Unsigned}) = convert(uint32x4, UInt128(x)) +@inline Base.convert(::Type{T}, x::uint32x4) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +@inline function uint32x4(bytes::Vararg{UInt32, 4}) + bytes_prepped = bytes + @static if LITTLE_ENDIAN + bytes_prepped = reverse(bytes_prepped) + end + bytes_vec::uint32x4_lvec = VecElement.(bytes_prepped) + return uint32x4(bytes_vec) +end + +@inline Base.zero(::Type{uint32x4}) = convert(uint32x4, zero(UInt128)) +@inline Base.xor(a::uint32x4, b::uint32x4) = llvmcall( + """%3 = xor <4 x i32> %1, %0 + ret <4 x i32> %3""", + uint32x4_lvec, Tuple{uint32x4_lvec, uint32x4_lvec}, + a.data, b.data, +) |> uint32x4 + +# Raw NEON intrinsics, provided by FEAT_AES +@inline _vaese(a::uint8x16, b::uint8x16) = ccall( + "llvm.aarch64.crypto.aese", + llvmcall, + uint8x16_lvec, + (uint8x16_lvec, uint8x16_lvec), + a.data, b.data, +) |> uint8x16 +@inline _vaesmc(a::uint8x16) = ccall( + "llvm.aarch64.crypto.aesmc", + llvmcall, + uint8x16_lvec, + (uint8x16_lvec,), + a.data, +) |> uint8x16 + +""" +Assistant function for AES keygen. 
Originally compiled for AArch64 from the C source code: +```cpp +uint8x16_t _mm_aeskeygenassist_helper(uint8x16_t a) +{ + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + a[0x4], a[0x1], a[0xE], a[0xB], // SubBytes(X1) + a[0x1], a[0xE], a[0xB], a[0x4], // ROT(SubBytes(X1)) + a[0xC], a[0x9], a[0x6], a[0x3], // SubBytes(X3) + a[0x9], a[0x6], a[0x3], a[0xC], // ROT(SubBytes(X3)) + }; + return dest; +} +``` +Then made architecture-agnostic as LLVM IR. +""" +@inline _aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall( + """%2 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> + ret <16 x i8> %2""", + uint8x16_lvec, Tuple{uint8x16_lvec}, + a.data, +) |> uint8x16 + +# Mimics of the x86 AES-NI intrinsics +# +# Algorithm translations courtesy of the SIMD Everywhere and SSE2NEON projects: +# https://github.com/simd-everywhere/simde/blob/v0.8.0-rc1/simde/x86/aes.h +# https://github.com/DLTcollab/sse2neon/blob/v1.6.0/sse2neon.h +@inline function _aes_enc(a::uint64x2, round_key::uint64x2) + res = _vaesmc(_vaese(uint8x16(a), zero(uint8x16))) + return uint64x2(res) ⊻ round_key +end +@inline function _aes_enc_last(a::uint64x2, round_key::uint64x2) + res = _vaese(uint8x16(a), zero(uint8x16)) + return uint64x2(res) ⊻ round_key +end +@inline function _aes_key_gen_assist(a::uint64x2, ::Val{R}) where {R} + res = _aes_key_gen_shuffle_helper(_vaese(uint8x16(a), zero(uint8x16))) + r = R % UInt32 + z = zero(UInt32) + return uint64x2(res) ⊻ uint64x2(uint32x4(r, z, r, z)) +end + +""" + _aes_enc_full(a::uint64x2, round_keys::NTuple{N,uint64x2})::uint64x2 where {N} + +Full AES encryption flow for N rounds. 
+""" +@inline function _aes_enc_full(a::uint64x2, round_keys::NTuple{N,uint64x2})::uint64x2 where {N} + res = uint8x16(a) + for (i, key) in enumerate(round_keys) + if i ≢ N + res = _vaese(res, uint8x16(key)) + if i ≢ N - 1 + res = _vaesmc(res) + end + else + return uint64x2(res ⊻ uint8x16(key)) + end + end + return a # pathological 0-round case +end + +"Abstract RNG that generates one number at a time and is based on AESNI." +abstract type AbstractAESNI1x <: R123Generator1x{UInt128} end +"Abstract RNG that generates four numbers at a time and is based on AESNI." +abstract type AbstractAESNI4x <: R123Generator4x{UInt32} end + +@inline function set_counter!( + r::AbstractAESNI4x, + ctr::NTuple{4, Integer} +) + r.p = 0 + r.ctr1 = union_uint(Tuple(x % UInt32 for x in ctr)) + random123_r(r) + r +end + +@inline inc_counter!(r::AbstractAESNI4x) = (r.ctr1 += one(uint64x2); r) diff --git a/src/aarch64/ars.jl b/src/aarch64/ars.jl new file mode 100644 index 0000000..8571c56 --- /dev/null +++ b/src/aarch64/ars.jl @@ -0,0 +1,163 @@ +import Base: copy, copyto!, == +import Random: rand, seed! +import RandomNumbers: gen_seed, split_uint, union_uint, seed_type, unsafe_copyto!, unsafe_compare + +""" +```julia +ARS1x{R} <: AbstractAESNI1x +ARS1x([seed, R=7]) +``` + +ARS1x is one kind of ARS Counter-Based RNGs. It generates one `UInt128` number at a time. + +`seed` is an `Integer` which will be automatically converted to `UInt128`. + +`R` denotes the number of rounds, which should be at least 1 and no more than 10. With 7 rounds (by default), it has +a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has +excellent performance. + +Only available when [`R123_USE_AESNI`](@ref). 
+""" +mutable struct ARS1x{R} <: AbstractAESNI1x + x::uint64x2 + ctr::uint64x2 + key::uint64x2 +end + +function ARS1x(seed::Integer=gen_seed(UInt128), R::Integer = 7) + R = Int(R) + @assert 1 ≤ R ≤ 10 + m0 = zero(uint64x2) + r = ARS1x{R}(m0, m0, m0) + seed!(r, seed) +end + +function seed!(r::ARS1x, seed::Integer=gen_seed(UInt128)) + r.x = zero(uint64x2) + r.ctr = zero(uint64x2) + r.key = seed % UInt128 + random123_r(r) + r +end + +@inline seed_type(::Type{ARS1x{R}}) where R = UInt128 + +copyto!(dest::ARS1x{R}, src::ARS1x{R}) where R = unsafe_copyto!(dest, src, UInt128, 3) + +copy(src::ARS1x{R}) where R = ARS1x{R}(src.x, src.ctr, src.key) + +==(r1::ARS1x{R}, r2::ARS1x{R}) where R = unsafe_compare(r1, r2, UInt128, 3) + +""" +```julia +ARS4x{R} <: AbstractAESNI4x +ARS4x([seed, R=7]) +``` + +ARS4x is one kind of ARS Counter-Based RNGs. It generates four `UInt32` numbers at a time. + +`seed` is a `Tuple` of four `Integer`s which will all be automatically converted to `UInt32`. + +`R` denotes the number of rounds, which must be at least 1 and no more than 10. With 7 rounds (by default), it has a +considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has +excellent performance. + +Only available when [`R123_USE_AESNI`](@ref). 
+""" +mutable struct ARS4x{R} <: AbstractAESNI4x + x::uint64x2 + ctr1::uint64x2 + key::uint64x2 + p::Int +end + +function ARS4x(seed::NTuple{4, Integer}=gen_seed(UInt32, 4), R::Integer=7) + R = Int(R) + @assert 1 ≤ R ≤ 10 + r = ARS4x{R}(zero(uint64x2), zero(uint64x2), zero(uint64x2), 0) + seed!(r, seed) +end + +function seed!(r::ARS4x, seed::NTuple{4, Integer}=gen_seed(UInt32, 4)) + r.ctr1 = zero(uint64x2) + r.key = union_uint(Tuple(x % UInt32 for x in seed)) + r.p = 0 + random123_r(r) + r +end + +@inline seed_type(::Type{ARS4x{R}}) where R = NTuple{4, UInt32} + +function copyto!(dest::ARS4x{R}, src::ARS4x{R}) where R + unsafe_copyto!(dest, src, UInt128, 3) + dest.p = src.p + dest +end + +copy(src::ARS4x{R}) where R = ARS4x{R}(src.x, src.ctr1, src.key, src.p) + +==(r1::ARS4x{R}, r2::ARS4x{R}) where R = unsafe_compare(r1, r2, UInt128, 3) && r1.p ≡ r2.p + +function expr_ars1xm128i(expr_key, expr_ctr, R) + @assert R isa Int && 1 ≤ R ≤ 10 + rounds = [quote + kk += kweyl + v = _aes_enc(v, kk) + end for _ in 2:R] + quote + ctr = $(expr_ctr) + key = $(expr_key) + kweyl = uint64x2(0xbb67ae8584caa73b, 0x9e3779b97f4a7c15) + kk = key + v = ctr ⊻ kk + q1 = UInt128(ctr) + q2 = UInt128(key) + $(rounds...) + kk += kweyl + ret = _aes_enc_last(v, kk) + end +end + +@generated function ars1xm128i(r::Union{ARS1x{R}, ARS4x{R}}) where R + expr_ctr = if r <: ARS1x + :(r.ctr) + elseif r <: ARS4x + :(r.ctr1) + else + :(error("Unreachable")) + end + expr_key = :(r.key) + expr_ars1xm128i(expr_key, expr_ctr, R) +end + +@generated function ars(key::Tuple{uint64x2}, ctr::Tuple{uint64x2}, ::Val{R})::Tuple{uint64x2} where {R} + :(($(expr_ars1xm128i(:(only(key)), :(only(ctr)), R)),)) +end + +""" + ars(key::Tuple{UInt128}, ctr::Tuple{UInt128}, rounds::Val{R})::Tuple{UInt128} where {R} + +Functional variant of [`ARS1x`](@ref) and [`ARS4x`](@ref). +This function is free of mutability and side effects. 
+""" +function ars(key::Tuple{UInt128}, ctr::Tuple{UInt128}, rounds::Val{R})::Tuple{UInt128} where {R} + k = map(uint64x2, key) + c = map(uint64x2, ctr) + map(UInt128,ars(k,c,rounds)) +end + +get_key(r::Union{ARS1x, ARS4x}) = (UInt128(r.key),) +get_ctr(r::ARS1x) = (UInt128(r.ctr),) +get_ctr(r::ARS4x) = (UInt128(r.ctr1),) + +@inline function random123_r(r::ARS1x{R}) where R + r.x = ars1xm128i(r) + (UInt128(r.x),) +end + +@inline function random123_r(r::ARS4x{R}) where R + r.x = ars1xm128i(r) + split_uint(UInt128(r.x), UInt32) +end + + diff --git a/src/aesni.jl b/src/x86/aesni.jl similarity index 100% rename from src/aesni.jl rename to src/x86/aesni.jl diff --git a/src/aesni_common.jl b/src/x86/aesni_common.jl similarity index 94% rename from src/aesni_common.jl rename to src/x86/aesni_common.jl index fbb440d..6d70829 100644 --- a/src/aesni_common.jl +++ b/src/x86/aesni_common.jl @@ -16,7 +16,7 @@ Base.convert(::Type{__m128i}, x::Union{Signed, Unsigned}) = convert(__m128i, UIn Base.convert(::Type{T}, x::__m128i) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) const LITTLE_ENDIAN = ENDIAN_BOM ≡ 0x04030201 -__m128i(hi::UInt64, lo::UInt64) = LITTLE_ENDIAN ? __m128i((VecElement(lo), VecElement(hi))) : __m128i((VecElement(hi), VecElement(lo))) +__m128i(hi::UInt64, lo::UInt64) = @static LITTLE_ENDIAN ? __m128i((VecElement(lo), VecElement(hi))) : __m128i((VecElement(hi), VecElement(lo))) Base.zero(::Type{__m128i}) = __m128i(zero(UInt64), zero(UInt64)) Base.one(::Type{__m128i}) = __m128i(zero(UInt64), one(UInt64)) diff --git a/src/ars.jl b/src/x86/ars.jl similarity index 100% rename from src/ars.jl rename to src/x86/ars.jl diff --git a/test/aarch64/aesni.jl b/test/aarch64/aesni.jl new file mode 100644 index 0000000..256df60 --- /dev/null +++ b/test/aarch64/aesni.jl @@ -0,0 +1,28 @@ +import Random: seed! 
+using Test: @test, @testset + +using RandomNumbers +using Random123 + +import RandomNumbers: split_uint +import Random123: uint64x2, AESNIKey + +@testset "Accelerated AESNI" begin + x = zero(uint64x2) + ctr = uint64x2(0x9799b5d54f7b9227b47607190d0dfefb) + key = 0x07b8e4b6aa98ec245a7da274d3b8146a + aesni_key = AESNIKey(key) + @test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 + + r = AESNI1x(key) + r1 = AESNI4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end diff --git a/test/aarch64/ars.jl b/test/aarch64/ars.jl new file mode 100644 index 0000000..426dde5 --- /dev/null +++ b/test/aarch64/ars.jl @@ -0,0 +1,31 @@ +import Random: seed! +using Test: @test, @testset + +using RandomNumbers +using Random123 + +import RandomNumbers: split_uint +import Random123: uint64x2 + +@testset "Accelerated ARS" begin + x = zero(uint64x2) + ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + key = uint64x2(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) + @test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef + @test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 + @test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xb6621a8b006319e8, 0x67c841642c32fc19)) + @test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xac35df44f996ed82, 0x4e287697bad2f9a2)) + + key = rand(UInt128) + r = ARS1x(key) + r1 = ARS4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + 
set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end diff --git a/test/aesni.jl b/test/aesni.jl deleted file mode 100644 index 7fc9ab4..0000000 --- a/test/aesni.jl +++ /dev/null @@ -1,30 +0,0 @@ -if R123_USE_AESNI - -import Random: seed! -using Test: @test - -using RandomNumbers -using Random123 - -import RandomNumbers: split_uint -import Random123: __m128i, AESNIKey - -x = zero(__m128i) -ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) -key = 0x07b8e4b6aa98ec245a7da274d3b8146a -aesni_key = AESNIKey(key) -@test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 - -r = AESNI1x(key) -r1 = AESNI4x(split_uint(key, UInt32)) -@test seed_type(r) ≡ UInt128 -@test seed_type(r1) ≡ NTuple{4, UInt32} -@test copyto!(copy(r), r) == r -@test copyto!(copy(r1), r1) == r1 -@test UInt128(r.x) ≡ rand(r1, UInt128) -@test rand(r, UInt128) ≡ rand(r1, UInt128) -set_counter!(r, 0) -set_counter!(r1, 1) -@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) - -end diff --git a/test/ars.jl b/test/ars.jl deleted file mode 100644 index 648687d..0000000 --- a/test/ars.jl +++ /dev/null @@ -1,33 +0,0 @@ -if R123_USE_AESNI - -import Random: seed! 
-using Test: @test - -using RandomNumbers -using Random123 - -import RandomNumbers: split_uint -import Random123: __m128i - -x = zero(__m128i) -ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) -key = __m128i(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) -@test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef -@test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 -@test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xb6621a8b006319e8, 0x67c841642c32fc19)) -@test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xac35df44f996ed82, 0x4e287697bad2f9a2)) - -key = rand(UInt128) -r = ARS1x(key) -r1 = ARS4x(split_uint(key, UInt32)) -@test seed_type(r) ≡ UInt128 -@test seed_type(r1) ≡ NTuple{4, UInt32} -@test copyto!(copy(r), r) == r -@test copyto!(copy(r1), r1) == r1 -@test UInt128(r.x) ≡ rand(r1, UInt128) -@test rand(r, UInt128) ≡ rand(r1, UInt128) -set_counter!(r, 0) -set_counter!(r1, 1) -@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) - -end diff --git a/test/runtests.jl b/test/runtests.jl index 17000d3..913ad19 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,7 +12,8 @@ using Printf: @printf seed1 = 1 seed2 = (1,2) seed4 = (1,2,3,4) - for (rng, alg, options) in [ + AlgChoice = Tuple{Random123.AbstractR123, Function, Union{Tuple{}, Tuple{Val}}} + alg_choices = AlgChoice[ (Threefry2x(UInt32, seed2) , threefry, (Val(20),)) , (Threefry2x(UInt64, seed2) , threefry, (Val(20),)) , (Threefry4x(UInt32, seed4) , threefry, (Val(20),)) , @@ -21,11 +22,16 @@ using Printf: @printf (Philox2x(UInt64 , seed1) , philox , (Val(10),)) , (Philox4x(UInt32 , seed2) , philox , (Val(10),)) , (Philox4x(UInt64 , seed2) , philox , (Val(10),)) , - (AESNI1x(seed1) , aesni , () ) , - (AESNI4x(seed4) , aesni , () ) , - (ARS1x(seed1) , ars , (Val(7),) ) , - (ARS4x(seed4) , ars , (Val(7),) ) , ] + @static if R123_USE_AESNI + append!(alg_choices, AlgChoice[ + (AESNI1x(seed1) , aesni , () ) , + (AESNI4x(seed4) , aesni , () 
) , + (ARS1x(seed1) , ars , (Val(7),) ) , + (ARS4x(seed4) , ars , (Val(7),) ) , + ]) + end + for (rng, alg, options) in alg_choices key = @inferred get_key(rng) ctr = @inferred get_ctr(rng) @test isbitstype(typeof(key)) @@ -89,17 +95,19 @@ end @test x9 === y9 end - rng = ARS1x(1) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - - rng = AESNI1x(1) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + if R123_USE_AESNI + rng = ARS1x(1) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + + rng = AESNI1x(1) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + end end @@ -164,5 +172,10 @@ redirect_stdout(stdout_) compare_dirs("expected", "actual") cd(pwd_) -include("aesni.jl") -include("ars.jl") +@static if Random123.R123_USE_X86_AES_NI + include("./x86/aesni.jl") + include("./x86/ars.jl") +elseif Random123.R123_USE_AARCH64_FEAT_AES + include("./aarch64/aesni.jl") + include("./aarch64/ars.jl") +end diff --git a/test/x86/aesni.jl b/test/x86/aesni.jl new file mode 100644 index 0000000..0c112fe --- /dev/null +++ b/test/x86/aesni.jl @@ -0,0 +1,28 @@ 
+import Random: seed! +using Test: @test, @testset + +using RandomNumbers +using Random123 + +import RandomNumbers: split_uint +import Random123: __m128i, AESNIKey + +@testset "Accelerated AESNI" begin + x = zero(__m128i) + ctr = __m128i(0x9799b5d54f7b9227b47607190d0dfefb) + key = 0x07b8e4b6aa98ec245a7da274d3b8146a + aesni_key = AESNIKey(key) + @test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 + + r = AESNI1x(key) + r1 = AESNI4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end diff --git a/test/x86/ars.jl b/test/x86/ars.jl new file mode 100644 index 0000000..38355d0 --- /dev/null +++ b/test/x86/ars.jl @@ -0,0 +1,31 @@ +import Random: seed! +using Test: @test, @testset + +using RandomNumbers +using Random123 + +import RandomNumbers: split_uint +import Random123: __m128i + +@testset "Accelerated ARS" begin + x = zero(__m128i) + ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + key = __m128i(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) + @test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef + @test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 + @test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xb6621a8b006319e8, 0x67c841642c32fc19)) + @test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xac35df44f996ed82, 0x4e287697bad2f9a2)) + + key = rand(UInt128) + r = ARS1x(key) + r1 = ARS4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + 
set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end