diff --git a/ec/affine_t.hpp b/ec/affine_t.hpp new file mode 100644 index 0000000..2766014 --- /dev/null +++ b/ec/affine_t.hpp @@ -0,0 +1,190 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef __SPPARK_EC_AFFINE_T_HPP__ +#define __SPPARK_EC_AFFINE_T_HPP__ + +template class Affine_t; +template class Affine_inf_t; +template class jacobian_t; +template class xyzz_t; + +#ifndef __CUDACC__ +# undef __host__ +# define __host__ +# undef __device__ +# define __device__ +# undef __noinline__ +# define __noinline__ +#endif + +template +class Affine_t { + friend class Affine_inf_t; + friend class jacobian_t; + friend class xyzz_t; + + field_t X, Y; + +public: + Affine_t(const field_t& x, const field_t& y) : X(x), Y(y) {} + inline __host__ __device__ Affine_t() {} + +#ifdef __CUDA_ARCH__ + inline __device__ bool is_inf() const + { return (bool)(X.is_zero(Y)); } +#else + inline __host__ bool is_inf() const + { return (bool)(X.is_zero() & Y.is_zero()); } +#endif + + inline __host__ Affine_t& operator=(const jacobian_t& a) + { + Y = 1/a.Z; + X = Y^2; // 1/Z^2 + Y *= X; // 1/Z^3 + X *= a.X; // X/Z^2 + Y *= a.Y; // Y/Z^3 + return *this; + } + inline __host__ Affine_t(const jacobian_t& a) { *this = a; } + + inline __host__ Affine_t& operator=(const xyzz_t& a) + { + Y = 1/a.ZZZ; + X = Y * a.ZZ; // 1/Z + X = X^2; // 1/Z^2 + X *= a.X; // X/Z^2 + Y *= a.Y; // Y/Z^3 + return *this; + } + inline __host__ Affine_t(const xyzz_t& a) { *this = a; } + + inline __host__ __device__ operator jacobian_t() const + { + jacobian_t p; + p.X = X; + p.Y = Y; + p.Z = field_t::one(is_inf()); + return p; + } + + inline __host__ __device__ operator xyzz_t() const + { + xyzz_t p; + p.X = X; + p.Y = Y; + p.ZZZ = p.ZZ = field_t::one(is_inf()); + return p; + } + +#ifdef __NVCC__ + class mem_t { + field_h X, Y; + + public: + inline __device__ operator Affine_t() const + { + Affine_t p; + p.X = X; + p.Y = Y; + return p; + } + }; +#else + using mem_t = Affine_t; +#endif + +#ifndef NDEBUG + friend inline __host__ bool operator==(const Affine_t& a, const Affine_t& b) + { return (a.X == b.X) & (a.Y == b.Y); } + + friend inline __host__ bool operator!=(const Affine_t& a, const Affine_t& b) + { return !(a == b); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend __host__ std::ostream& operator<<(std::ostream& os, const Affine_t& p) + { + return os << "X: " << p.X << std::endl + << "Y: " << p.Y; + } +# endif +#endif +}; + +template +class Affine_inf_t { + field_t X, Y; + bool inf; + + inline __host__ __device__ bool is_inf() const + { return inf; } + +public: + inline __host__ __device__ operator Affine_t() const + { + bool inf = is_inf(); + Affine_t p; + p.X = czero(X, inf); + p.Y = czero(Y, inf); + return p; + } + + inline __host__ __device__ operator jacobian_t() const + { + jacobian_t p; + p.X = X; + p.Y = Y; + p.Z = field_t::one(is_inf()); + return p; + } + + inline __host__ __device__ operator xyzz_t() const + { + xyzz_t p; + p.X = X; + p.Y = Y; + p.ZZZ = p.ZZ = field_t::one(is_inf()); + return p; + } + +#ifdef __NVCC__ + class mem_t { + field_h X, Y; +#ifdef __CUDACC__ + int inf[sizeof(field_t)%16 ? 2 : 4]; + + inline __host__ __device__ bool is_inf() const + { return inf[0]&1 != 0; } +#else + bool inf; + + inline __host__ __device__ bool is_inf() const + { return inf; } +#endif + public: + inline __device__ operator Affine_t() const + { + bool inf = is_inf(); + Affine_t p; + p.X = czero((field_t)X, inf); + p.Y = czero((field_t)Y, inf); + return p; + } + + inline __device__ operator Affine_inf_t() const + { + bool inf = is_inf(); + Affine_inf_t p; + p.X = czero((field_t)X, inf); + p.Y = czero((field_t)Y, inf); + p.inf = inf; + return p; + } + }; +#else + using mem_t = Affine_inf_t; +#endif +}; +#endif diff --git a/ec/jacobian_t.hpp b/ec/jacobian_t.hpp index 1c7eaa0..53cb1a3 100644 --- a/ec/jacobian_t.hpp +++ b/ec/jacobian_t.hpp @@ -5,39 +5,27 @@ #ifndef __SPPARK_EC_JACOBIAN_T_HPP__ #define __SPPARK_EC_JACOBIAN_T_HPP__ -template class jacobian_t { +#ifndef __SPPARK_EC_AFFINE_T_HPP__ +# include "affine_t.hpp" +#endif + +template +class jacobian_t { + friend class Affine_t; + friend class Affine_inf_t; + field_t X, Y, Z; inline operator const void*() const { return this; } inline operator void*() { return this; } public: + using affine_t = Affine_t; + jacobian_t() {} jacobian_t(const field_t& x, const field_t& y, const field_t& z) : X(x), Y(y), Z(z) {} - class affine_t { friend jacobian_t; - field_t X, Y; - - public: - affine_t() {} - affine_t(const field_t& x, const field_t& y) : X(x), Y(y) {} - - inline bool is_inf() const - { return (bool)(X.is_zero() & Y.is_zero()); } - - inline affine_t& operator=(const jacobian_t& a) - { - Y = 1/a.Z; - X = Y^2; // 1/Z^2 - Y *= X; // 1/Z^3 - X *= a.X; // X/Z^2 - Y *= a.Y; // Y/Z^3 - return *this; - } - inline affine_t(const jacobian_t& a) { *this = a; } - }; - inline operator affine_t() const { return affine_t(*this); } inline jacobian_t& operator=(const affine_t& a) @@ -538,5 +526,37 @@ template class jacobian_t { } *this = p3; } + +#ifndef NDEBUG + friend inline bool operator==(const jacobian_t& a, const jacobian_t& b) + { + field_t X1, Y1, X2, Y2; + Y1 = a.Z^2; + Y2 = b.Z^2; + + X2 = a.X * Y2; // a.X * b.Z^2 + X1 = b.X * Y1; // b.X * a.Z^2 + + Y2 *= a.Y; // a.Y * b.Z^2 + Y1 *= b.Y; // b.Y * a.Z^2 + + Y2 *= b.Z; // a.Y * b.Z^3 + Y1 *= a.Z; // b.Y * a.Z^3 + + return (X1 == X2 & Y1 == Y2) & (a.is_inf() ^ b.is_inf() ^ 1); + } + + friend inline bool operator!=(const jacobian_t& a, const jacobian_t& b) + { return !(a == b); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend std::ostream& operator<<(std::ostream& os, const jacobian_t& p) + { + return os << "X: " << p.X << std::endl + << "Y: " << p.Y << std::endl + << "Z: " << p.Z; + } +# endif +#endif }; #endif diff --git a/ec/xyzz_t.hpp b/ec/xyzz_t.hpp index 403a2f3..0e21ddc 100644 --- a/ec/xyzz_t.hpp +++ b/ec/xyzz_t.hpp @@ -5,22 +5,28 @@ #ifndef __SPPARK_EC_XYZZ_T_HPP__ #define __SPPARK_EC_XYZZ_T_HPP__ -#ifndef __CUDACC__ -# undef __host__ -# define __host__ -# undef __device__ -# define __device__ -# undef __noinline__ -# define __noinline__ +#ifndef __SPPARK_EC_AFFINE_T_HPP__ +# include "affine_t.hpp" #endif -template +template class xyzz_t { + friend class Affine_t; + friend class Affine_inf_t; + field_t X, Y, ZZZ, ZZ; public: + using affine_t = Affine_t; + using affine_inf_t = Affine_inf_t; + static const unsigned int degree = field_t::degree; + inline __host__ __device__ xyzz_t() {} + inline __host__ __device__ + xyzz_t(const field_t& x, const field_t& y, const field_t& zzz, const field_t& zz) : + X(x), Y(y), ZZZ(zzz), ZZ(zz) {} + #ifdef __NVCC__ class mem_t { friend class xyzz_t; field_h X, Y, ZZZ, ZZ; @@ -57,121 +63,6 @@ class xyzz_t { using mem_t = xyzz_t; #endif - class affine_t { friend class xyzz_t; - field_t X, Y; - - public: - affine_t(const field_t& x, const field_t& y) : X(x), Y(y) {} - inline __host__ __device__ affine_t() {} - -#ifdef __CUDA_ARCH__ - inline __device__ bool is_inf() const - { return (bool)(X.is_zero(Y)); } -#else - inline __host__ bool is_inf() const - { return (bool)(X.is_zero() & Y.is_zero()); } -#endif - - inline __host__ affine_t& operator=(const xyzz_t& a) - { - Y = 1/a.ZZZ; - X = Y * a.ZZ; // 1/Z - X = X^2; // 1/Z^2 - X *= a.X; // X/Z^2 - Y *= a.Y; // Y/Z^3 - return *this; - } - inline __host__ affine_t(const xyzz_t& a) { *this = a; } - -#ifdef __SPPARK_EC_JACOBIAN_T_HPP__ - inline operator jacobian_t() const - { return jacobian_t{ X, Y, field_t::one(is_inf()) }; } -#endif - - inline __host__ __device__ operator xyzz_t() const - { - xyzz_t p; - p.X = X; - p.Y = Y; - p.ZZZ = p.ZZ = field_t::one(is_inf()); - return p; - } - -#ifdef __NVCC__ - class mem_t { - field_h X, Y; - - public: - inline __device__ operator affine_t() const - { - affine_t p; - p.X = X; - p.Y = Y; - return p; - } - }; -#else - using mem_t = affine_t; -#endif - }; - - class affine_inf_t { - field_t X, Y; - bool inf; - - inline __host__ __device__ bool is_inf() const - { return inf; } - - public: - inline __device__ operator affine_t() const - { - bool inf = is_inf(); - affine_t p; - p.X = czero(X, inf); - p.Y = czero(Y, inf); - return p; - } - -#ifdef __NVCC__ - class mem_t { - field_h X, Y; -#ifdef __CUDACC__ - int inf[sizeof(field_t)%16 ? 2 : 4]; - - inline __host__ __device__ bool is_inf() const - { return inf[0]&1 != 0; } -#else - bool inf; - - inline __host__ __device__ bool is_inf() const - { return inf; } -#endif - public: - inline __device__ operator affine_t() const - { - bool inf = is_inf(); - affine_t p; - p.X = czero((field_t)X, inf); - p.Y = czero((field_t)Y, inf); - return p; - } - - inline __device__ operator affine_inf_t() const - { - bool inf = is_inf(); - affine_inf_t p; - p.X = czero((field_t)X, inf); - p.Y = czero((field_t)Y, inf); - p.inf = inf; - return p; - } - }; -#else - using mem_t = affine_inf_t; -#endif - }; - - template inline __host__ __device__ xyzz_t& operator=(const affine_t& a) { X = a.X; @@ -180,12 +71,11 @@ class xyzz_t { return *this; } - inline __host__ operator affine_t() const { return affine_t(*this); } + inline __host__ operator affine_t() const + { return affine_t(*this); } -#ifdef __SPPARK_EC_JACOBIAN_T_HPP__ inline operator jacobian_t() const { return jacobian_t{ X*ZZ, Y*ZZZ, ZZ }; } -#endif #ifdef __CUDA_ARCH__ inline __device__ bool is_inf() const { return (bool)(ZZZ.is_zero(ZZ)); } @@ -414,7 +304,6 @@ class xyzz_t { * with twists to handle even subtractions and either input at infinity. * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. */ - template __host__ __device__ void add(const affine_t& p2, bool subtract = false) { #ifdef __CUDA_ARCH__ @@ -488,7 +377,6 @@ class xyzz_t { } #ifdef __CUDA_ARCH__ - template __device__ void uadd(const affine_t& p2, bool subtract = false) { @@ -598,7 +486,6 @@ class xyzz_t { *this = p31; } #else - template inline void uadd(const affine_t& p2, bool subtract = false) { add(p2, subtract); } #endif @@ -616,5 +503,32 @@ class xyzz_t { return ret; } #endif + +#ifndef NDEBUG + friend inline __host__ bool operator==(const xyzz_t& a, const xyzz_t& b) + { + field_t X1, Y1, X2, Y2; + X1 = a.X * b.ZZ; + X2 = b.X * a.ZZ; + + Y1 = a.Y * b.ZZZ; + Y2 = b.Y * a.ZZZ; + + return (X1 == X2 & Y1 == Y2) & (a.is_inf() ^ b.is_inf() ^ 1); + } + + friend inline __host__ bool operator!=(const xyzz_t& a, const xyzz_t& b) + { return !(a == b); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend __host__ std::ostream& operator<<(std::ostream& os, const xyzz_t& p) + { + return os << "X: " << p.X << std::endl + << "Y: " << p.Y << std::endl + << "ZZZ: " << p.ZZZ << std::endl + << "ZZ: " << p.ZZ; + } +# endif +#endif }; #endif diff --git a/msm/batch_addition.cuh b/msm/batch_addition.cuh index c4140dc..325bb04 100644 --- a/msm/batch_addition.cuh +++ b/msm/batch_addition.cuh @@ -20,9 +20,9 @@ # error "invalid BATCH_ADD_NSTREAMS" #endif -template + class affine_h = class affine_t::mem_t> __device__ __forceinline__ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints, const uint32_t bitmap[], const uint32_t refmap[], @@ -113,27 +113,27 @@ static void add(bucket_h ret[], const affine_h points[], uint32_t npoints, current = 0; } -template + class affine_h = class affine_t::mem_t> __launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__ void batch_addition(bucket_h ret[], const affine_h points[], uint32_t npoints, const uint32_t bitmap[], bool accumulate = false, uint32_t sid = 0) { add(ret, points, npoints, bitmap, nullptr, accumulate, sid); } -template + class affine_h = class affine_t::mem_t> __launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__ void batch_diff(bucket_h ret[], const affine_h points[], uint32_t npoints, const uint32_t bitmap[], const uint32_t refmap[], bool accumulate = false, uint32_t sid = 0) { add(ret, points, npoints, bitmap, refmap, accumulate, sid); } -template + class affine_h = class affine_t::mem_t> __launch_bounds__(BATCH_ADD_BLOCK_SIZE) __global__ void batch_addition(bucket_h ret[], const affine_h points[], size_t npoints, const uint32_t digits[], const uint32_t& ndigits) diff --git a/msm/pippenger.cuh b/msm/pippenger.cuh index 61d198e..676c504 100644 --- a/msm/pippenger.cuh +++ b/msm/pippenger.cuh @@ -144,9 +144,9 @@ void breakdown(vec2d_t digits, const scalar_t scalars[], size_t len, #endif template + class affine_h = class affine_t::mem_t> __launch_bounds__(ACCUMULATE_NTHREADS) __global__ void accumulate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, /*const*/ affine_h points_[], const vec2d_t digits, @@ -299,16 +299,17 @@ void integrate(bucket_h buckets_[], uint32_t nwins, uint32_t wbits, uint32_t nbi #ifndef SPPARK_DONT_INSTANTIATE_TEMPLATES template __global__ -void accumulate(bucket_t::mem_t buckets_[], - uint32_t nwins, uint32_t wbits, - /*const*/ affine_t::mem_t points_[], - const vec2d_t digits, - const vec2d_t histogram, - uint32_t sid); +void accumulate(bucket_t::mem_t buckets_[], + uint32_t nwins, uint32_t wbits, + /*const*/ affine_t::mem_t points_[], + const vec2d_t digits, + const vec2d_t histogram, + uint32_t sid); template __global__ -void batch_addition(bucket_t::mem_t buckets[], - const affine_t::mem_t points[], size_t npoints, - const uint32_t digits[], const uint32_t& ndigits); +void batch_addition(bucket_t::mem_t buckets[], + const affine_t::mem_t points[], + size_t npoints, const uint32_t digits[], + const uint32_t& ndigits); template __global__ void integrate(bucket_t::mem_t buckets_[], uint32_t nwins, uint32_t wbits, uint32_t nbits); @@ -506,14 +507,14 @@ public: for (uint32_t i = 0; i < batch; i++) { gpu[i&1].wait(ev); - batch_addition<<>>( + batch_addition + <<>>( &d_buckets[nwins << (wbits-1)], &d_points[d_off], num, &d_digits[0][0], d_hist[0][0] ); CUDA_OK(cudaGetLastError()); - gpu[i&1].launch_coop(accumulate, + gpu[i&1].launch_coop(accumulate, {gpu.sm_count(), 0}, d_buckets, nwins, wbits, &d_points[d_off], d_digits, d_hist, i&1 ); @@ -634,7 +635,7 @@ private: for (size_t j = 0; j < lsbits-1-NTHRBITS; j++) raise.dbl(); res.add(raise); - res.add(row[i][0]); + res.add((point_t)row[i][0]); if (i) acc.add(row[i][1]); }