Merge pull request #317 from StephenButtolph/sampling

added new sampling algos and optimized initializations
ava-labs · Aug 23, 2020 · 053a094 · 053a094
2 parents 170716e + c6e95f0
commit 053a094
Show file tree

Hide file tree

Showing 14 changed files with 285 additions and 35 deletions.
diff --git a/snow/validators/set.go b/snow/validators/set.go
@@ -71,6 +71,14 @@ func NewSet() Set {
 	}
 }
 
+// NewBestSet returns a new, empty set of validators that samples with sampler.NewBestWeightedWithoutReplacement(expectedSampleSize).
+func NewBestSet(expectedSampleSize int) Set {
+	return &set{
+		vdrMap:  make(map[[20]byte]int),
+		sampler: sampler.NewBestWeightedWithoutReplacement(expectedSampleSize),
+	}
+}
+
 // set of validators. Validator function results are cached. Therefore, to
 // update a validators weight, one should ensure to call add with the updated
 // validator.

diff --git a/utils/sampler/uniform.go b/utils/sampler/uniform.go
@@ -8,3 +8,6 @@ type Uniform interface {
 	Initialize(sampleRange uint64) error
 	Sample(length int) ([]uint64, error)
 }
+
+// NewUniform returns a new uniform sampler backed by uniformReplacer.
+func NewUniform() Uniform { return &uniformReplacer{} }
diff --git a/utils/sampler/uniform_benchmark_test.go b/utils/sampler/uniform_benchmark_test.go
@@ -10,18 +10,17 @@ import (
 
 // BenchmarkAllUniform
 func BenchmarkAllUniform(b *testing.B) {
-	sizes := []int{
-		1,
-		5,
-		25,
-		50,
-		75,
-		100,
+	sizes := []uint64{
+		30,
+		35,
+		500,
+		10000,
+		100000,
 	}
 	for _, s := range uniformSamplers {
 		for _, size := range sizes {
 			b.Run(fmt.Sprintf("sampler %s with %d elements uniformly", s.name, size), func(b *testing.B) {
-				UniformBenchmark(b, s.sampler, 1000000, size)
+				UniformBenchmark(b, s.sampler, size, 30)
 			})
 		}
 	}

diff --git a/utils/sampler/uniform_best.go b/utils/sampler/uniform_best.go
@@ -0,0 +1,81 @@
+// (c) 2019-2020, Ava Labs, Inc. All rights reserved.
+// See the file LICENSE for licensing terms.
+
+package sampler
+
+import (
+	"errors"
+	"math"
+	"math/rand"
+	"time"
+
+	"github.com/ava-labs/gecko/utils/timer"
+)
+
+var (
+	errNoValidUniformSamplers = errors.New("no valid uniform samplers found")
+)
+
+func init() { rand.Seed(time.Now().UnixNano()) }
+
+// uniformBest implements the Uniform interface.
+//
+// Sampling is performed by using another implementation of the Uniform
+// interface.
+//
+// Initialization attempts to find the best sampling algorithm given the dataset
+// by performing a benchmark of the provided implementations.
+type uniformBest struct {
+	Uniform
+	samplers            []Uniform
+	maxSampleSize       int
+	benchmarkIterations int
+	clock               timer.Clock
+}
+
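+// NewBestUniform returns a uniform sampler that benchmarks uniformReplacer and uniformResample on Initialize and delegates to the faster one; expectedSampleSize caps the sample size used in that benchmark.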
+func NewBestUniform(expectedSampleSize int) Uniform {
+	return &uniformBest{
+		samplers: []Uniform{
+			&uniformReplacer{},
+			&uniformResample{},
+		},
+		maxSampleSize:       expectedSampleSize,
+		benchmarkIterations: 100,
+	}
+}
+
+func (s *uniformBest) Initialize(length uint64) error {
+	s.Uniform = nil
+	bestDuration := time.Duration(math.MaxInt64)
+
+	sampleSize := s.maxSampleSize
+	if length < uint64(sampleSize) {
+		sampleSize = int(length)
+	}
+
+samplerLoop:
+	for _, sampler := range s.samplers {
+		if err := sampler.Initialize(length); err != nil {
+			continue
+		}
+
+		start := s.clock.Time()
+		for i := 0; i < s.benchmarkIterations; i++ {
+			if _, err := sampler.Sample(sampleSize); err != nil {
+				continue samplerLoop
+			}
+		}
+		end := s.clock.Time()
+		duration := end.Sub(start)
+		if duration < bestDuration {
+			bestDuration = duration
+			s.Uniform = sampler
+		}
+	}
+
+	if s.Uniform == nil {
+		return errNoValidUniformSamplers
+	}
+	return nil
+}
diff --git a/utils/sampler/uniform_replacer.go b/utils/sampler/uniform_replacer.go
@@ -35,9 +35,6 @@ type uniformReplacer struct {
 	length uint64
 }
 
-// NewUniform returns a new sampler
-func NewUniform() Uniform { return &uniformReplacer{} }
-
 func (s *uniformReplacer) Initialize(length uint64) error {
 	if length > math.MaxInt64 {
 		return errOutOfRange

diff --git a/utils/sampler/uniform_resample.go b/utils/sampler/uniform_resample.go
@@ -0,0 +1,54 @@
+// (c) 2019-2020, Ava Labs, Inc. All rights reserved.
+// See the file LICENSE for licensing terms.
+
+package sampler
+
+import (
+	"math"
+	"math/rand"
+	"time"
+)
+
+func init() { rand.Seed(time.Now().UnixNano()) }
+
+// uniformResample allows for sampling over a uniform distribution without
+// replacement.
+//
+// Sampling is performed by sampling with replacement and resampling if a
+// duplicate is sampled.
+//
+// Initialization takes O(1) time.
+//
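+// Sampling takes expected O(count) time when count is small relative to the length (more as count approaches it, since duplicates are redrawn) and O(count) space.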
+type uniformResample struct {
+	length uint64
+}
+
+func (s *uniformResample) Initialize(length uint64) error {
+	if length > math.MaxInt64 {
+		return errOutOfRange
+	}
+	s.length = length
+	return nil
+}
+
+func (s *uniformResample) Sample(count int) ([]uint64, error) {
+	if count < 0 || s.length < uint64(count) {
+		return nil, errOutOfRange
+	}
+
+	drawn := make(map[uint64]struct{}, count)
+	results := make([]uint64, count)
+	for i := 0; i < count; i++ {
+		draw := uint64(rand.Int63n(int64(s.length)))
+		if _, ok := drawn[draw]; ok {
+			i--
+			continue
+		}
+		drawn[draw] = struct{}{}
+
+		results[i] = draw
+	}
+
+	return results, nil
+}
diff --git a/utils/sampler/uniform_test.go b/utils/sampler/uniform_test.go
@@ -22,6 +22,14 @@ var (
 			name:    "replacer",
 			sampler: &uniformReplacer{},
 		},
+		{
+			name:    "resampler",
+			sampler: &uniformResample{},
+		},
+		{
+			name:    "best",
+			sampler: NewBestUniform(30),
+		},
 	}
 	uniformTests = []struct {
 		name string

diff --git a/utils/sampler/weighted.go b/utils/sampler/weighted.go
@@ -15,3 +15,17 @@ type Weighted interface {
 	Initialize(weights []uint64) error
 	Sample(sampleValue uint64) (int, error)
 }
+
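+// NewWeighted returns a new weightedBest sampler that chooses among weightedArray, weightedHeap, and weightedUniform (maxWeight 1 << 10), using 100 benchmark iterations.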
+func NewWeighted() Weighted {
+	return &weightedBest{
+		samplers: []Weighted{
+			&weightedArray{},
+			&weightedHeap{},
+			&weightedUniform{
+				maxWeight: 1 << 10,
+			},
+		},
+		benchmarkIterations: 100,
+	}
+}
diff --git a/utils/sampler/weighted_benchmark_test.go b/utils/sampler/weighted_benchmark_test.go
@@ -13,8 +13,8 @@ import (
 	safemath "github.com/ava-labs/gecko/utils/math"
 )
 
-// BenchmarkAllWeighted
-func BenchmarkAllWeighted(b *testing.B) {
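+// BenchmarkAllWeightedSampling runs WeightedPowBenchmarkSampler and WeightedSingletonBenchmarkSampler for every entry in weightedSamplers.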
+func BenchmarkAllWeightedSampling(b *testing.B) {
 	pows := []float64{
 		0,
 		1,
@@ -31,17 +31,52 @@ func BenchmarkAllWeighted(b *testing.B) {
 	for _, s := range weightedSamplers {
 		for _, pow := range pows {
 			for _, size := range sizes {
-				if WeightedPowBenchmark(b, s.sampler, pow, size) {
+				if WeightedPowBenchmarkSampler(b, s.sampler, pow, size) {
 					b.Run(fmt.Sprintf("sampler %s with %d elements at x^%.1f", s.name, size, pow), func(b *testing.B) {
-						WeightedPowBenchmark(b, s.sampler, pow, size)
+						WeightedPowBenchmarkSampler(b, s.sampler, pow, size)
 					})
 				}
 			}
 		}
 		for _, size := range sizes {
-			if WeightedSingletonBenchmark(b, s.sampler, size) {
+			if WeightedSingletonBenchmarkSampler(b, s.sampler, size) {
 				b.Run(fmt.Sprintf("sampler %s with %d singleton elements", s.name, size), func(b *testing.B) {
-					WeightedSingletonBenchmark(b, s.sampler, size)
+					WeightedSingletonBenchmarkSampler(b, s.sampler, size)
+				})
+			}
+		}
+	}
+}
+
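+// BenchmarkAllWeightedInitializer benchmarks Initialize for every entry in weightedSamplers, using weights from CalcWeightedPoW and weights with one dominant element and the rest set to 1.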
+func BenchmarkAllWeightedInitializer(b *testing.B) {
+	pows := []float64{
+		0,
+		1,
+		2,
+		3,
+	}
+	sizes := []int{
+		10,
+		500,
+		1000,
+		50000,
+		100000,
+	}
+	for _, s := range weightedSamplers {
+		for _, pow := range pows {
+			for _, size := range sizes {
+				if WeightedPowBenchmarkSampler(b, s.sampler, pow, size) {
+					b.Run(fmt.Sprintf("initializer %s with %d elements at x^%.1f", s.name, size, pow), func(b *testing.B) {
+						WeightedPowBenchmarkInitializer(b, s.sampler, pow, size)
+					})
+				}
+			}
+		}
+		for _, size := range sizes {
+			if WeightedSingletonBenchmarkSampler(b, s.sampler, size) {
+				b.Run(fmt.Sprintf("initializer %s with %d singleton elements", s.name, size), func(b *testing.B) {
+					WeightedSingletonBenchmarkInitializer(b, s.sampler, size)
 				})
 			}
 		}
@@ -67,7 +102,7 @@ func CalcWeightedPoW(exponent float64, size int) (uint64, []uint64, error) {
 	return totalWeight, weights, nil
 }
 
-func WeightedPowBenchmark(
+func WeightedPowBenchmarkSampler(
 	b *testing.B,
 	s Weighted,
 	exponent float64,
@@ -88,7 +123,7 @@ func WeightedPowBenchmark(
 	return true
 }
 
-func WeightedSingletonBenchmark(b *testing.B, s Weighted, size int) bool {
+func WeightedSingletonBenchmarkSampler(b *testing.B, s Weighted, size int) bool {
 	weights := make([]uint64, size)
 	weights[0] = uint64(math.MaxInt64 - size + 1)
 	for i := 1; i < len(weights); i++ {
@@ -106,3 +141,30 @@ func WeightedSingletonBenchmark(b *testing.B, s Weighted, size int) bool {
 	}
 	return true
 }
+
+func WeightedPowBenchmarkInitializer(
+	b *testing.B,
+	s Weighted,
+	exponent float64,
+	size int,
+) {
+	_, weights, _ := CalcWeightedPoW(exponent, size)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = s.Initialize(weights)
+	}
+}
+
+func WeightedSingletonBenchmarkInitializer(b *testing.B, s Weighted, size int) {
+	weights := make([]uint64, size)
+	weights[0] = uint64(math.MaxInt64 - size + 1)
+	for i := 1; i < len(weights); i++ {
+		weights[i] = 1
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = s.Initialize(weights)
+	}
+}
diff --git a/utils/sampler/weighted_best.go b/utils/sampler/weighted_best.go
@@ -15,7 +15,7 @@ import (
 )
 
 var (
-	errNoValidSamplers = errors.New("no valid samplers found")
+	errNoValidWeightedSamplers = errors.New("no valid weighted samplers found")
 )
 
 func init() { rand.Seed(time.Now().UnixNano()) }
@@ -48,9 +48,12 @@ func (s *weightedBest) Initialize(weights []uint64) error {
 		return errWeightsTooLarge
 	}
 
-	samples := make([]uint64, s.benchmarkIterations)
-	for i := range samples {
-		samples[i] = uint64(rand.Int63n(int64(totalWeight)))
+	samples := []uint64(nil)
+	if totalWeight > 0 {
+		samples = make([]uint64, s.benchmarkIterations)
+		for i := range samples {
+			samples[i] = uint64(rand.Int63n(int64(totalWeight)))
+		}
 	}
 
 	s.Weighted = nil
@@ -75,7 +78,7 @@ func (s *weightedBest) Initialize(weights []uint64) error {
 	}
 
 	if s.Weighted == nil {
-		return errNoValidSamplers
+		return errNoValidWeightedSamplers
 	}
 	return nil
 }
diff --git a/utils/sampler/weighted_heap.go b/utils/sampler/weighted_heap.go
@@ -47,7 +47,9 @@ func (s *weightedHeap) Initialize(weights []uint64) error {
 
 	// Initialize the heap
 	for i := len(s.heap) - 1; i > 0; i-- {
-		parentIndex := (i - 1) / 2
+		// i > 0 here, so i-1 is non-negative and the shift equals (i-1)/2. Using
+		// the shift lets the compiler skip the negative-operand handling of signed division.
+		parentIndex := (i - 1) >> 1
 		newWeight, err := safemath.Add64(
 			s.heap[parentIndex].cumulativeWeight,
 			s.heap[i].cumulativeWeight,