quantified-uncertainty · berekuk · Oct 27, 2022 · Oct 7, 2022 · Oct 19, 2022 · Oct 20, 2022
diff --git a/packages/squiggle-lang/src/rescript/Utility/E/E_A.res b/packages/squiggle-lang/src/rescript/Utility/E/E_A.res
@@ -307,48 +307,74 @@ module Floats = {
     /*
       This function goes through a sorted array and divides it into two different clusters:
       continuous samples and discrete samples. The discrete samples are stored in a mutable map.
-      Samples are thought to be discrete if they have at least `minDiscreteWight` duplicates.
-
-      If the min discrete weight is 4, that would mean that at least four elements needed from a specific
-      value for that to be kept as discrete. This is important because in some cases, we can expect that
-      some common elements will be generated by regular operations. The final continuous array will be sorted.
-
-      This function is performance-critical, don't change it significantly without benchmarking
-      SampleSet->PointSet conversion performance.
- */
+      Samples are considered discrete if they have at least `minDiscreteWeight` duplicates.
+      Using a `minDiscreteWeight` higher than 2 is important because sometimes common elements
+      will be generated by regular operations.
+      A `minDiscreteWeight` below 1 is treated as 1.
+      The final continuous array will be sorted.
+
+      The method here is designed for high performance for fairly small `minDiscreteWeight`
+      values for both mostly-continuous and mostly-discrete inputs.
+      For each position i it visits, it compares the value at i to the value where a minimal run starting at i would end.
+      For continuous distributions, the two values essentially never match, keeping branch prediction costs low.
+      If they do match, it finds the complete run with recursive doubling then a binary search,
+      which skips over many elements for long runs.
+    */
     let splitContinuousAndDiscreteForMinWeight = (
       sortedArray: array<float>,
       ~minDiscreteWeight: int,
     ) => {
       let continuous: array<float> = []
       let discrete = FloatFloatMap.empty()
 
-      let addData = (count: int, value: float): unit => {
-        if count >= minDiscreteWeight {
-          FloatFloatMap.add(value, count->Belt.Int.toFloat, discrete)
+      // In a run of exactly minDiscreteWeight, the first and last
+      // element indices differ by minDistance.
+      // The weight is clamped to 1: a smaller weight would make minDistance
+      // negative and index before the start of the array.
+      let minDistance = Js.Math.max_int(minDiscreteWeight, 1) - 1
+
+      let len = length(sortedArray)
+      let i = ref(0)
+      while i.contents < len - minDistance {
+        // We are interested in runs of elements equal to value
+        let value = sortedArray[i.contents]
+        if value != sortedArray[i.contents + minDistance] {
+          // No long run starting at i, so it's continuous
+          Js.Array2.push(continuous, value)->ignore
+          i := i.contents + 1
         } else {
-          for _ in 1 to count {
-            continuous->Js.Array2.push(value)->ignore
+          // Now we know that a run starts at i
+          // Move i forward to next unequal value
+          // That is, find iNext so that isEqualAt(iNext-1) and !isEqualAt(iNext)
+          let iOrig = i.contents
+          let isEqualAt = (ind: int) => ind < len && sortedArray[ind] == value
+          // Gallop: lo is always a known member of the run, and step doubles
+          // until lo + step is past the run, so iNext ends up in (lo, lo + step].
+          // step starts at 1 when minDistance is 0; a step of 0 would never advance.
+          let lo = ref(iOrig + minDistance)
+          let step = ref(Js.Math.max_int(minDistance, 1))
+          while isEqualAt(lo.contents + step.contents) {
+            lo := lo.contents + step.contents
+            step := step.contents * 2
           }
-        }
-      }
-
-      let (finalCount, finalValue) = sortedArray->Belt.Array.reduce(
-        // initial prev value doesn't matter; if it collides with the first element of the array, flush won't do anything
-        (0, 0.),
-        ((count, prev), element) => {
-          if element == prev {
-            (count + 1, prev)
-          } else {
-            // new value, process previous ones
-            addData(count, prev)
-            (1, element)
+          // Maintain iNext in (lo, i]. Once lo+1 == i, i is iNext.
+          i := Js.Math.min_int(lo.contents + step.contents, len)
+          while i.contents - lo.contents > 1 {
+            let mid = lo.contents + (i.contents - lo.contents) / 2
+            if sortedArray[mid] == value {
+              lo := mid
+            } else {
+              i := mid
+            }
           }
-        },
-      )
 
-      // flush final values
-      addData(finalCount, finalValue)
+          let count = i.contents - iOrig
+          FloatFloatMap.add(value, count->Belt.Int.toFloat, discrete)
+        }
+      }
+      // Remaining values are continuous
+      let tail = Belt.Array.sliceToEnd(sortedArray, i.contents)
+      Js.Array2.pushMany(continuous, tail)->ignore
 
       (continuous, discrete)
     }