Add KS tests for weighted sampling; use A-ExpJ alg with log-keys (#1530)

dhardy · web-flow · commit bf9d429e89ee · 2024-11-26T11:22:55.000Z
- Extra testing for weighted sampling
- Fix IndexedRandom::choose_multiple_weighted with very small keys
- Use A-ExpJ algorithm with BinaryHeap for better performance with large length / amount
diff --git a/distr_test/Cargo.toml b/distr_test/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
 publish = false
 
 [dev-dependencies]
-rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false }
+rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false, features = ["alloc"] }
 rand = { path = "..", version = "=0.9.0-alpha.1", features = ["small_rng"] }
 num-traits = "0.2.19"
 # Special functions for testing distributions
diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs
@@ -0,0 +1,235 @@
+// Copyright 2024 Developers of the Rand project.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+mod ks;
+use ks::test_discrete;
+use rand::distr::{Distribution, WeightedIndex};
+use rand::seq::{IndexedRandom, IteratorRandom};
+use rand_distr::{WeightedAliasIndex, WeightedTreeIndex};
+
+/// Takes the unnormalized pdf and creates the cdf of a discrete distribution
+fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 {
+    let mut cdf = Vec::with_capacity(num);
+    let mut ac = 0.0;
+    for i in 0..num {
+        ac += f(i as i64);
+        cdf.push(ac);
+    }
+
+    let frac = 1.0 / ac;
+    for x in &mut cdf {
+        *x *= frac;
+    }
+
+    move |i| {
+        if i < 0 {
+            0.0
+        } else {
+            cdf[i as usize]
+        }
+    }
+}
+
+#[test]
+fn weighted_index() {
+    fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
+        let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
+        test_discrete(0, distr, make_cdf(num, weight));
+    }
+
+    test_weights(100, |_| 1.0);
+    test_weights(100, |i| ((i + 1) as f64).ln());
+    test_weights(100, |i| i as f64);
+    test_weights(100, |i| (i as f64).powi(3));
+    test_weights(100, |i| 1.0 / ((i + 1) as f64));
+}
+
+#[test]
+fn weighted_alias_index() {
+    fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
+        let weights = (0..num).map(|i| weight(i as i64)).collect();
+        let distr = WeightedAliasIndex::new(weights).unwrap();
+        test_discrete(0, distr, make_cdf(num, weight));
+    }
+
+    test_weights(100, |_| 1.0);
+    test_weights(100, |i| ((i + 1) as f64).ln());
+    test_weights(100, |i| i as f64);
+    test_weights(100, |i| (i as f64).powi(3));
+    test_weights(100, |i| 1.0 / ((i + 1) as f64));
+}
+
+#[test]
+fn weighted_tree_index() {
+    fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
+        let distr = WeightedTreeIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
+        test_discrete(0, distr, make_cdf(num, weight));
+    }
+
+    test_weights(100, |_| 1.0);
+    test_weights(100, |i| ((i + 1) as f64).ln());
+    test_weights(100, |i| i as f64);
+    test_weights(100, |i| (i as f64).powi(3));
+    test_weights(100, |i| 1.0 / ((i + 1) as f64));
+}
+
+#[test]
+fn choose_weighted_indexed() {
+    struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
+    impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
+        fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
+            *IndexedRandom::choose_weighted(&self.0[..], rng, |i| (self.1)(*i)).unwrap()
+        }
+    }
+
+    fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
+        let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
+        test_discrete(0, distr, make_cdf(num, &weight));
+    }
+
+    test_weights(100, |_| 1.0);
+    test_weights(100, |i| ((i + 1) as f64).ln());
+    test_weights(100, |i| i as f64);
+    test_weights(100, |i| (i as f64).powi(3));
+    test_weights(100, |i| 1.0 / ((i + 1) as f64));
+}
+
+#[test]
+fn choose_one_weighted_indexed() {
+    struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
+    impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
+        fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
+            *IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 1, |i| (self.1)(*i))
+                .unwrap()
+                .next()
+                .unwrap()
+        }
+    }
+
+    fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
+        let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
+        test_discrete(0, distr, make_cdf(num, &weight));
+    }
+
+    test_weights(100, |_| 1.0);
+    test_weights(100, |i| ((i + 1) as f64).ln());
+    test_weights(100, |i| i as f64);
+    test_weights(100, |i| (i as f64).powi(3));
+    test_weights(100, |i| 1.0 / ((i + 1) as f64));
+}
+
+#[test]
+fn choose_two_weighted_indexed() {
+    struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
+    impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
+        fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
+            let mut iter =
+                IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i))
+                    .unwrap();
+            let mut a = *iter.next().unwrap();
+            let mut b = *iter.next().unwrap();
+            assert!(iter.next().is_none());
+            if b < a {
+                std::mem::swap(&mut a, &mut b);
+            }
+            a * self.0.len() as i64 + b
+        }
+    }
+
+    fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
+        let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
+
+        let pmf1 = (0..num).map(|i| weight(i as i64)).collect::<Vec<f64>>();
+        let sum: f64 = pmf1.iter().sum();
+        let frac = 1.0 / sum;
+
+        let mut ac = 0.0;
+        let mut cdf = Vec::with_capacity(num * num);
+        for a in 0..num {
+            for b in 0..num {
+                if a < b {
+                    let pa = pmf1[a] * frac;
+                    let pab = pa * pmf1[b] / (sum - pmf1[a]);
+
+                    let pb = pmf1[b] * frac;
+                    let pba = pb * pmf1[a] / (sum - pmf1[b]);
+
+                    ac += pab + pba;
+                }
+                cdf.push(ac);
+            }
+        }
+        assert!((cdf.last().unwrap() - 1.0).abs() < 1e-9);
+
+        let cdf = |i| {
+            if i < 0 {
+                0.0
+            } else {
+                cdf[i as usize]
+            }
+        };
+
+        test_discrete(0, distr, cdf);
+    }
+
+    test_weights(100, |_| 1.0);
+    test_weights(100, |i| ((i + 1) as f64).ln());
+    test_weights(100, |i| i as f64);
+    test_weights(100, |i| (i as f64).powi(3));
+    test_weights(100, |i| 1.0 / ((i + 1) as f64));
+    test_weights(10, |i| ((i + 1) as f64).powi(-8));
+}
+
+#[test]
+fn choose_iterator() {
+    struct Adapter<I>(I);
+    impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
+        fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
+            IteratorRandom::choose(self.0.clone(), rng).unwrap()
+        }
+    }
+
+    let distr = Adapter((0..100).map(|i| i as i64));
+    test_discrete(0, distr, make_cdf(100, |_| 1.0));
+}
+
+#[test]
+fn choose_stable_iterator() {
+    struct Adapter<I>(I);
+    impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
+        fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
+            IteratorRandom::choose_stable(self.0.clone(), rng).unwrap()
+        }
+    }
+
+    let distr = Adapter((0..100).map(|i| i as i64));
+    test_discrete(0, distr, make_cdf(100, |_| 1.0));
+}
+
+#[test]
+fn choose_two_iterator() {
+    struct Adapter<I>(I);
+    impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
+        fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
+            let mut buf = [0; 2];
+            IteratorRandom::choose_multiple_fill(self.0.clone(), rng, &mut buf);
+            buf.sort_unstable();
+            assert!(buf[0] < 99 && buf[1] >= 1);
+            let a = buf[0];
+            4950 - (99 - a) * (100 - a) / 2 + buf[1] - a - 1
+        }
+    }
+
+    let distr = Adapter((0..100).map(|i| i as i64));
+
+    test_discrete(
+        0,
+        distr,
+        |i| if i < 0 { 0.0 } else { (i + 1) as f64 / 4950.0 },
+    );
+}
diff --git a/src/seq/index.rs b/src/seq/index.rs
@@ -333,8 +333,8 @@ where
 /// ordering). The weights are to be provided by the input function `weights`,
 /// which will be called once for each index.
 ///
-/// This implementation uses the algorithm described by Efraimidis and Spirakis
-/// in this paper: <https://doi.org/10.1016/j.ipl.2005.11.003>
+/// This implementation is based on the algorithm A-ExpJ as found in
+/// [Efraimidis and Spirakis, 2005](https://doi.org/10.1016/j.ipl.2005.11.003).
 /// It uses `O(length + amount)` space and `O(length)` time.
 ///
 /// Error cases:
@@ -354,7 +354,7 @@ where
     N: UInt,
     IndexVec: From<Vec<N>>,
 {
-    use std::cmp::Ordering;
+    use std::{cmp::Ordering, collections::BinaryHeap};
 
     if amount == N::zero() {
         return Ok(IndexVec::U32(Vec::new()));
@@ -373,9 +373,9 @@ where
 
     impl<N> Ord for Element<N> {
         fn cmp(&self, other: &Self) -> Ordering {
-            // partial_cmp will always produce a value,
-            // because we check that the weights are not nan
-            self.key.partial_cmp(&other.key).unwrap()
+            // unwrap() should not panic since weights should not be NaN
+            // We reverse so that BinaryHeap::peek shows the smallest item
+            self.key.partial_cmp(&other.key).unwrap().reverse()
         }
     }
 
@@ -387,12 +387,14 @@ where
 
     impl<N> Eq for Element<N> {}
 
-    let mut candidates = Vec::with_capacity(length.as_usize());
+    let mut candidates = BinaryHeap::with_capacity(amount.as_usize());
     let mut index = N::zero();
-    while index < length {
+    while index < length && candidates.len() < amount.as_usize() {
         let weight = weight(index.as_usize()).into();
         if weight > 0.0 {
-            let key = rng.random::<f64>().powf(1.0 / weight);
+            // We use the log of the key used in A-ExpJ to improve precision
+            // for small weights:
+            let key = rng.random::<f64>().ln() / weight;
             candidates.push(Element { index, key });
         } else if !(weight >= 0.0) {
             return Err(WeightError::InvalidWeight);
@@ -401,23 +403,33 @@ where
         index += N::one();
     }
 
-    let avail = candidates.len();
-    if avail < amount.as_usize() {
+    if candidates.len() < amount.as_usize() {
         return Err(WeightError::InsufficientNonZero);
     }
 
-    // Partially sort the array to find the `amount` elements with the greatest
-    // keys. Do this by using `select_nth_unstable` to put the elements with
-    // the *smallest* keys at the beginning of the list in `O(n)` time, which
-    // provides equivalent information about the elements with the *greatest* keys.
-    let (_, mid, greater) = candidates.select_nth_unstable(avail - amount.as_usize());
+    let mut x = rng.random::<f64>().ln() / candidates.peek().unwrap().key;
+    while index < length {
+        let weight = weight(index.as_usize()).into();
+        if weight > 0.0 {
+            x -= weight;
+            if x <= 0.0 {
+                let min_candidate = candidates.pop().unwrap();
+                let t = (min_candidate.key * weight).exp();
+                let key = rng.random_range(t..1.0).ln() / weight;
+                candidates.push(Element { index, key });
+
+                x = rng.random::<f64>().ln() / candidates.peek().unwrap().key;
+            }
+        } else if !(weight >= 0.0) {
+            return Err(WeightError::InvalidWeight);
+        }
 
-    let mut result: Vec<N> = Vec::with_capacity(amount.as_usize());
-    result.push(mid.index);
-    for element in greater {
-        result.push(element.index);
+        index += N::one();
     }
-    Ok(IndexVec::from(result))
+
+    Ok(IndexVec::from(
+        candidates.iter().map(|elt| elt.index).collect(),
+    ))
 }
 
 /// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
diff --git a/src/seq/slice.rs b/src/seq/slice.rs
@@ -732,17 +732,17 @@ mod test {
         use super::*;
 
         // The theoretical probabilities of the different outcomes are:
-        // AB: 0.5  * 0.5  = 0.250
-        // AC: 0.5  * 0.5  = 0.250
-        // BA: 0.25 * 0.67 = 0.167
-        // BC: 0.25 * 0.33 = 0.082
-        // CA: 0.25 * 0.67 = 0.167
-        // CB: 0.25 * 0.33 = 0.082
-        let choices = [('a', 2), ('b', 1), ('c', 1)];
+        // AB: 0.5   * 0.667 = 0.3333
+        // AC: 0.5   * 0.333 = 0.1667
+        // BA: 0.333 * 0.75  = 0.25
+        // BC: 0.333 * 0.25  = 0.0833
+        // CA: 0.167 * 0.6   = 0.1
+        // CB: 0.167 * 0.4   = 0.0667
+        let choices = [('a', 3), ('b', 2), ('c', 1)];
         let mut rng = crate::test::rng(414);
 
         let mut results = [0i32; 3];
-        let expected_results = [4167, 4167, 1666];
+        let expected_results = [5833, 2667, 1500];
         for _ in 0..10000 {
             let result = choices
                 .choose_multiple_weighted(&mut rng, 2, |item| item.1)