Skip to content

Commit 48ae5d6

Browse files
committed
sampling algorithms
1 parent 026f20f commit 48ae5d6

12 files changed

+257
-22
lines changed

Chap.03/README.md

+7-6
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# 3 Random Sampling
22

3-
- Drawing from all un-sampled positions
4-
- Dictionary of sampled positions
5-
- Sorting sampling
6-
- Scanning an selecting
7-
- Heap and random keys
8-
- Reservoir sampling
3+
- Drawing from all un-sampled positions ([drawing_sampling.hpp](drawing_sampling.hpp))
4+
- Dictionary of sampled positions ([dictionary_sampling.hpp](dictionary_sampling.hpp))
5+
- Sorting sampling ([sorting_sampling.hpp](sorting_sampling.hpp))
6+
- Scanning and selecting ([scanning_sampling.hpp](scanning_sampling.hpp))
7+
- Heap and random keys ([heap_sampling.hpp](heap_sampling.hpp))
8+
- Reservoir sampling ([reservoir_sampling.hpp](reservoir_sampling.hpp))
9+

Chap.03/dictionary_sampling.hpp

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/*
2+
License: GPL-3.0
3+
Author: Gaspare Ferraro <[email protected]>
4+
*/
5+
6+
#pragma once
7+
8+
#include <vector>
9+
#include <set>
10+
#include <utility>
11+
#include <iterator>
12+
13+
/*
    Dictionary sampling: draws distinct positions of S at random, keeping a
    dictionary (ordered set) of the positions already drawn, until enough
    positions have been collected.

    S   : sequence to sample from
    m   : number of items requested; clamped to S.size()
    gen : callable returning a non-negative integer on each call

    Returns min(m, S.size()) items of S, listed in increasing position order.
    An empty S or m == 0 yields an empty vector.
*/
template <class T, class RandomNumberGenerator>
std::vector<T> dictionary_sampling(const std::vector<T>& S, size_t m, RandomNumberGenerator& gen) {
    /* Never ask for more positions than exist, otherwise the loop below cannot end. */
    const size_t wanted = m < S.size() ? m : S.size();

    /* The dictionary holds positions, not values: duplicate values in S are
       distinct candidates and T does not need to be comparable. */
    std::set<size_t> drawn;
    while (drawn.size() < wanted)
        drawn.insert(gen() % S.size());   /* not reached when S is empty (wanted == 0) */

    std::vector<T> out;
    out.reserve(wanted);
    for (size_t pos : drawn)
        out.push_back(S[pos]);
    return out;
}

Chap.03/drawing_sampling.hpp

+10-3
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,14 @@
99
#include <utility>
1010
#include <iterator>
1111

12-
template <class T>
13-
std::vector<T> drawing_sampling(const std::vector<T> S, int m) {
14-
12+
/*
    Drawing sampling: repeatedly draws one item from the not-yet-sampled part
    of a private copy of S, then moves the drawn item out of that part by
    swapping it with the last un-sampled slot.

    S   : sequence to sample from (left untouched; a copy is shuffled)
    m   : number of items requested; clamped to S.size()
    gen : callable returning a non-negative integer on each call

    Returns min(m, S.size()) items taken from distinct positions of S, in the
    order they were drawn.
*/
template <class T, class RandomNumberGenerator>
std::vector<T> drawing_sampling(const std::vector<T>& S, size_t m, RandomNumberGenerator& gen){
    /* Clamp the request: drawing more than S.size() items would make the
       un-sampled range empty and the modulo below divide by zero. */
    const size_t wanted = m < S.size() ? m : S.size();

    std::vector<T> pool(S.begin(), S.end());
    std::vector<T> out;
    out.reserve(wanted);

    for (size_t i = 0; i < wanted; ++i) {
        /* pool[0, remaining) holds the items that have not been drawn yet. */
        const size_t remaining = pool.size() - i;
        const size_t p = gen() % remaining;
        out.push_back(pool[p]);
        /* Park the drawn item just past the shrinking un-sampled range. */
        std::swap(pool[p], pool[remaining - 1]);
    }
    return out;
}

Chap.03/heap_sampling.hpp

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
License: GPL-3.0
3+
Author: Gaspare Ferraro <[email protected]>
4+
*/
5+
6+
#pragma once
7+
8+
#include <vector>
9+
#include <utility>
10+
#include <iterator>
11+
#include <queue>
12+
#include <climits>
13+
14+
/*
    Heap sampling with random keys: every item of S receives a random key in
    [0, 1) and the items with the smallest keys are kept in a max-heap of
    bounded size.

    S   : sequence to sample from
    m   : number of items requested
    gen : callable returning a non-negative integer on each call

    Returns min(m, S.size()) items of S, ordered by decreasing key (the order
    in which they leave the max-heap). An empty S or m == 0 yields an empty
    vector. T must be less-than comparable because the heap stores
    std::pair<double, T>.
*/
template <class T, class RandomNumberGenerator>
std::vector<T> heap_sampling(const std::vector<T>& S, size_t m, RandomNumberGenerator& gen) {
    std::vector<T> out;
    /* Nothing to sample: avoids touching S[0] or the top of an empty heap. */
    if (m == 0 || S.empty())
        return out;

    std::priority_queue< std::pair<double, T> > Q;
    for (const T& s : S) {
        const double r = static_cast<double>(gen() % INT_MAX) / INT_MAX;
        if (Q.size() < m) {
            /* Heap not full yet: every item is admitted (no placeholder
               entries, so short inputs never leak fake items). */
            Q.push(std::make_pair(r, s));
        } else if (r < Q.top().first) {
            /* New key beats the largest kept key: replace that entry. */
            Q.pop();
            Q.push(std::make_pair(r, s));
        }
    }

    out.reserve(Q.size());
    while (!Q.empty()) {
        out.push_back(Q.top().second);
        Q.pop();
    }
    return out;
}

Chap.03/reservoir_sampling.hpp

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
License: GPL-3.0
3+
Author: Gaspare Ferraro <[email protected]>
4+
*/
5+
6+
#pragma once
7+
8+
#include <vector>
9+
#include <utility>
10+
#include <iterator>
11+
12+
/*
    Reservoir sampling: the reservoir starts with the first items of S; each
    later item at position j (0-based) replaces a random reservoir slot with
    probability reservoir_size / (j + 1).

    S   : sequence to sample from
    m   : number of items requested; clamped to S.size()
    gen : callable returning a non-negative integer on each call

    Returns a vector of min(m, S.size()) items of S.
*/
template <class T, class RandomNumberGenerator>
std::vector<T> reservoir_sampling(const std::vector<T>& S, size_t m, RandomNumberGenerator& gen) {
    /* Clamp so that S.begin() + wanted never runs past S.end(). */
    const size_t wanted = m < S.size() ? m : S.size();
    std::vector<T> out(S.begin(), S.begin() + wanted);

    for (size_t j = wanted; j < S.size(); ++j) {
        /* S[j] is the (j+1)-th item seen, so the slot is drawn among j+1
           candidates; drawing among j would always admit the first item after
           the reservoir and divide by zero when the reservoir is empty. */
        const size_t h = gen() % (j + 1);
        if (h < wanted)
            out[h] = S[j];
    }
    return out;
}

Chap.03/scanning_sampling.hpp

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/*
2+
License: GPL-3.0
3+
Author: Gaspare Ferraro <[email protected]>
4+
*/
5+
6+
#pragma once
7+
8+
#include <vector>
9+
#include <utility>
10+
#include <iterator>
11+
12+
/*
    Scanning sampling: scans S once and selects item i with probability
    (items still needed) / (items not yet scanned), which yields exactly the
    requested number of items.

    S   : sequence to sample from
    m   : number of items requested; clamped to S.size()
    gen : callable returning a non-negative integer on each call

    Returns min(m, S.size()) items of S, in their original relative order.
*/
template <class T, class RandomNumberGenerator>
std::vector<T> scanning_sampling(const std::vector<T>& S, size_t m, RandomNumberGenerator& gen) {
    const size_t n = S.size();
    const size_t wanted = m < n ? m : n;

    std::vector<T> out;
    out.reserve(wanted);

    /* Stop scanning as soon as the sample is complete. */
    for (size_t i = 0; i < n && out.size() < wanted; ++i) {
        /* n - i items (including S[i]) are still unscanned. Drawing among
           exactly that many makes selection certain once the items needed
           equal the items left, so the sample can never come up short. */
        const size_t p = gen() % (n - i);
        if (p < wanted - out.size())
            out.push_back(S[i]);
    }
    return out;
}

Chap.03/sorting_sampling.hpp

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/*
2+
License: GPL-3.0
3+
Author: Gaspare Ferraro <[email protected]>
4+
*/
5+
6+
#pragma once
7+
8+
#include <vector>
9+
#include <set>
10+
#include <utility>
11+
#include <iterator>
12+
13+
/*
    Sorting sampling: draws a batch of random positions (with replacement),
    keeps them sorted and de-duplicated in an ordered set, and retries the
    whole batch until all drawn positions are distinct.

    S   : sequence to sample from
    m   : number of items requested; clamped to S.size()
    gen : callable returning a non-negative integer on each call

    Returns min(m, S.size()) items of S, listed in increasing position order.
    A batch succeeds only when none of its draws collide, so the number of
    retries grows quickly as m approaches S.size().
*/
template <class T, class RandomNumberGenerator>
std::vector<T> sorting_sampling(const std::vector<T>& S, size_t m, RandomNumberGenerator& gen) {
    /* More positions than S has can never be collected: clamp the request. */
    const size_t wanted = m < S.size() ? m : S.size();

    /* Positions, not values, are collected: equal values at different
       positions must count as different candidates or the loop may never end. */
    std::set<size_t> picked;
    while (picked.size() < wanted) {
        picked.clear();
        for (size_t i = 0; i < wanted; ++i)
            picked.insert(gen() % S.size());   /* not reached when S is empty */
    }

    std::vector<T> out;
    out.reserve(wanted);
    for (size_t pos : picked)
        out.push_back(S[pos]);
    return out;
}

Chap.03/test.cpp

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#include <bits/stdc++.h>
2+
#include "dictionary_sampling.hpp"
3+
#include "drawing_sampling.hpp"
4+
#include "heap_sampling.hpp"
5+
#include "reservoir_sampling.hpp"
6+
#include "scanning_sampling.hpp"
7+
#include "sorting_sampling.hpp"
8+
9+
#define NTEST 100000
10+
11+
using namespace std;
12+
13+
int main(){
14+
15+
random_device rd;
16+
mt19937 rng(rd());
17+
18+
vector<int> S;
19+
map<int, int> dictH;
20+
map<int, int> drawH;
21+
map<int, int> heapH;
22+
map<int, int> reservoirH;
23+
map<int, int> scanningH;
24+
map<int, int> sortingH;
25+
26+
for(int i=0; i<100; i++) S.push_back(i);
27+
28+
for(int i=0; i<NTEST; i++) {
29+
auto dict = dictionary_sampling<int>(S, 10, rng);
30+
auto draw = drawing_sampling<int>(S, 10, rng);
31+
auto heap = heap_sampling<int>(S, 10, rng);
32+
auto reservoir = reservoir_sampling<int>(S, 10, rng);
33+
auto scanning = scanning_sampling<int>(S, 10, rng);
34+
auto sorting = sorting_sampling<int>(S, 10, rng);
35+
36+
for(int x: dict) dictH[x]++;
37+
for(int x: draw) drawH[x]++;
38+
for(int x: heap) heapH[x]++;
39+
for(int x: reservoir) reservoirH[x]++;
40+
for(int x: scanning) scanningH[x]++;
41+
for(int x: sorting) sortingH[x]++;
42+
}
43+
44+
for(int i=0; i<100; i++)
45+
printf("[%3d] %6d %6d %6d %6d %6d %6d\n", i, dictH[i], drawH[i], heapH[i], reservoirH[i], scanningH[i], sortingH[i]);
46+
47+
}

Chap.06/benchmark

0 Bytes
Binary file not shown.

Chap.06/benchmark_doubling.cpp

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
License: GPL-3.0
3+
Author: Gaspare Ferraro <[email protected]>
4+
*/
5+
6+
#include <bits/stdc++.h>
7+
#include "binary_search_intersection.hpp"
8+
#include "merge_intersection.hpp"
9+
#include "doubling_search.hpp"
10+
#include "mutual_partitioning.hpp"
11+
12+
using namespace std;
13+
14+
/*
    Builds a sorted set of `size` distinct random integers taken from the
    closed range [min, max], using reservoir sampling over that range.

    size : number of values to produce (0 <= size <= number of values in range)
    max  : upper bound of the range (inclusive)
    min  : lower bound of the range (inclusive), defaults to 1

    If max < min the two bounds are swapped. Returns the chosen values in
    increasing order.
*/
std::vector<int> randomSet(int size, int max, int min=1) {

    if( max < min ) std::swap(max, min);

    /* Number of candidate values; the range must be able to supply `size` of them. */
    const int range = max - min + 1;
    assert( size >= 0 && range >= size );

    std::random_device rd;
    std::mt19937 rng(rd());

    /* The reservoir starts with the first `size` values of the range... */
    std::vector<int> out(size, 0);
    for(int i=0; i<size; i++)
        out[i] = min+i;

    /* ...then the i-th candidate (0-based), whose value is min+i, takes a
       random slot with probability size/(i+1). Storing min+i instead of the
       bare index keeps values inside [min, max] and free of duplicates. */
    for(int i=size; i<range; i++)
    {
        const std::mt19937::result_type j = rng() % static_cast<std::mt19937::result_type>(i + 1);
        if( j < static_cast<std::mt19937::result_type>(size) )
            out[j] = min+i;
    }

    std::sort(out.begin(), out.end());
    return out;
}
34+
35+
int main()
36+
{
37+
38+
vector<int> size = {100, 1000, 10000, 100000, 1000000, 10000000};
39+
40+
vector<int> a = randomSet(size[4], 3*size[5]);
41+
vector<int> b = randomSet(size[5], 3*size[5]);
42+
43+
for(int k=0; k<100; k++){
44+
clock_t doubling = 0;
45+
clock_t start = 0;
46+
47+
vector<int> c(b.size());
48+
c.clear();
49+
start = clock();
50+
doubling_search_intersection<int>(a, b, c.begin());
51+
doubling += clock()-start;
52+
double doublingd = (double) doubling / CLOCKS_PER_SEC;
53+
54+
printf("| %8d | %8d | %.3fs |\n", size[4], size[5], doublingd);
55+
fflush(stdout);
56+
}
57+
58+
return 0;
59+
}
60+

Chap.06/test.cpp

+10-7
Original file line numberDiff line numberDiff line change
@@ -24,41 +24,44 @@ int main()
2424

2525
vector<int> a(sa.begin(), sa.end());
2626
vector<int> b(sb.begin(), sb.end());
27-
vector<int> c(20);
27+
vector<int> c(20), d(20), e(20), f(20), g(20);
2828

2929
auto it = set_intersection(a.begin(), a.end(), b.begin(), b.end(), c.begin());
3030
c.resize(it-c.begin());
31-
32-
auto d = binary_search_intersection<int>(a, b);
33-
auto e = merge_intersection<int>(a, b);
34-
auto f = doubling_search_intersection<int>(a, b);
35-
auto g = mutual_partitioning_intersection<int>(a, b);
36-
3731
printf("stl set_intersection:\n\t");
3832
for(auto x : c)
3933
printf("%d ", x);
4034
printf("\n");
4135

36+
it = binary_search_intersection<int>(a, b, d.begin());
37+
d.resize(it-d.begin());
4238
printf("binary_search_intersection:\n\t");
4339
for(auto x : d)
4440
printf("%d ", x);
4541
printf("\n");
4642

43+
it = merge_intersection<int>(a, b, e.begin());
44+
e.resize(it-e.begin());
4745
printf("merge_intersection:\n\t");
4846
for(auto x : e)
4947
printf("%d ", x);
5048
printf("\n");
5149

50+
it = doubling_search_intersection<int>(a, b, f.begin());
51+
f.resize(it-f.begin());
5252
printf("doubling_search_intersection:\n\t");
5353
for(auto x : f)
5454
printf("%d ", x);
5555
printf("\n");
5656

57+
it = mutual_partitioning_intersection<int>(a, b, g.begin());
58+
g.resize(it-g.begin());
5759
printf("mutual_partitioning:\n\t");
5860
for(auto x : g)
5961
printf("%d ", x);
6062
printf("\n");
6163

64+
6265
return 0;
6366
}
6467

README.md

+6-6
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ Chapters:
99
[3 Random Sampling](Chap.03/)
1010
------------------
1111

12-
- Drawing from all un-sampled positions
13-
- Dictionary of sampled positions
14-
- Sorting sampling
15-
- Scanning an selecting
16-
- Heap and random keys
17-
- Reservoir sampling
12+
- Drawing from all un-sampled positions ([drawing_sampling.hpp](Chap.03/drawing_sampling.hpp))
13+
- Dictionary of sampled positions ([dictionary_sampling.hpp](Chap.03/dictionary_sampling.hpp))
14+
- Sorting sampling ([sorting_sampling.hpp](Chap.03/sorting_sampling.hpp))
15+
- Scanning and selecting ([scanning_sampling.hpp](Chap.03/scanning_sampling.hpp))
16+
- Heap and random keys ([heap_sampling.hpp](Chap.03/heap_sampling.hpp))
17+
- Reservoir sampling ([reservoir_sampling.hpp](Chap.03/reservoir_sampling.hpp))
1818

1919
[4 List Ranking](Chap.04/)
2020
------------------

0 commit comments

Comments
 (0)