Skip to content

Commit b082369

Browse files
committed
style: rename L to neighborhood_size (#22)
* Mainly this replaces the two `L` members in Matcher.hpp and ImputationMatcher.hpp; however, I've applied the rename to all other instances of `L` too.
1 parent 0f3205d commit b082369

9 files changed

+36
-39
lines changed

src/HMM.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@ void HMM::compute_mutation_scores(std::vector<double> bp_sizes, double mutation_
7575
std::vector<int> HMM::breakpoints(std::vector<bool> observations, int start) {
7676
// Viterbi
7777
// Initialize
78-
int L = observations.size();
79-
std::vector<unsigned short> z(L);
80-
int end = start + L;
78+
int neighborhood_size = observations.size();
79+
std::vector<unsigned short> z(neighborhood_size);
80+
int end = start + neighborhood_size;
8181
for (int i = 0; i < num_states; i++) {
8282
double score = observations[0] ? het_score[start][i] : hom_score[start][i];
8383
trellis[start][i] = score;
@@ -86,7 +86,7 @@ std::vector<int> HMM::breakpoints(std::vector<bool> observations, int start) {
8686
// Main routine
8787
double score;
8888
unsigned short running_argmax;
89-
for (int j = 1; j < L; j++) {
89+
for (int j = 1; j < neighborhood_size; j++) {
9090
for (int i = 0; i < num_states; i++) {
9191
double running_max = 0;
9292
for (int k = 0; k < num_states; k++) {
@@ -107,26 +107,26 @@ std::vector<int> HMM::breakpoints(std::vector<bool> observations, int start) {
107107
}
108108

109109
// Get best path
110-
double running_max = trellis[start + L - 1][0];
110+
double running_max = trellis[start + neighborhood_size - 1][0];
111111
unsigned short argmax = 0;
112112
for (int k = 1; k < num_states; k++) {
113-
double s = trellis[start + L - 1][k];
113+
double s = trellis[start + neighborhood_size - 1][k];
114114
if (s > running_max) {
115115
running_max = s;
116116
argmax = k;
117117
}
118118
}
119119

120120
// Traceback
121-
z[L - 1] = argmax;
122-
for (int j = L - 1; j >= 1; j--) {
121+
z[neighborhood_size - 1] = argmax;
122+
for (int j = neighborhood_size - 1; j >= 1; j--) {
123123
z[j - 1] = pointers[j + start][z[j]];
124124
}
125125

126126
// Break it up
127127
std::vector<int> breakpoints;
128128
breakpoints.push_back(0 + start);
129-
for (int j = 1; j < L; j++) {
129+
for (int j = 1; j < neighborhood_size; j++) {
130130
if (z[j - 1] != z[j]) {
131131
breakpoints.push_back(j + start);
132132
}

src/ImputationMatcher.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111

1212
ImputationMatcher::ImputationMatcher(int _n_ref, int _n_target,
1313
const std::vector<double>& _genetic_positions,
14-
double _query_interval_size, int _L)
14+
double _query_interval_size, int _neighborhood_size)
1515
: num_reference(_n_ref), num_target(_n_target), genetic_positions(_genetic_positions),
16-
query_interval_size(_query_interval_size), L(_L) {
16+
query_interval_size(_query_interval_size), neighborhood_size(_neighborhood_size) {
1717
if (genetic_positions.size() <= 2) {
1818
throw std::runtime_error("Need at least 3 sites, found " +
1919
std::to_string(genetic_positions.size()));
@@ -132,7 +132,7 @@ void ImputationMatcher::process_site(const std::vector<int>& genotype) {
132132
}
133133

134134
for (auto& sorting_twople : target_sort) {
135-
// get L-sized neighbourhood per target sample around target_sort[target_id] in ref_sorting
135+
// get a neighborhood of size neighborhood_size per target sample around target_sort[target_id] in ref_sorting
136136
int target_id = sorting_twople.first;
137137
int sorting_idx = sorting_twople.second;
138138
int insert_start;

src/ImputationMatcher.hpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,17 @@ class ImputationMatcher {
2424
// adapted to be used for imputation.
2525
// TODO: include a second pass through data here to get divergence values and not do that using
2626
// Threads-fastLS
27-
int L = 0;
27+
int neighborhood_size = 0;
2828
std::vector<int> query_sites;
2929
int num_samples = 0;
3030
int num_reference = 0;
3131
int num_target = 0;
3232
int num_sites = 0;
3333
double query_interval_size = 0.0;
3434
std::unordered_map<int, std::unordered_set<int>> match_sets;
35-
int neighborhood_size = 0;
3635
std::vector<double> genetic_positions;
3736
ImputationMatcher(int _n_ref, int _n_target, const std::vector<double>& _genetic_positions,
38-
double _query_interval_size, int _L);
37+
double _query_interval_size, int _neighborhood_size);
3938

4039
// Do all the work
4140
void process_site(const std::vector<int>& genotype);

src/Matcher.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ void MatchGroup::filter_matches(int min_matches) {
7575
}
7676
}
7777

78-
// Then determine top 4 candidates for neighbouring groups
78+
// Then determine top 4 candidates for neighboring groups
7979
top_four_maps.reserve(num_samples);
8080
for (int i = 0; i < num_samples; i++) {
8181
top_four_maps.emplace_back(std::min(4, (int) match_candidates.at(i).size()));
@@ -104,10 +104,10 @@ void MatchGroup::clear() {
104104
}
105105

106106
Matcher::Matcher(int _n, const std::vector<double>& _genetic_positions, double _query_interval_size,
107-
double _match_group_interval_size, int _L, int _min_matches)
107+
double _match_group_interval_size, int _neighborhood_size, int _min_matches)
108108
: num_samples(_n), genetic_positions(_genetic_positions),
109109
query_interval_size(_query_interval_size),
110-
match_group_interval_size(_match_group_interval_size), L(_L), min_matches(_min_matches) {
110+
match_group_interval_size(_match_group_interval_size), neighborhood_size(_neighborhood_size), min_matches(_min_matches) {
111111
if (genetic_positions.size() <= 2) {
112112
throw std::runtime_error("Need at least 3 sites, found " +
113113
std::to_string(genetic_positions.size()));
@@ -216,7 +216,7 @@ void Matcher::process_site(const std::vector<int>& genotype) {
216216
}
217217
sorting = next_sorting;
218218

219-
// Threading-neighbour queries
219+
// Threading-neighbor queries
220220
if (match_group_idx < match_group_sites.size() - 1 &&
221221
sites_processed >= match_group_sites.at(match_group_idx + 1)) {
222222
match_group_idx++;
@@ -239,17 +239,17 @@ void Matcher::process_site(const std::vector<int>& genotype) {
239239
for (int i = 1; i < num_samples; i++) {
240240
std::vector<int> matches;
241241
int allele = genotype.at(i);
242-
matches.reserve(L);
242+
matches.reserve(neighborhood_size);
243243
auto iter = threaded.insert(permutation.at(i));
244244
auto iter_up = iter.first;
245245
auto iter_down = iter.first;
246246
// Check if genotypes are identical, just to be sure
247-
while (matches.size() < L && (iter_down != threaded.begin() || iter_up != threaded.end())) {
247+
while (matches.size() < neighborhood_size && (iter_down != threaded.begin() || iter_up != threaded.end())) {
248248
if (iter_down != threaded.begin()) {
249249
iter_down--;
250250
matches.push_back(sorting.at(*iter_down));
251251
}
252-
if (matches.size() < L && iter_up != threaded.end()) {
252+
if (matches.size() < neighborhood_size && iter_up != threaded.end()) {
253253
iter_up++;
254254
if (iter_up != threaded.end()) {
255255
matches.push_back(sorting.at(*iter_up));

src/Matcher.hpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,17 @@ class Matcher {
3535
std::vector<int> permutation;
3636

3737
public:
38-
int L = 0;
38+
int neighborhood_size = 0;
3939
std::vector<int> query_sites;
4040
std::vector<int> match_group_sites;
4141
int num_samples = 0;
4242
int num_sites = 0;
4343
double query_interval_size = 0.0;
4444
// matches in these groups are considered together in the hmm
4545
double match_group_interval_size = 0.0;
46-
int neighborhood_size = 0;
4746
std::vector<double> genetic_positions;
4847
Matcher(int _n, const std::vector<double>& _genetic_positions, double _query_interval_size,
49-
double _match_group_interval_size, int _L, int _min_matches);
48+
double _match_group_interval_size, int _neighborhood_size, int _min_matches);
5049

5150
// Do all the work
5251
void process_site(const std::vector<int>& genotype);

src/ThreadsFastLS.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,12 +1017,12 @@ std::vector<std::tuple<int, std::vector<int>>> ThreadsFastLS::traceback(Tracebac
10171017
}
10181018

10191019
/**
1020-
* Similar to normal traceback, but picks the up-to L best matches and stores their overlap with the
1020+
* Similar to normal traceback, but picks the up-to neighborhood_size best matches and stores their overlap with the
10211021
* input sequence returns a list of a tuple of lists :-P I.e., a list of segments, and each segment
1022-
* is a tuple (sample_IDs, starts, ends) all of equal length <= L
1022+
* is a tuple (sample_IDs, starts, ends) all of equal length <= neighborhood_size
10231023
*/
10241024
std::vector<std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>>
1025-
ThreadsFastLS::traceback_impute(std::vector<bool>& genotypes, TracebackState* tb, Node* match, int L) {
1025+
ThreadsFastLS::traceback_impute(std::vector<bool>& genotypes, TracebackState* tb, Node* match, int neighborhood_size) {
10261026
std::vector<std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>> imputation_path;
10271027
int prev_end = num_sites;
10281028
while (tb != nullptr) {
@@ -1084,7 +1084,7 @@ ThreadsFastLS::traceback_impute(std::vector<bool>& genotypes, TracebackState* tb
10841084
std::vector<int> segment_starts;
10851085
std::vector<int> sample_ids;
10861086
std::vector<int> segment_ends;
1087-
for (int j = idx.size() - 1; j >= std::max(0, (int) (idx.size() - L)); j--) {
1087+
for (int j = idx.size() - 1; j >= std::max(0, (int) (idx.size() - neighborhood_size)); j--) {
10881088
segment_starts.push_back(segment_start);
10891089
segment_ends.push_back(segment_end);
10901090
sample_ids.push_back(div_states[idx[j]]);
@@ -1344,7 +1344,6 @@ ThreadsFastLS::threads_ls(const std::vector<bool>& genotype) {
13441344
return best_path;
13451345
}
13461346

1347-
// todo: L is no longer required here
13481347
std::tuple<std::vector<int>, std::vector<std::vector<int>>, std::vector<double>, std::vector<int>>
13491348
ThreadsFastLS::thread(const int new_sample_ID, const std::vector<bool>& genotype) {
13501349
// Compute LS path
@@ -1415,13 +1414,13 @@ ThreadsFastLS::thread(const int new_sample_ID, const std::vector<bool>& genotype
14151414
return remove_burn_in(bp_starts, target_IDs, segment_ages, het_sites);
14161415
}
14171416

1418-
std::vector<ImputationSegment> ThreadsFastLS::impute(std::vector<bool>& genotype, int L) {
1417+
std::vector<ImputationSegment> ThreadsFastLS::impute(std::vector<bool>& genotype, int neighborhood_size) {
14191418
// vector of sample_ids, seg_starts, seg_ends (buffered)
14201419
std::vector<std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>> best_path;
14211420
Node* match;
14221421
TracebackState* traceback;
14231422
std::tie(traceback, match) = fastLS(genotype);
1424-
best_path = traceback_impute(genotype, traceback, match, L);
1423+
best_path = traceback_impute(genotype, traceback, match, neighborhood_size);
14251424
traceback_states.clear();
14261425

14271426
std::vector<double> seg_ages;

src/ThreadsFastLS.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,13 @@ class ThreadsFastLS {
111111
std::vector<double>& segment_ages, std::vector<int>& het_sites);
112112
std::vector<std::tuple<int, std::vector<int>>> threads_ls(const std::vector<bool>& genotype);
113113

114-
std::vector<ImputationSegment> impute(std::vector<bool>& genotype, int L);
114+
std::vector<ImputationSegment> impute(std::vector<bool>& genotype, int neighborhood_size);
115115
std::pair<TracebackState*, Node*> fastLS(const std::vector<bool>& genotype,
116116
bool imputation = false);
117117
std::vector<std::tuple<int, std::vector<int>>> traceback(TracebackState* tb, Node* match,
118118
bool return_all = false);
119119
std::vector<std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>>
120-
traceback_impute(std::vector<bool>& genotypes, TracebackState* tb, Node* match, int L);
120+
traceback_impute(std::vector<bool>& genotypes, TracebackState* tb, Node* match, int neighborhood_size);
121121
std::array<std::pair<TracebackState*, Node*>, 2> fastLS_diploid(const std::vector<int>& genotype);
122122
std::array<std::vector<std::tuple<int, std::vector<int>>>, 2>
123123
diploid_ls(std::vector<int> unphased_genotypes);

src/threads_arg/__main__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,13 +394,13 @@ def threads_infer(pgen, map_gz, recombination_rate, demography, mutation_rate, q
394394
# There are four params here to mess with:
395395
# - query interval
396396
# - match group size
397-
# - neighbourhood size (L)
397+
# - neighborhood size
398398
# - min_matches
399399
# Keep min_matches low for small sample sizes, can be 2 up to ~1,000 but then > 3
400400
# Smaller numbers run faster and consume less memory
401401
MIN_MATCHES = 4
402-
L = 4
403-
matcher = Matcher(2 * num_samples, genetic_positions[ac_mask], query_interval, match_group_interval, L, MIN_MATCHES)
402+
neighborhood_size = 4
403+
matcher = Matcher(2 * num_samples, genetic_positions[ac_mask], query_interval, match_group_interval, neighborhood_size, MIN_MATCHES)
404404

405405
# Matching step
406406
for b in range(n_batches):

src/threads_arg_pybind.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ PYBIND11_MODULE(threads_arg_python_bindings, m) {
6363
py::class_<Matcher>(m, "Matcher")
6464
.def(py::init<int, std::vector<double>, double, double, int, int>(), "Initialize",
6565
py::arg("num_samples"), py::arg("genetic_positions"), py::arg("query_interval_size"),
66-
py::arg("match_group_interval_size"), py::arg("L") = 4, py::arg("min_matches") = 4)
66+
py::arg("match_group_interval_size"), py::arg("neighborhood_size") = 4, py::arg("min_matches") = 4)
6767
.def_readonly("query_sites", &Matcher::query_sites)
6868
.def_readonly("genetic_positions", &Matcher::genetic_positions)
6969
.def_readonly("query_interval_size", &Matcher::query_interval_size)
@@ -81,7 +81,7 @@ PYBIND11_MODULE(threads_arg_python_bindings, m) {
8181
py::class_<ImputationMatcher>(m, "ImputationMatcher")
8282
.def(py::init<int, int, const std::vector<double>&, double, int>(), "Initialize",
8383
py::arg("num_reference"), py::arg("num_target"), py::arg("genetic_positions"),
84-
py::arg("query_interval_size"), py::arg("L") = 8)
84+
py::arg("query_interval_size"), py::arg("neighborhood_size") = 8)
8585
.def("process_site", &ImputationMatcher::process_site)
8686
.def("get_matches", &ImputationMatcher::get_matches);
8787

0 commit comments

Comments
 (0)