Skip to content

Commit b0b6c41

Browse files
authored
Fix/pip likelihood crash (#42)
* Set probabilities at root to f64::min if zero Hack to avoid running into underflow when branch lengths grow to infinity and site likelihoods become 0. If at the root the individual probability of a single site is 0.0, set it to f64::MIN_POSITIVE instead to limit the likelihood to a numerical value. Needs to be tested for stability/numerical correctness. * Add test for dataset with -Inf likelihood * Test for repeated blen optimisation Added a test for repeated branch length opimisation. The test is just to make sure it runs and does not crash with -Inf likelihood. * Minor refactoring for meaningful names * Checking for x < f64::min instead of == 0.0 Replace direct float comparison to 0.0 by comparison to f64::MIN_POSITIVE instead. * Fix/input alphabet mismatch (#43) * Allow to specify alphabet for sequences * Made alphabets lazy static, remove clones Made dna and protein alphabets lazy static, made functions returning their copies public, added derive Copy, removed unnecessary alphabet clones * Add missing test data Added missing test data file that is compatible with the dna alphabet but is actually a very trimmed down proptein alignment. * Fix log message * Remove tree file from test * Change if x.abs() < f64::MIN to x.is_subnormal Changed check for single column probability from x.abs < f64::MIN_POSITIVE to a more meaningful x.is_subnormal(). This produces the same result as limiting the likelihood to be at most f64::MIN / msa_length, but reduces the number of ln() calls. * Change comment for pip likelihood correction Accidentally working on #42 as well. * Check column probs against 0.0 and is_NaN * Add test for too small column probs * Replace repeatedly computed const with lazy_static * Remove is_nan condition which could cover up bugs * Removing stray comments
1 parent ae4d5fa commit b0b6c41

File tree

15 files changed

+210
-46
lines changed

15 files changed

+210
-46
lines changed

Diff for: phylo/data/p105.msa.fa

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
>284813
2+
--------------------------------------------QRLIL-----------
3+
---------------------------------
4+
>284811
5+
-MSDL--E----HESVPK--IPNESVWPDIVYLPDFKPTFPKWQ----------------
6+
---------------------------------
7+
>284593
8+
MSSELPGE----HESIPK--IPSEAVWPDIVYLPDFKPSFPQWR----------------
9+
---------------------------------
10+
>237561
11+
-MVET--E----HESIPQ--MPNEEIWPDVNYLPDFKSSFPQWK----------------
12+
----------NDDRDHNNYNEDNIGIDKHQNM-
13+
>284591
14+
-MVDT--E----HESISP--MPTEETWPGVTALPDYKPTFPQWS-----VGTNMKSGYNA
15+
AAGPGLMMST-----------------------
16+
>284812
17+
-------ERSNC-DRISETGLPNEEVWPGVTLLQDYKSTFPRWK----------------
18+
--------------------------------F

Diff for: phylo/data/p105.newick

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
((284811:0.0000000000000002220446049250313,(284593:0.09695146373489569,(237561:0.3249811214954128,(284812:0.2932969786389556,(284813:4183.274915114282,284591:0.16527960508898415):0.16533876806006598):0.04817830194688745):0.10198380019982661):0.041663718143987165):0);

Diff for: phylo/data/p226.msa.fa

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
>5207
2+
---A
3+
>284811
4+
----
5+
>284593
6+
----
7+
>284591
8+
G-G-
9+
>284592
10+
----
11+
>284812
12+
-V-G

Diff for: phylo/src/alignment/alignment_builder.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ impl<'a> AlignmentBuilder<'a> {
9494
.collect();
9595
Record::with_attrs(rec.id(), rec.desc(), &seq)
9696
});
97-
self.seqs = Sequences::with_alphabet(new_seqs.collect(), self.seqs.alphabet().clone());
97+
self.seqs = Sequences::with_alphabet(new_seqs.collect(), *self.seqs.alphabet());
9898
}
9999

100100
fn stack_maps(msa_len: usize, map_x: &Mapping, map_y: &Mapping) -> Mapping {

Diff for: phylo/src/alignment/mod.rs

+1-4
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,7 @@ impl Alignment {
122122
records.push(Record::with_attrs(rec.id(), rec.desc(), &aligned_seq));
123123
}
124124

125-
Ok(Sequences::with_alphabet(
126-
records,
127-
self.seqs.alphabet.clone(),
128-
))
125+
Ok(Sequences::with_alphabet(records, self.seqs.alphabet))
129126
}
130127

131128
fn compile_leaf_map(&self, root: &NodeIdx, tree: &Tree) -> Result<LeafMapping> {

Diff for: phylo/src/alignment/sequences.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ impl Sequences {
117117
Sequences {
118118
s: seqs,
119119
aligned: false,
120-
alphabet: self.alphabet.clone(),
120+
alphabet: *self.alphabet(),
121121
}
122122
}
123123

Diff for: phylo/src/alphabets/mod.rs

+23-21
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ pub static AMB_CHAR: u8 = b'X';
1515
pub static GAP: u8 = b'-';
1616
pub static POSSIBLE_GAPS: &[u8] = b"_*-";
1717

18-
#[derive(Debug, PartialEq, Clone)]
18+
#[derive(Debug, PartialEq, Clone, Copy)]
1919
pub struct Alphabet {
2020
symbols: &'static [u8],
2121
ambiguous: &'static [u8],
@@ -64,35 +64,21 @@ impl Alphabet {
6464
}
6565

6666
pub fn detect_alphabet(sequences: &[Record]) -> Alphabet {
67-
let dna_alphabet = dna_alphabet();
67+
let dna_alphabet = *DNA_ALPHABET;
6868
for record in sequences.iter() {
6969
if !dna_alphabet.is_word(record.seq()) {
70-
return protein_alphabet();
70+
return *PROTEIN_ALPHABET;
7171
}
7272
}
7373
dna_alphabet
7474
}
7575

76-
pub(crate) fn dna_alphabet() -> Alphabet {
77-
Alphabet {
78-
symbols: NUCLEOTIDES,
79-
ambiguous: AMB_NUCLEOTIDES,
80-
index: &NUCLEOTIDE_INDEX,
81-
valid_symbols: &VALID_NUCLEOTIDES,
82-
char_sets: &NUCLEOTIDE_SETS,
83-
name: "DNA",
84-
}
76+
pub fn dna_alphabet() -> Alphabet {
77+
*DNA_ALPHABET
8578
}
8679

87-
pub(crate) fn protein_alphabet() -> Alphabet {
88-
Alphabet {
89-
symbols: AMINOACIDS,
90-
ambiguous: AMB_AMINOACIDS,
91-
index: &AMINOACID_INDEX,
92-
valid_symbols: &VALID_AMINOACIDS,
93-
char_sets: &AMINOACID_SETS,
94-
name: "protein",
95-
}
80+
pub fn protein_alphabet() -> Alphabet {
81+
*PROTEIN_ALPHABET
9682
}
9783

9884
lazy_static! {
@@ -120,6 +106,14 @@ lazy_static! {
120106
.cloned()
121107
.collect()
122108
};
109+
pub static ref DNA_ALPHABET: Alphabet = Alphabet {
110+
symbols: NUCLEOTIDES,
111+
ambiguous: AMB_NUCLEOTIDES,
112+
index: &NUCLEOTIDE_INDEX,
113+
valid_symbols: &VALID_NUCLEOTIDES,
114+
char_sets: &NUCLEOTIDE_SETS,
115+
name: "DNA",
116+
};
123117
}
124118

125119
fn generic_nucleotide_sets(char: u8) -> FreqVector {
@@ -168,6 +162,14 @@ lazy_static! {
168162
.cloned()
169163
.collect()
170164
};
165+
pub static ref PROTEIN_ALPHABET: Alphabet = Alphabet {
166+
symbols: AMINOACIDS,
167+
ambiguous: AMB_AMINOACIDS,
168+
index: &AMINOACID_INDEX,
169+
valid_symbols: &VALID_AMINOACIDS,
170+
char_sets: &AMINOACID_SETS,
171+
name: "protein",
172+
};
171173
}
172174

173175
fn generic_aminoacid_sets(char: u8) -> FreqVector {

Diff for: phylo/src/optimisers/blen_optimiser.rs

+3-5
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,10 @@ use argmin::solver::brent::BrentOpt;
66
use log::{debug, info};
77

88
use crate::likelihood::TreeSearchCost;
9-
use crate::optimisers::PhyloOptimisationResult;
9+
use crate::optimisers::{PhyloOptimisationResult, SingleValOptResult};
1010
use crate::tree::NodeIdx;
1111
use crate::Result;
1212

13-
use super::SingleValOptResult;
14-
1513
pub struct BranchOptimiser<C: TreeSearchCost + Display + Clone> {
1614
pub(crate) epsilon: f64,
1715
pub(crate) c: RefCell<C>,
@@ -79,7 +77,7 @@ impl<C: TreeSearchCost + Clone + Display> BranchOptimiser<C> {
7977
impl<C: TreeSearchCost + Clone + Display> BranchOptimiser<C> {
8078
pub(crate) fn optimise_branch(&mut self, branch: &NodeIdx) -> Result<SingleValOptResult> {
8179
let start_blen = self.c.borrow().tree().node(branch).blen;
82-
let (start, end) = if start_blen == 0.0 {
80+
let (min, max) = if start_blen == 0.0 {
8381
(0.0, 1.0)
8482
} else {
8583
(start_blen * 0.1, start_blen * 10.0)
@@ -88,7 +86,7 @@ impl<C: TreeSearchCost + Clone + Display> BranchOptimiser<C> {
8886
cost: &mut self.c,
8987
branch: *branch,
9088
};
91-
let gss = BrentOpt::new(start, end);
89+
let gss = BrentOpt::new(min, max);
9290
let res = Executor::new(optimiser, gss)
9391
.configure(|_| IterState::new().param(start_blen))
9492
.run()?;

Diff for: phylo/src/optimisers/blen_optimiser_tests.rs

+31-3
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@ use crate::likelihood::TreeSearchCost;
66
use crate::optimisers::BranchOptimiser;
77
use crate::phylo_info::PhyloInfoBuilder as PIB;
88
use crate::pip_model::{PIPCostBuilder as PIPCB, PIPModel};
9-
use crate::substitution_models::{dna_models::*, SubstModel, SubstitutionCostBuilder as SCB};
10-
use crate::tree::tree_parser::from_newick;
11-
9+
use crate::substitution_models::{dna_models::*, SubstModel, SubstitutionCostBuilder as SCB, WAG};
1210
use crate::tree;
11+
use crate::tree::tree_parser::from_newick;
1312

1413
#[test]
1514
fn branch_opt_likelihood_increase_pip() {
@@ -97,3 +96,32 @@ fn branch_optimiser_against_phyml() {
9796
assert_eq!(o.cost.cost(), o.final_cost);
9897
assert_eq!(c.cost(), o.final_cost);
9998
}
99+
100+
#[test]
101+
fn repeated_optimisation_limit() {
102+
// This used to create -Inf likelihoods due to too long branch lengths and the probability
103+
// turning to 0.0.
104+
// This is supposed to run and not crash, no other conditions.
105+
106+
let fldr = Path::new("./data/");
107+
let seq_file = fldr.join("p105.msa.fa");
108+
let info = PIB::new(seq_file).build().unwrap();
109+
110+
let model = PIPModel::<WAG>::new(&[], &[]);
111+
112+
let mut cost = PIPCB::new(model, info).build().unwrap();
113+
let mut prev_cost = f64::NEG_INFINITY;
114+
let mut final_cost = TreeSearchCost::cost(&cost);
115+
let max_iterations = 100;
116+
let epsilon = 1e-5;
117+
118+
let mut iterations = 0;
119+
while final_cost - prev_cost > epsilon && iterations < max_iterations {
120+
iterations += 1;
121+
prev_cost = final_cost;
122+
let branch_o = BranchOptimiser::new(cost.clone()).run().unwrap();
123+
assert!(branch_o.final_cost > branch_o.initial_cost);
124+
final_cost = branch_o.final_cost;
125+
cost = branch_o.cost;
126+
}
127+
}

Diff for: phylo/src/phylo_info/phyloinfo_builder.rs

+36-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use anyhow::bail;
55
use log::{info, warn};
66

77
use crate::alignment::{AlignmentBuilder, Sequences};
8+
use crate::alphabets::Alphabet;
89
use crate::io::{self, DataError};
910
use crate::phylo_info::PhyloInfo;
1011
use crate::tree::{build_nj_tree, Tree};
@@ -13,6 +14,7 @@ use crate::Result;
1314
pub struct PhyloInfoBuilder {
1415
sequence_file: PathBuf,
1516
tree_file: Option<PathBuf>,
17+
alphabet: Option<Alphabet>,
1618
}
1719

1820
impl PhyloInfoBuilder {
@@ -32,6 +34,7 @@ impl PhyloInfoBuilder {
3234
PhyloInfoBuilder {
3335
sequence_file,
3436
tree_file: None,
37+
alphabet: None,
3538
}
3639
}
3740

@@ -53,6 +56,7 @@ impl PhyloInfoBuilder {
5356
PhyloInfoBuilder {
5457
sequence_file,
5558
tree_file: Some(tree_file),
59+
alphabet: None,
5660
}
5761
}
5862

@@ -92,6 +96,25 @@ impl PhyloInfoBuilder {
9296
self
9397
}
9498

99+
/// Sets the tree file path for the PhyloInfoBuilder struct.
100+
/// Returns the PhyloInfoBuilder struct with the tree file path set.
101+
///
102+
/// # Arguments
103+
/// * `path` - File path to the tree newick file.
104+
///
105+
/// # Example
106+
/// ```
107+
/// use std::path::PathBuf;
108+
/// use phylo::alphabets::protein_alphabet;
109+
/// use phylo::phylo_info::PhyloInfoBuilder;
110+
/// let info = PhyloInfoBuilder::new(PathBuf::from("./data/sequences_DNA_small.fasta")).alphabet(Some(protein_alphabet())).build().unwrap();
111+
/// assert_eq!(info.msa.alphabet(), &protein_alphabet());
112+
/// ```
113+
pub fn alphabet(mut self, alphabet: Option<Alphabet>) -> PhyloInfoBuilder {
114+
self.alphabet = alphabet;
115+
self
116+
}
117+
95118
/// Builds the PhyloInfo struct from the sequence file and the tree file (if provided).
96119
/// If the provided tree file has more than one tree, only the first tree will be processed.
97120
/// If no tree file is provided, an NJ tree is built from the sequences.
@@ -119,7 +142,19 @@ impl PhyloInfoBuilder {
119142
"Reading sequences from file {}",
120143
self.sequence_file.display()
121144
);
122-
let sequences = Sequences::new(io::read_sequences_from_file(&self.sequence_file)?);
145+
let sequences = if self.alphabet.is_none() {
146+
info!("No alphabet provided, detecting alphabet from sequences");
147+
Sequences::new(io::read_sequences_from_file(&self.sequence_file)?)
148+
} else {
149+
info!(
150+
"Using provided {} alphabet",
151+
self.alphabet.as_ref().unwrap()
152+
);
153+
Sequences::with_alphabet(
154+
io::read_sequences_from_file(&self.sequence_file)?,
155+
self.alphabet.unwrap(),
156+
)
157+
};
123158
info!("{} sequence(s) read successfully", sequences.len());
124159

125160
let tree = match &self.tree_file {

Diff for: phylo/src/phylo_info/tests.rs

+16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use assert_matches::assert_matches;
55
use bio::io::fasta::Record;
66

77
use crate::alignment::Sequences;
8+
use crate::alphabets::{dna_alphabet, protein_alphabet};
89
use crate::frequencies;
910
use crate::io::{read_sequences_from_file, DataError};
1011
use crate::phylo_info::{PhyloInfo, PhyloInfoBuilder as PIB};
@@ -425,3 +426,18 @@ fn empirical_frequencies_no_aas() {
425426
epsilon = 1e-6
426427
);
427428
}
429+
430+
#[test]
431+
fn force_protein_alphabet() {
432+
// If data is severely subsampled it might look like DNA even though it's really protein
433+
let fldr = PathBuf::from("./data");
434+
let info = PIB::new(fldr.join("p226.msa.fa")).build().unwrap();
435+
assert_eq!(info.msa.alphabet(), &dna_alphabet());
436+
437+
let fldr = PathBuf::from("./data");
438+
let info = PIB::new(fldr.join("p226.msa.fa"))
439+
.alphabet(Some(protein_alphabet()))
440+
.build()
441+
.unwrap();
442+
assert_eq!(info.msa.alphabet(), &protein_alphabet());
443+
}

Diff for: phylo/src/pip_model/mod.rs

+21-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::ops::Mul;
77
use std::vec;
88

99
use anyhow::bail;
10+
use lazy_static::lazy_static;
1011
use log::warn;
1112
use nalgebra::{DMatrix, DVector};
1213

@@ -25,6 +26,10 @@ use crate::Result;
2526
// (2.0 * PI).ln() / 2.0;
2627
pub static SHIFT: f64 = 0.9189385332046727;
2728

29+
lazy_static! {
30+
pub static ref MINLOGPROB: f64 = (f64::MIN_POSITIVE).ln();
31+
}
32+
2833
fn log_factorial_shifted(n: usize) -> f64 {
2934
// An approximation using Stirling's formula, minus constant log(sqrt(2*PI)).
3035
// Has an absolute error of 3e-4 for n=2, 3e-6 for n=10, 3e-9 for n=100.
@@ -317,7 +322,22 @@ impl<Q: QMatrix> PIPCost<Q> {
317322
let root_idx = usize::from(&self.info.tree.root);
318323
let msa_length = self.info.msa.len();
319324

320-
tmp.pnu[root_idx].map(|x| x.ln()).sum() + tmp.c0_pnu[root_idx]
325+
// In certain scenarios (e.g. a completely unrelated sequence, see data/p105.msa.fa)
326+
// individual column probabilities become too close to 0.0 (become subnormal)
327+
// and the log likelihood becomes -Inf. This is mathematically reasonable, but during branch
328+
// length optimisation BrentOpt cannot handle it and proposes NaN branch lengths.
329+
// This is a workaround that sets the probability to the smallest posible positive float,
330+
// which is equivalent to restricting the log likelihood to f64::MIN.
331+
tmp.pnu[root_idx]
332+
.map(|x| {
333+
if x == 0.0 || x.is_subnormal() {
334+
*MINLOGPROB
335+
} else {
336+
x.ln()
337+
}
338+
})
339+
.sum()
340+
+ tmp.c0_pnu[root_idx]
321341
- log_factorial_shifted(msa_length)
322342
}
323343

0 commit comments

Comments
 (0)