Skip to content

Commit 9b84a11

Browse files
committed
Speed up open/chimeric searching 🚀
- Rework isotope errors, precursor filtering interally - Use u8 for internal residue representation, instead of char
1 parent 3a0c25c commit 9b84a11

17 files changed

+167
-140
lines changed

.github/workflows/build.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ jobs:
7979
sage_version=${{ github.ref_name }}
8080
staging="sage-$sage_version-${{ matrix.target }}"
8181
mkdir -p "$staging"
82-
cp {README.md,LICENSE} "$staging/"
82+
cp {README.md,DOCS.md,LICENSE} "$staging/"
8383
cp "target/${{ matrix.target }}/release/sage.exe" "$staging/"
8484
7z a "$staging.zip" "$staging"
8585
echo "ASSET=$staging.zip" >> $GITHUB_ENV

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7+
## [0.12.0]
8+
### Added
9+
- Add `wide_window` option to configuration file. This option turns off `precursor_tol`, instead using the isolation window written in the mzML file.
10+
### Changed
11+
- Changed internal calculation of precursor tolerances when searching with `isotope_errors`. The new version should be more accurate. This change also enables a significant boost to search speed for open searches.
12+
713
## [0.11.2]
814
### Added
915
- Add rank & charge features to LDA

Cargo.lock

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@ members = [
66
]
77

88
[profile.release]
9-
#lto = "fat"
10-
#codegen-units = 1
9+
lto = "fat"
10+
codegen-units = 1
1111
panic = "abort"

DOCS.md

+2
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ Example:
156156
"isotope_errors": [-1, 3]
157157
```
158158

159+
**NOTE**: Searching with isotope errors is slower than searching with a wider precursor tolerance that encompasses the isotope errors, e.g. `"da": [-3.5, 1.25]`. Using the wider precursor tolerance will generally increase the number of confidently identified PSMs as well.
160+
159161
## Other Settings
160162

161163
Note on the settings below:

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ For additional information about configuration options and output file formats,
245245
],
246246
"deisotope": false, // Optional[bool] {default=false}: perform deisotoping and charge state deconvolution
247247
"chimera": false, // Optional[bool] {default=false}: search for chimeric/co-fragmenting PSMS
248-
"wide_window": false, // Optional[bool] {default=false}: ignore `precursor_tol` and search in wide-window/DIA mode
248+
"wide_window": false, // Optional[bool] {default=false}: _ignore_ `precursor_tol` and search in wide-window/DIA mode
249249
"predict_rt": false, // Optional[bool] {default=true}: use retention time prediction model as an feature for LDA
250250
"min_peaks": 15, // Optional[int] {default=15}: only process MS2 spectra with at least N peaks
251251
"max_peaks": 150, // Optional[int] {default=150}: take the top N most intense MS2 peaks to search,

crates/sage-cli/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "sage-cli"
3-
version = "0.11.2"
3+
version = "0.12.0"
44
authors = ["Michael Lazear <[email protected]"]
55
edition = "2021"
66
rust-version = "1.62"

crates/sage/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "sage-core"
3-
version = "0.11.2"
3+
version = "0.12.0"
44
authors = ["Michael Lazear <[email protected]"]
55
edition = "2021"
66
rust-version = "1.62"

crates/sage/src/database.rs

+22-15
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use crate::enzyme::{Enzyme, EnzymeParameters};
22
use crate::fasta::Fasta;
33
use crate::ion_series::{IonSeries, Kind};
4-
use crate::mass::{Tolerance, NEUTRON};
4+
use crate::mass::Tolerance;
55
use crate::peptide::Peptide;
66
use dashmap::DashMap;
77
use fnv::FnvBuildHasher;
@@ -92,7 +92,7 @@ impl Builder {
9292
let mut output = HashMap::new();
9393
if let Some(input) = input {
9494
for (ch, mass) in input {
95-
if crate::mass::VALID_AA.contains(&ch) || "^$[]".contains(ch) {
95+
if crate::mass::VALID_AA.contains(&(ch as u8)) || "^$[]".contains(ch) {
9696
output.insert(ch, mass);
9797
} else {
9898
error!(
@@ -359,25 +359,21 @@ impl IndexedDatabase {
359359
precursor_mass: f32,
360360
precursor_tol: Tolerance,
361361
fragment_tol: Tolerance,
362-
min_isotope_err: i8,
363-
max_isotope_err: i8,
364362
) -> IndexedQuery<'_> {
365363
let (precursor_lo, precursor_hi) = precursor_tol.bounds(precursor_mass);
366364

367365
let (pre_idx_lo, pre_idx_hi) = binary_search_slice(
368366
&self.peptides,
369367
|p, bounds| p.monoisotopic.total_cmp(bounds),
370-
precursor_lo - (NEUTRON * max_isotope_err as f32).abs(),
371-
precursor_hi + (NEUTRON * min_isotope_err as f32).abs(),
368+
precursor_lo,
369+
precursor_hi,
372370
);
373371

374372
IndexedQuery {
375373
db: self,
376374
precursor_mass,
377375
precursor_tol,
378376
fragment_tol,
379-
min_isotope_err,
380-
max_isotope_err,
381377
pre_idx_lo,
382378
pre_idx_hi,
383379
}
@@ -421,8 +417,6 @@ pub struct IndexedQuery<'d> {
421417
precursor_mass: f32,
422418
precursor_tol: Tolerance,
423419
fragment_tol: Tolerance,
424-
min_isotope_err: i8,
425-
max_isotope_err: i8,
426420
pub pre_idx_lo: usize,
427421
pub pre_idx_hi: usize,
428422
}
@@ -464,11 +458,24 @@ impl<'d> IndexedQuery<'d> {
464458

465459
// Finally, filter down our slice into exact matches only
466460
slice[inner_left..inner_right].iter().filter(move |frag| {
467-
let neutral = self.db[frag.peptide_index].monoisotopic;
468-
(self.min_isotope_err..=self.max_isotope_err).any(|isotope_err| {
469-
let delta = isotope_err as f32 * NEUTRON;
470-
(neutral >= precursor_lo - delta) && (neutral <= precursor_hi - delta)
471-
}) && frag.fragment_mz >= fragment_lo
461+
// This looks somewhat complicated, but it's a consequence of
462+
// how the `binary_search_slice` function works - it will return
463+
// the set of indices that maximally cover the desired range - the exact
464+
// `left` and `right` indices may be valid, or just outside of the range.
465+
// Anything interior of `left` and `right` is guaranteed to be within the
466+
// precursor tolerance, so we just need to check the edge cases
467+
//
468+
// Previously, a direct lookup to check the mass of the current fragment was
469+
// performed, but the pointer indirection + float comparison can slow down
470+
// open searches by as much as 2x!!
471+
// e.g. used to be `self.db[frag.peptide_index].monoisotopic >= precursor_lo`
472+
(frag.peptide_index.0 > self.pre_idx_lo as u32
473+
|| (frag.peptide_index.0 == self.pre_idx_lo as u32
474+
&& self.db[frag.peptide_index].monoisotopic >= precursor_lo))
475+
&& (frag.peptide_index.0 < self.pre_idx_hi as u32
476+
|| (frag.peptide_index.0 == self.pre_idx_hi as u32
477+
&& self.db[frag.peptide_index].monoisotopic <= precursor_hi))
478+
&& frag.fragment_mz >= fragment_lo
472479
&& frag.fragment_mz <= fragment_hi
473480
})
474481
})

crates/sage/src/enzyme.rs

+4-2
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,15 @@ pub struct Enzyme {
9494
impl Enzyme {
9595
pub fn new(cleave: &str, skip_suffix: Option<char>, c_terminal: bool) -> Option<Self> {
9696
assert!(
97-
cleave.chars().all(|x| VALID_AA.contains(&x)) || cleave == "$",
97+
cleave.chars().all(|x| VALID_AA.contains(&(x as u8))) || cleave == "$",
9898
"Enzyme cleavage sequence contains non-amino acid characters: {}",
9999
cleave
100100
);
101101

102102
assert!(
103-
skip_suffix.map(|x| VALID_AA.contains(&x)).unwrap_or(true),
103+
skip_suffix
104+
.map(|x| VALID_AA.contains(&(x as u8)))
105+
.unwrap_or(true),
104106
"Enzyme cleavage restriction is non-amino acid character: {}",
105107
skip_suffix.unwrap(),
106108
);

crates/sage/src/lfq.rs

+10-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use crate::database::{binary_search_slice, IndexedDatabase, PeptideIx};
2-
use crate::mass::{Tolerance, NEUTRON};
2+
use crate::mass::{Composition, Mass, Tolerance, NEUTRON};
33
use crate::ml::{matrix::Matrix, retention_alignment::Alignment};
44
use crate::scoring::Feature;
55
use crate::spectrum::ProcessedSpectrum;
@@ -218,8 +218,15 @@ impl FeatureMap {
218218
.entry((entry.peptide, entry.decoy))
219219
.or_insert_with(|| {
220220
let p = &db[entry.peptide];
221-
let dist =
222-
crate::isotopes::peptide_isotopes(p.carbons, p.sulfurs);
221+
let composition = p
222+
.sequence
223+
.iter()
224+
.map(|r| r.composition())
225+
.sum::<Composition>();
226+
let dist = crate::isotopes::peptide_isotopes(
227+
composition.carbon,
228+
composition.sulfur,
229+
);
223230
Grid::new(entry, RT_TOL, dist, alignments.len(), GRID_SIZE)
224231
});
225232

crates/sage/src/mass.rs

+55-55
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ pub trait Mass {
5757
#[derive(Clone, Debug, PartialEq, PartialOrd, Serialize)]
5858
pub enum Residue {
5959
// Standard amino acid residue
60-
Just(char),
60+
Just(u8),
6161
// Amino acid residue with a mass modification
62-
Mod(char, f32),
62+
Mod(u8, f32),
6363
}
6464

6565
impl Mass for Residue {
@@ -78,65 +78,65 @@ impl Mass for Residue {
7878
}
7979
}
8080

81-
pub const VALID_AA: [char; 22] = [
82-
'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W',
83-
'Y', 'U', 'O',
81+
pub const VALID_AA: [u8; 22] = [
82+
b'A', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'K', b'L', b'M', b'N', b'P', b'Q', b'R', b'S',
83+
b'T', b'V', b'W', b'Y', b'U', b'O',
8484
];
8585

86-
impl Mass for char {
86+
impl Mass for u8 {
8787
fn monoisotopic(&self) -> f32 {
8888
match self {
89-
'A' => 71.03711,
90-
'R' => 156.1011,
91-
'N' => 114.04293,
92-
'D' => 115.02694,
93-
'C' => 103.00919,
94-
'E' => 129.04259,
95-
'Q' => 128.05858,
96-
'G' => 57.02146,
97-
'H' => 137.05891,
98-
'I' => 113.08406,
99-
'L' => 113.08406,
100-
'K' => 128.09496,
101-
'M' => 131.0405,
102-
'F' => 147.0684,
103-
'P' => 97.05276,
104-
'S' => 87.03203,
105-
'T' => 101.04768,
106-
'W' => 186.07931,
107-
'Y' => 163.06333,
108-
'V' => 99.06841,
109-
'U' => 150.95363,
110-
'O' => 237.14773,
111-
_ => unreachable!("BUG: invalid amino acid {}", self),
89+
b'A' => 71.03711,
90+
b'R' => 156.1011,
91+
b'N' => 114.04293,
92+
b'D' => 115.02694,
93+
b'C' => 103.00919,
94+
b'E' => 129.04259,
95+
b'Q' => 128.05858,
96+
b'G' => 57.02146,
97+
b'H' => 137.05891,
98+
b'I' => 113.08406,
99+
b'L' => 113.08406,
100+
b'K' => 128.09496,
101+
b'M' => 131.0405,
102+
b'F' => 147.0684,
103+
b'P' => 97.05276,
104+
b'S' => 87.03203,
105+
b'T' => 101.04768,
106+
b'W' => 186.07931,
107+
b'Y' => 163.06333,
108+
b'V' => 99.06841,
109+
b'U' => 150.95363,
110+
b'O' => 237.14773,
111+
_ => unreachable!("BUG: invalid amino acid {}", *self as char),
112112
}
113113
}
114114

115115
fn composition(&self) -> Composition {
116116
match self {
117-
'A' => Composition::new(3, 2, 0),
118-
'R' => Composition::new(6, 2, 0),
119-
'N' => Composition::new(4, 3, 0),
120-
'D' => Composition::new(4, 4, 0),
121-
'C' => Composition::new(3, 2, 1),
122-
'E' => Composition::new(5, 4, 0),
123-
'Q' => Composition::new(5, 3, 0),
124-
'G' => Composition::new(2, 2, 0),
125-
'H' => Composition::new(6, 2, 0),
126-
'I' => Composition::new(6, 2, 0),
127-
'L' => Composition::new(6, 2, 0),
128-
'K' => Composition::new(6, 2, 0),
129-
'M' => Composition::new(5, 2, 1),
130-
'F' => Composition::new(9, 2, 0),
131-
'P' => Composition::new(5, 2, 0),
132-
'S' => Composition::new(3, 3, 0),
133-
'T' => Composition::new(4, 3, 0),
134-
'W' => Composition::new(11, 2, 0),
135-
'Y' => Composition::new(9, 3, 0),
136-
'V' => Composition::new(5, 2, 0),
137-
'U' => Composition::new(3, 2, 0),
138-
'O' => Composition::new(12, 3, 0),
139-
_ => unreachable!("BUG: invalid amino acid {}", self),
117+
b'A' => Composition::new(3, 2, 0),
118+
b'R' => Composition::new(6, 2, 0),
119+
b'N' => Composition::new(4, 3, 0),
120+
b'D' => Composition::new(4, 4, 0),
121+
b'C' => Composition::new(3, 2, 1),
122+
b'E' => Composition::new(5, 4, 0),
123+
b'Q' => Composition::new(5, 3, 0),
124+
b'G' => Composition::new(2, 2, 0),
125+
b'H' => Composition::new(6, 2, 0),
126+
b'I' => Composition::new(6, 2, 0),
127+
b'L' => Composition::new(6, 2, 0),
128+
b'K' => Composition::new(6, 2, 0),
129+
b'M' => Composition::new(5, 2, 1),
130+
b'F' => Composition::new(9, 2, 0),
131+
b'P' => Composition::new(5, 2, 0),
132+
b'S' => Composition::new(3, 3, 0),
133+
b'T' => Composition::new(4, 3, 0),
134+
b'W' => Composition::new(11, 2, 0),
135+
b'Y' => Composition::new(9, 3, 0),
136+
b'V' => Composition::new(5, 2, 0),
137+
b'U' => Composition::new(3, 2, 0),
138+
b'O' => Composition::new(12, 3, 0),
139+
_ => unreachable!("BUG: invalid amino acid {}", *self as char),
140140
}
141141
}
142142
}
@@ -167,12 +167,12 @@ impl Sum for Composition {
167167
impl std::fmt::Display for Residue {
168168
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169169
match self {
170-
Residue::Just(c) => f.write_char(*c),
170+
Residue::Just(c) => f.write_char(*c as char),
171171
Residue::Mod(c, m) => {
172172
if m.is_sign_positive() {
173-
write!(f, "{}[+{}]", c, m)
173+
write!(f, "{}[+{}]", *c as char, m)
174174
} else {
175-
write!(f, "{}[{}]", c, m)
175+
write!(f, "{}[{}]", *c as char, m)
176176
}
177177
}
178178
}

crates/sage/src/ml/linear_discriminant.rs

+3-4
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,9 @@ pub fn score_psms(scores: &mut [Feature]) -> Option<()> {
140140
(perc.delta_next).ln_1p(),
141141
(perc.delta_best).ln_1p(),
142142
(perc.delta_mass as f64).ln_1p(),
143-
perc.isotope_error as f64,
144-
perc.average_ppm as f64,
145-
poisson,
143+
(perc.isotope_error as f64),
144+
(perc.average_ppm as f64),
145+
(poisson),
146146
(perc.matched_intensity_pct as f64).ln_1p(),
147147
(perc.matched_peaks as f64),
148148
(perc.longest_b as f64).ln_1p(),
@@ -162,7 +162,6 @@ pub fn score_psms(scores: &mut [Feature]) -> Option<()> {
162162
.map(|sc| sc.label == -1)
163163
.collect::<Vec<_>>();
164164
let features = Matrix::new(features, scores.len(), FEATURES);
165-
166165
let lda = LinearDiscriminantAnalysis::train(&features, &decoys)?;
167166
if !lda.eigenvector.iter().all(|f| f.is_finite()) {
168167
log::error!(

crates/sage/src/ml/retention_model.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ impl RetentionModel {
4747
crate::mass::Residue::Just(c) => c,
4848
crate::mass::Residue::Mod(c, _) => c,
4949
};
50-
let idx = map[((*c) as u8 - b'A') as usize];
50+
let idx = map[(c - b'A') as usize];
5151
embedding[idx] += 1.0;
5252
// Embed N- and C-terminal AA's (2 on each end, excluding K/R)
5353
match aa_idx {
@@ -67,7 +67,7 @@ impl RetentionModel {
6767
// Create a mapping from amino acid character to vector embedding
6868
let mut map = [0; 26];
6969
for (idx, aa) in VALID_AA.iter().enumerate() {
70-
map[((*aa as u8) - b'A') as usize] = idx;
70+
map[(aa - b'A') as usize] = idx;
7171
}
7272

7373
let rt = training_set

0 commit comments

Comments
 (0)