@@ -6,15 +6,18 @@ use csv::ByteRecord;
6
6
use log:: info;
7
7
use rayon:: prelude:: * ;
8
8
use sage_cloudpath:: CloudPath ;
9
- use sage_core:: database:: IndexedDatabase ;
9
+ use sage_core:: database:: { IndexedDatabase , Parameters , PeptideIx } ;
10
+ use sage_core:: fasta:: Fasta ;
10
11
use sage_core:: ion_series:: Kind ;
11
12
use sage_core:: lfq:: { Peak , PrecursorId } ;
12
13
use sage_core:: mass:: Tolerance ;
14
+ use sage_core:: peptide:: Peptide ;
13
15
use sage_core:: scoring:: Fragments ;
14
16
use sage_core:: scoring:: { Feature , Scorer } ;
15
17
use sage_core:: spectrum:: { ProcessedSpectrum , SpectrumProcessor } ;
16
18
use sage_core:: tmt:: TmtQuant ;
17
19
use std:: collections:: HashMap ;
20
+ use std:: collections:: HashSet ;
18
21
use std:: time:: Instant ;
19
22
20
23
pub struct Runner {
@@ -24,7 +27,8 @@ pub struct Runner {
24
27
}
25
28
26
29
impl Runner {
27
- pub fn new ( parameters : Search ) -> anyhow:: Result < Self > {
30
+ pub fn new ( parameters : Search , parallel : usize ) -> anyhow:: Result < Self > {
31
+ let mut parameters = parameters. clone ( ) ;
28
32
let start = Instant :: now ( ) ;
29
33
let fasta = sage_cloudpath:: util:: read_fasta (
30
34
& parameters. database . fasta ,
@@ -38,7 +42,32 @@ impl Runner {
38
42
)
39
43
} ) ?;
40
44
41
- let database = parameters. database . clone ( ) . build ( fasta) ;
45
+ let database = match parameters. database . prefilter {
46
+ false => parameters. database . clone ( ) . build ( fasta) ,
47
+ true => {
48
+ parameters
49
+ . database
50
+ . auto_calculate_prefilter_chunk_size ( & fasta) ;
51
+ if parameters. database . prefilter_chunk_size >= fasta. targets . len ( ) {
52
+ parameters. database . clone ( ) . build ( fasta)
53
+ } else {
54
+ info ! (
55
+ "using {} db chunks of size {}" ,
56
+ ( fasta. targets. len( ) + parameters. database. prefilter_chunk_size - 1 )
57
+ / parameters. database. prefilter_chunk_size,
58
+ parameters. database. prefilter_chunk_size,
59
+ ) ;
60
+ let mini_runner = Self {
61
+ database : IndexedDatabase :: default ( ) ,
62
+ parameters : parameters. clone ( ) ,
63
+ start,
64
+ } ;
65
+ let peptides = mini_runner. prefilter_peptides ( parallel, fasta) ;
66
+ parameters. database . clone ( ) . build_from_peptides ( peptides)
67
+ }
68
+ }
69
+ } ;
70
+
42
71
info ! (
43
72
"generated {} fragments, {} peptides in {}ms" ,
44
73
database. fragments. len( ) ,
@@ -52,6 +81,108 @@ impl Runner {
52
81
} )
53
82
}
54
83
84
+ pub fn prefilter_peptides ( self , parallel : usize , fasta : Fasta ) -> Vec < Peptide > {
85
+ let spectra: Option < Vec < ProcessedSpectrum > > =
86
+ match parallel >= self . parameters . mzml_paths . len ( ) {
87
+ true => Some ( self . read_processed_spectra ( & self . parameters . mzml_paths , 0 , 0 ) ) ,
88
+ false => None ,
89
+ } ;
90
+ let mut all_peptides: Vec < Peptide > = fasta
91
+ . iter_chunks ( self . parameters . database . prefilter_chunk_size )
92
+ . enumerate ( )
93
+ . flat_map ( |( chunk_id, fasta_chunk) | {
94
+ let start = Instant :: now ( ) ;
95
+ info ! ( "pre-filtering fasta chunk {}" , chunk_id, ) ;
96
+ let db = & self . parameters . database . clone ( ) . build ( fasta_chunk) ;
97
+ info ! (
98
+ "generated {} fragments, {} peptides in {}ms" ,
99
+ db. fragments. len( ) ,
100
+ db. peptides. len( ) ,
101
+ ( Instant :: now( ) - start) . as_millis( )
102
+ ) ;
103
+ let scorer = Scorer {
104
+ db,
105
+ precursor_tol : self . parameters . precursor_tol ,
106
+ fragment_tol : self . parameters . fragment_tol ,
107
+ min_matched_peaks : self . parameters . min_matched_peaks ,
108
+ min_isotope_err : self . parameters . isotope_errors . 0 ,
109
+ max_isotope_err : self . parameters . isotope_errors . 1 ,
110
+ min_precursor_charge : self . parameters . precursor_charge . 0 ,
111
+ max_precursor_charge : self . parameters . precursor_charge . 1 ,
112
+ override_precursor_charge : self . parameters . override_precursor_charge ,
113
+ max_fragment_charge : self . parameters . max_fragment_charge ,
114
+ chimera : self . parameters . chimera ,
115
+ report_psms : self . parameters . report_psms + 1 ,
116
+ wide_window : self . parameters . wide_window ,
117
+ annotate_matches : self . parameters . annotate_matches ,
118
+ score_type : self . parameters . score_type ,
119
+ } ;
120
+ let peptide_idxs: HashSet < PeptideIx > = match & spectra {
121
+ Some ( spectra) => self . peptide_filter_processed_spectra ( & scorer, spectra) ,
122
+ None => self
123
+ . parameters
124
+ . mzml_paths
125
+ . chunks ( parallel)
126
+ . enumerate ( )
127
+ . flat_map ( |( chunk_idx, chunk) | {
128
+ let spectra_chunk =
129
+ self . read_processed_spectra ( chunk, chunk_idx, parallel) ;
130
+ self . peptide_filter_processed_spectra ( & scorer, & spectra_chunk)
131
+ } )
132
+ . collect ( ) ,
133
+ }
134
+ . into_iter ( )
135
+ . collect ( ) ;
136
+ let peptides: Vec < Peptide > = peptide_idxs
137
+ . into_iter ( )
138
+ . map ( |idx| db[ idx] . clone ( ) )
139
+ . collect ( ) ;
140
+ info ! (
141
+ "found {} pre-filtered peptides for fasta chunk {}" ,
142
+ peptides. len( ) ,
143
+ chunk_id,
144
+ ) ;
145
+ peptides
146
+ } )
147
+ . collect ( ) ;
148
+ Parameters :: reorder_peptides ( & mut all_peptides) ;
149
+ all_peptides
150
+ }
151
+
152
+ fn peptide_filter_processed_spectra (
153
+ & self ,
154
+ scorer : & Scorer ,
155
+ spectra : & Vec < ProcessedSpectrum > ,
156
+ ) -> Vec < PeptideIx > {
157
+ use std:: sync:: atomic:: { AtomicUsize , Ordering } ;
158
+ let counter = AtomicUsize :: new ( 0 ) ;
159
+ let start = Instant :: now ( ) ;
160
+
161
+ let peptide_idxs: Vec < _ > = spectra
162
+ . par_iter ( )
163
+ . filter ( |spec| spec. peaks . len ( ) >= self . parameters . min_peaks && spec. level == 2 )
164
+ . map ( |x| {
165
+ let prev = counter. fetch_add ( 1 , Ordering :: Relaxed ) ;
166
+ if prev > 0 && prev % 10_000 == 0 {
167
+ let duration = Instant :: now ( ) . duration_since ( start) . as_millis ( ) as usize ;
168
+
169
+ let rate = prev * 1000 / ( duration + 1 ) ;
170
+ log:: trace!( "- searched {} spectra ({} spectra/s)" , prev, rate) ;
171
+ }
172
+ x
173
+ } )
174
+ . flat_map ( |spec| {
175
+ scorer. quick_score ( spec, self . parameters . database . prefilter_low_memory )
176
+ } )
177
+ . collect ( ) ;
178
+
179
+ let duration = Instant :: now ( ) . duration_since ( start) . as_millis ( ) as usize ;
180
+ let prev = counter. load ( Ordering :: Relaxed ) ;
181
+ let rate = prev * 1000 / ( duration + 1 ) ;
182
+ log:: info!( "- search: {:8} ms ({} spectra/s)" , duration, rate) ;
183
+ peptide_idxs
184
+ }
185
+
55
186
fn spectrum_fdr ( & self , features : & mut [ Feature ] ) -> usize {
56
187
if sage_core:: ml:: linear_discriminant:: score_psms ( features, self . parameters . precursor_tol )
57
188
. is_none ( )
@@ -76,8 +207,8 @@ impl Runner {
76
207
fn search_processed_spectra (
77
208
& self ,
78
209
scorer : & Scorer ,
79
- spectra : Vec < ProcessedSpectrum > ,
80
- ) -> SageResults {
210
+ spectra : & Vec < ProcessedSpectrum > ,
211
+ ) -> Vec < Feature > {
81
212
use std:: sync:: atomic:: { AtomicUsize , Ordering } ;
82
213
let counter = AtomicUsize :: new ( 0 ) ;
83
214
let start = Instant :: now ( ) ;
@@ -102,7 +233,14 @@ impl Runner {
102
233
let prev = counter. load ( Ordering :: Relaxed ) ;
103
234
let rate = prev * 1000 / ( duration + 1 ) ;
104
235
log:: info!( "- search: {:8} ms ({} spectra/s)" , duration, rate) ;
236
+ features
237
+ }
105
238
239
+ fn complete_features (
240
+ & self ,
241
+ spectra : Vec < ProcessedSpectrum > ,
242
+ features : Vec < Feature > ,
243
+ ) -> SageResults {
106
244
let quant = self
107
245
. parameters
108
246
. quant
@@ -132,6 +270,17 @@ impl Runner {
132
270
chunk_idx : usize ,
133
271
batch_size : usize ,
134
272
) -> SageResults {
273
+ let spectra = self . read_processed_spectra ( chunk, chunk_idx, batch_size) ;
274
+ let features = self . search_processed_spectra ( scorer, & spectra) ;
275
+ self . complete_features ( spectra, features)
276
+ }
277
+
278
+ fn read_processed_spectra (
279
+ & self ,
280
+ chunk : & [ String ] ,
281
+ chunk_idx : usize ,
282
+ batch_size : usize ,
283
+ ) -> Vec < ProcessedSpectrum > {
135
284
// Read all of the spectra at once - this can help prevent memory over-consumption issues
136
285
info ! (
137
286
"processing files {} .. {} " ,
@@ -190,7 +339,7 @@ impl Runner {
190
339
let io_time = Instant :: now ( ) - start;
191
340
info ! ( "- file IO: {:8} ms" , io_time. as_millis( ) ) ;
192
341
193
- self . search_processed_spectra ( scorer , spectra)
342
+ spectra
194
343
}
195
344
196
345
pub fn batch_files ( & self , scorer : & Scorer , batch_size : usize ) -> SageResults {
0 commit comments