diff --git a/src/sampler.cpp b/src/sampler.cpp index 39e1430a38..1d1f3f608b 100644 --- a/src/sampler.cpp +++ b/src/sampler.cpp @@ -32,17 +32,14 @@ void Sampler::set_source_paths(const vector& source_paths, const vector>& transcript_expressions, const vector>& haplotype_transcripts) { if (!source_paths.empty() && !transcript_expressions.empty()) { - cerr << "error:[Sampler] cannot sample from list of paths and from list of transcripts simultaneously" << endl; - exit(1); + logger.error() << "cannot sample from list of paths and from list of transcripts simultaneously" << endl; } else if (!haplotype_transcripts.empty() && transcript_expressions.empty()) { - cerr << "error:[Sampler] cannot sample from haplotype transcripts without an expression profile" << endl; - exit(1); + logger.error() << "cannot sample from haplotype transcripts without an expression profile" << endl; } if (!source_path_ploidies.empty() && source_path_ploidies.size() != source_paths.size()) { - cerr << "error:[Sampler] cannot sample from list of paths with the wrong number of ploidy weights (" - << source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl; - exit(1); + logger.error() << "cannot sample from list of paths with the wrong number of ploidy weights (" + << source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl; } else if (!transcript_expressions.empty()) { this->source_paths.clear(); @@ -564,8 +561,8 @@ Alignment Sampler::alignment_with_error(size_t length, } } if (iter == max_tries) { - cerr << "[vg::Sampler] Warning: could not generate alignment of sufficient length in " - << max_tries << " tries. Graph may be too small, or indel rate too high." << endl; + logger.warn() << "could not generate alignment of sufficient length in " + << max_tries << " tries. Graph may be too small, or indel rate too high." << endl; } aln.set_identity(identity(aln.path())); @@ -603,10 +600,10 @@ bool Sampler::is_valid(const Alignment& aln) { auto expected_bases = graph.get_length(graph.get_handle(mapping.position().node_id())); if (accounted_bases != expected_bases) { - cerr << "[vg::Sampler] Warning: alignment mapping " << i << " accounts for " - << accounted_bases << " bases of graph sequence, but needs to account for " - << expected_bases << endl; - cerr << pb2json(aln) << endl; + logger.warn() << "alignment mapping " << i << " accounts for " + << accounted_bases << " bases of graph sequence, but needs to account for " + << expected_bases << endl + << pb2json(aln) << endl; return false; } } @@ -634,6 +631,7 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph, double fragment_length_stdev, double error_multiplier, bool retry_on_Ns, + bool use_avg_length, bool sample_unsheared_paths, uint64_t manual_seed) : AbstractReadSampler(graph) @@ -643,6 +641,7 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph, , fragment_mean(fragment_length_mean) , fragment_sd(fragment_length_stdev) , retry_on_Ns(retry_on_Ns) + , use_avg_length(use_avg_length) , strand_sampler(0, 1) , background_sampler(0, alphabet.size() - 1) , mut_sampler(0, alphabet.size() - 2) @@ -651,48 +650,42 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph, , source_paths(source_paths_input) , joint_initial_distr(manual_seed ? 1760681024122689423ull * manual_seed + 1107607255714504485ull : random_device()()) , sample_unsheared_paths(sample_unsheared_paths) + , logger(Logger("NGSSimulator")) { if (!ngs_paired_fastq_file.empty() && interleaved_fastq) { - cerr << "error:[NGSSimulator] cannot indicate interleaved FASTQ and paired FASTQs simultaneously" << endl; - exit(1); + logger.error() << "cannot indicate interleaved FASTQ and paired FASTQs simultaneously" << endl; } if (!source_paths.empty() && !transcript_expressions.empty()) { - cerr << "error:[NGSSimulator] cannot simultaneously limit sampling to paths and match an expression profile" << endl; - exit(1); + logger.error() << "cannot simultaneously limit sampling to paths and match an expression profile" << endl; } if (!source_path_ploidies.empty() && source_path_ploidies.size() != source_paths.size()) { - cerr << "error:[NGSSimulator] cannot sample from list of paths with the wrong number of ploidy weights (" - << source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl; - exit(1); + logger.error() << "cannot sample from list of paths with the wrong number of ploidy weights (" + << source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl; } if (!haplotype_transcripts.empty() && transcript_expressions.empty()) { - cerr << "error:[NGSSimulator] cannot sample from haplotype transcripts without an expression profile" << endl; - exit(1); + logger.error() << "cannot sample from haplotype transcripts without an expression profile" << endl; } if (substition_polymorphism_rate < 0.0 || substition_polymorphism_rate > 1.0 || indel_polymorphism_rate < 0.0 || indel_polymorphism_rate > 1.0 || indel_error_proportion < 0.0 || indel_error_proportion > 1.0) { - cerr << "error:[NGSSimulator] All proportions must be between 0.0 and 1.0" << endl; - exit(1); + logger.error() << "All proportions must be between 0.0 and 1.0" << endl; } if (substition_polymorphism_rate + indel_polymorphism_rate > 1.0) { - cerr << "error:[NGSSimulator] Indel polymorphism rate and substitution polymorphism rate cannot sum to greater than 1.0" << endl; - exit(1); + logger.error() << "Indel polymorphism rate and substitution polymorphism rate" + << " cannot sum to greater than 1.0" << endl; } if (fragment_length_mean <= 0.0) { - cerr << "error:[NGSSimulator] Mean fragment length must be positive" << endl; - exit(1); + logger.error() << "Mean fragment length must be positive" << endl; } if (fragment_length_stdev < 0.0) { - cerr << "error:[NGSSimulator] Fragment length standard deviation must be positive" << endl; - exit(1); + logger.error() << "Fragment length standard deviation must be positive" << endl; } @@ -858,35 +851,47 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph, // auto-detect the read length size_t modal_length = 0; size_t modal_length_count = 0; + size_t total_length = 0; size_t total_reads = 0; for (const pair& length_record : length_count) { if (length_record.second > modal_length_count) { modal_length_count = length_record.second; modal_length = length_record.first; } + total_length += length_record.first * length_record.second; total_reads += length_record.second; } + + size_t final_read_length = use_avg_length ? total_length / total_reads + : modal_length; - if (((double) modal_length_count) / total_reads < 0.5 && !sample_unsheared_paths) { - cerr << "warning:[NGSSimulator] Auto-detected read length of " << modal_length << " encompasses less than half of training reads, NGSSimulator is optimized for training data in which most reads are the same length" << endl; + if (((double) modal_length_count) / total_reads < 0.5 && !sample_unsheared_paths && !use_avg_length) { + logger.warn() << "Auto-detected read length of " << modal_length + << " encompasses less than half of training reads;" + << " NGSSimulator is optimized for training data" + << " in which most reads are the same length" << endl; } - if (modal_length > fragment_length_mean - 2.0 * fragment_length_stdev && !sample_unsheared_paths) { - cerr << "warning:[NGSSimulator] Auto-detected read length of " << modal_length << " is long compared to mean fragment length " << fragment_length_mean << " and standard deviation " << fragment_length_stdev << ". Sampling may take additional time and the statistical properties of the fragment length distribution may not reflect input parameters." << endl; + if (final_read_length > fragment_length_mean - 2.0 * fragment_length_stdev && !sample_unsheared_paths) { + logger.warn() << "Auto-detected read length of " << final_read_length + << " is long compared to mean fragment length " << fragment_length_mean + << " and standard deviation " << fragment_length_stdev + << ". Sampling may take additional time and the statistical properties" + << " of the fragment length distribution may not reflect input parameters." << endl; } - // shorten the quality string samplers until they are the modal length (this determines read length later) + // shorten the quality string samplers until they are the final length (this determines read length later) // if we're sampling unsheared paths, take the whole read - while (transition_distrs_1.size() > modal_length && !sample_unsheared_paths) { + while (transition_distrs_1.size() > final_read_length && !sample_unsheared_paths) { transition_distrs_1.pop_back(); } - while (transition_distrs_2.size() > modal_length && !sample_unsheared_paths) { + while (transition_distrs_2.size() > final_read_length && !sample_unsheared_paths) { transition_distrs_2.pop_back(); } if (transition_distrs_1.size() != transition_distrs_2.size() && transition_distrs_2.size() > 0) { - cerr << "error:[NGSSimulator] One fragment end in training data has no reads at the modal length, cannot produce joint samples" << endl; - exit(1); + logger.error() << "One fragment end in training data has no reads at the modal length;" + << " cannot produce joint samples" << endl; } finalize(); @@ -908,13 +913,11 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph, void NGSSimulator::connect_to_position_file(const string& filename) { if (source_paths.empty()) { - cerr << "warning:[NGSSimulator] path position file will not be created because not simulating from paths" << endl; - return; + logger.warn() << "path position file will not be created because not simulating from paths" << endl; } position_file.open(filename); if (!position_file) { - cerr << "error:[NGSSimulator] failed to open position file: " << filename << endl; - exit(1); + logger.error() << "failed to open position file: " << filename << endl; } position_file << "read\tpath\toffset\treverse" << endl; } diff --git a/src/sampler.hpp b/src/sampler.hpp index c36288f4de..c9472a895d 100644 --- a/src/sampler.hpp +++ b/src/sampler.hpp @@ -98,6 +98,8 @@ class Sampler: public AbstractReadSampler { vector source_paths; vg::discrete_distribution<> path_sampler; // draw an index in source_paths size_t total_seq_length = 0; + /// Error/warning logger + Logger logger; /// Make a Sampler to sample from the given graph. /// If sampling from particular paths, source_paths should contain their @@ -117,7 +119,8 @@ class Sampler: public AbstractReadSampler { forward_only(forward_only), no_Ns(!allow_Ns), nonce(0), - source_paths(source_paths) { + source_paths(source_paths), + logger(Logger("Sampler")) { // sum seq lengths graph.for_each_handle([&](const handle_t& handle) { total_seq_length += graph.get_length(handle); @@ -225,6 +228,7 @@ class NGSSimulator : public AbstractReadSampler { double fragment_length_stdev = 50.0, double error_multiplier = 1.0, bool retry_on_Ns = true, + bool use_avg_length = false, bool sample_unsheared_paths = false, uint64_t seed = 0); @@ -387,12 +391,17 @@ class NGSSimulator : public AbstractReadSampler { /// Should we try again for a read without Ns of we get Ns? const bool retry_on_Ns; + /// Should we use the mean read length instead of the mode? + const bool use_avg_length; const bool sample_unsheared_paths; /// Restrict reads to just these paths (path-only mode) if nonempty. vector source_paths; ofstream position_file; + + /// Error/warning logger + Logger logger; }; diff --git a/src/subcommand/sim_main.cpp b/src/subcommand/sim_main.cpp index db17fe7bf4..704841612e 100644 --- a/src/subcommand/sim_main.cpp +++ b/src/subcommand/sim_main.cpp @@ -114,6 +114,8 @@ void help_sim(char** argv) { << " -v, --frag-std-dev FLOAT use this standard deviation" << endl << " for fragment length estimation" << endl << " -N, --allow-Ns allow reads to be sampled with Ns in them" << endl + << " -L, --use-average-length with -F, auto-detect read length" << endl + << " as mean instead of mode" << endl << " --max-tries N attempt sampling operations up to N times [100]" << endl << " -t, --threads N number of compute threads (only when using -F) [1]" << endl << "simulate from paths:" << endl @@ -163,6 +165,7 @@ int main_sim(int argc, char** argv) { int fragment_length = 0; double fragment_std_dev = 0; bool reads_may_contain_Ns = false; + bool use_avg_length = false; size_t max_tries = 100; bool strip_bonuses = false; bool interleaved = false; @@ -222,6 +225,7 @@ int main_sim(int argc, char** argv) { {"json-out", no_argument, 0, 'J'}, {"multi-position", no_argument, 0, OPT_MULTI_POSITION}, {"allow-Ns", no_argument, 0, 'N'}, + {"use-average-length", no_argument, 0, 'L'}, {"max-tries", required_argument, 0, OPT_MAX_TRIES}, {"unsheared", no_argument, 0, 'u'}, {"sub-rate", required_argument, 0, 'e'}, @@ -236,7 +240,7 @@ int main_sim(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "h?rl:n:s:e:i:fax:qJp:v:Nud:F:P:Am:R:g:T:H:S:It:E:", + c = getopt_long (argc, argv, "h?rl:n:s:e:i:fax:qJp:v:NLud:F:P:Am:R:g:T:H:S:It:E:", long_options, &option_index); // Detect the end of the options. @@ -364,6 +368,10 @@ int main_sim(int argc, char** argv) { case 'N': reads_may_contain_Ns = true; break; + + case 'L': + use_avg_length = true; + break; case OPT_MAX_TRIES: max_tries = parse(optarg); @@ -463,6 +471,11 @@ int main_sim(int argc, char** argv) { logger.error() << "unsheared fragment option only available " << "when simulating from FASTQ-trained errors" << std::endl; } + + if (fastq_name.empty() && use_avg_length) { + logger.error() << "average length option only available " + << "when simulating from FASTQ-trained errors" << std::endl; + } // Deal with path names. Do this before we create paths to represent threads. if (any_path) { @@ -801,6 +814,9 @@ int main_sim(int argc, char** argv) { if (reads_may_contain_Ns) { opt_info << "--allow-Ns" << std::endl; } + if (use_avg_length) { + opt_info << "--use-average-length" << std::endl; + } if (max_tries != 100) { opt_info << "--max-tries" << max_tries << std::endl; } @@ -836,6 +852,7 @@ int main_sim(int argc, char** argv) { fragment_std_dev ? fragment_std_dev : 0.000001, error_scale_factor, !reads_may_contain_Ns, + use_avg_length, unsheared_fragments, seed_val)); }