Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 46 additions & 43 deletions src/sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,14 @@ void Sampler::set_source_paths(const vector<string>& source_paths,
const vector<pair<string, double>>& transcript_expressions,
const vector<tuple<string, string, size_t>>& haplotype_transcripts) {
if (!source_paths.empty() && !transcript_expressions.empty()) {
cerr << "error:[Sampler] cannot sample from list of paths and from list of transcripts simultaneously" << endl;
exit(1);
logger.error() << "cannot sample from list of paths and from list of transcripts simultaneously" << endl;
}
else if (!haplotype_transcripts.empty() && transcript_expressions.empty()) {
cerr << "error:[Sampler] cannot sample from haplotype transcripts without an expression profile" << endl;
exit(1);
logger.error() << "cannot sample from haplotype transcripts without an expression profile" << endl;
}
if (!source_path_ploidies.empty() && source_path_ploidies.size() != source_paths.size()) {
cerr << "error:[Sampler] cannot sample from list of paths with the wrong number of ploidy weights ("
<< source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl;
exit(1);
logger.error() << "cannot sample from list of paths with the wrong number of ploidy weights ("
<< source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl;
}
else if (!transcript_expressions.empty()) {
this->source_paths.clear();
Expand Down Expand Up @@ -564,8 +561,8 @@ Alignment Sampler::alignment_with_error(size_t length,
}
}
if (iter == max_tries) {
cerr << "[vg::Sampler] Warning: could not generate alignment of sufficient length in "
<< max_tries << " tries. Graph may be too small, or indel rate too high." << endl;
logger.warn() << "could not generate alignment of sufficient length in "
<< max_tries << " tries. Graph may be too small, or indel rate too high." << endl;
}
aln.set_identity(identity(aln.path()));

Expand Down Expand Up @@ -603,10 +600,10 @@ bool Sampler::is_valid(const Alignment& aln) {
auto expected_bases = graph.get_length(graph.get_handle(mapping.position().node_id()));

if (accounted_bases != expected_bases) {
cerr << "[vg::Sampler] Warning: alignment mapping " << i << " accounts for "
<< accounted_bases << " bases of graph sequence, but needs to account for "
<< expected_bases << endl;
cerr << pb2json(aln) << endl;
logger.warn() << "alignment mapping " << i << " accounts for "
<< accounted_bases << " bases of graph sequence, but needs to account for "
<< expected_bases << endl
<< pb2json(aln) << endl;
return false;
}
}
Expand Down Expand Up @@ -634,6 +631,7 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph,
double fragment_length_stdev,
double error_multiplier,
bool retry_on_Ns,
bool use_avg_length,
bool sample_unsheared_paths,
uint64_t manual_seed) :
AbstractReadSampler(graph)
Expand All @@ -643,6 +641,7 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph,
, fragment_mean(fragment_length_mean)
, fragment_sd(fragment_length_stdev)
, retry_on_Ns(retry_on_Ns)
, use_avg_length(use_avg_length)
, strand_sampler(0, 1)
, background_sampler(0, alphabet.size() - 1)
, mut_sampler(0, alphabet.size() - 2)
Expand All @@ -651,48 +650,42 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph,
, source_paths(source_paths_input)
, joint_initial_distr(manual_seed ? 1760681024122689423ull * manual_seed + 1107607255714504485ull : random_device()())
, sample_unsheared_paths(sample_unsheared_paths)
, logger(Logger("NGSSimulator"))
{
if (!ngs_paired_fastq_file.empty() && interleaved_fastq) {
cerr << "error:[NGSSimulator] cannot indicate interleaved FASTQ and paired FASTQs simultaneously" << endl;
exit(1);
logger.error() << "cannot indicate interleaved FASTQ and paired FASTQs simultaneously" << endl;
}

if (!source_paths.empty() && !transcript_expressions.empty()) {
cerr << "error:[NGSSimulator] cannot simultaneously limit sampling to paths and match an expression profile" << endl;
exit(1);
logger.error() << "cannot simultaneously limit sampling to paths and match an expression profile" << endl;
}

if (!source_path_ploidies.empty() && source_path_ploidies.size() != source_paths.size()) {
cerr << "error:[NGSSimulator] cannot sample from list of paths with the wrong number of ploidy weights ("
<< source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl;
exit(1);
logger.error() << "cannot sample from list of paths with the wrong number of ploidy weights ("
<< source_path_ploidies.size() << " vs. " << source_paths.size() << ")" << endl;
}

if (!haplotype_transcripts.empty() && transcript_expressions.empty()) {
cerr << "error:[NGSSimulator] cannot sample from haplotype transcripts without an expression profile" << endl;
exit(1);
logger.error() << "cannot sample from haplotype transcripts without an expression profile" << endl;
}

if (substition_polymorphism_rate < 0.0 || substition_polymorphism_rate > 1.0
|| indel_polymorphism_rate < 0.0 || indel_polymorphism_rate > 1.0
|| indel_error_proportion < 0.0 || indel_error_proportion > 1.0) {
cerr << "error:[NGSSimulator] All proportions must be between 0.0 and 1.0" << endl;
exit(1);
logger.error() << "All proportions must be between 0.0 and 1.0" << endl;
}

if (substition_polymorphism_rate + indel_polymorphism_rate > 1.0) {
cerr << "error:[NGSSimulator] Indel polymorphism rate and substitution polymorphism rate cannot sum to greater than 1.0" << endl;
exit(1);
logger.error() << "Indel polymorphism rate and substitution polymorphism rate"
<< " cannot sum to greater than 1.0" << endl;
}

if (fragment_length_mean <= 0.0) {
cerr << "error:[NGSSimulator] Mean fragment length must be positive" << endl;
exit(1);
logger.error() << "Mean fragment length must be positive" << endl;
}

if (fragment_length_stdev < 0.0) {
cerr << "error:[NGSSimulator] Fragment length standard deviation must be positive" << endl;
exit(1);
logger.error() << "Fragment length standard deviation must be positive" << endl;
}


Expand Down Expand Up @@ -858,35 +851,47 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph,
// auto-detect the read length
size_t modal_length = 0;
size_t modal_length_count = 0;
size_t total_length = 0;
size_t total_reads = 0;
for (const pair<size_t, size_t>& length_record : length_count) {
if (length_record.second > modal_length_count) {
modal_length_count = length_record.second;
modal_length = length_record.first;
}
total_length += length_record.first * length_record.second;
total_reads += length_record.second;
}

size_t final_read_length = use_avg_length ? total_length / total_reads
: modal_length;

if (((double) modal_length_count) / total_reads < 0.5 && !sample_unsheared_paths) {
cerr << "warning:[NGSSimulator] Auto-detected read length of " << modal_length << " encompasses less than half of training reads, NGSSimulator is optimized for training data in which most reads are the same length" << endl;
if (((double) modal_length_count) / total_reads < 0.5 && !sample_unsheared_paths && !use_avg_length) {
logger.warn() << "Auto-detected read length of " << modal_length
<< " encompasses less than half of training reads;"
<< " NGSSimulator is optimized for training data"
<< " in which most reads are the same length" << endl;
}

if (modal_length > fragment_length_mean - 2.0 * fragment_length_stdev && !sample_unsheared_paths) {
cerr << "warning:[NGSSimulator] Auto-detected read length of " << modal_length << " is long compared to mean fragment length " << fragment_length_mean << " and standard deviation " << fragment_length_stdev << ". Sampling may take additional time and the statistical properties of the fragment length distribution may not reflect input parameters." << endl;
if (final_read_length > fragment_length_mean - 2.0 * fragment_length_stdev && !sample_unsheared_paths) {
logger.warn() << "Auto-detected read length of " << final_read_length
<< " is long compared to mean fragment length " << fragment_length_mean
<< " and standard deviation " << fragment_length_stdev
<< ". Sampling may take additional time and the statistical properties"
<< " of the fragment length distribution may not reflect input parameters." << endl;
}

// shorten the quality string samplers until they are the modal length (this determines read length later)
// shorten the quality string samplers until they are the final length (this determines read length later)
// if we're sampling unsheared paths, take the whole read
while (transition_distrs_1.size() > modal_length && !sample_unsheared_paths) {
while (transition_distrs_1.size() > final_read_length && !sample_unsheared_paths) {
transition_distrs_1.pop_back();
}
while (transition_distrs_2.size() > modal_length && !sample_unsheared_paths) {
while (transition_distrs_2.size() > final_read_length && !sample_unsheared_paths) {
transition_distrs_2.pop_back();
}

if (transition_distrs_1.size() != transition_distrs_2.size() && transition_distrs_2.size() > 0) {
cerr << "error:[NGSSimulator] One fragment end in training data has no reads at the modal length, cannot produce joint samples" << endl;
exit(1);
logger.error() << "One fragment end in training data has no reads at the modal length;"
<< " cannot produce joint samples" << endl;
}

finalize();
Expand All @@ -908,13 +913,11 @@ NGSSimulator::NGSSimulator(PathPositionHandleGraph& graph,

void NGSSimulator::connect_to_position_file(const string& filename) {
if (source_paths.empty()) {
cerr << "warning:[NGSSimulator] path position file will not be created because not simulating from paths" << endl;
return;
logger.warn() << "path position file will not be created because not simulating from paths" << endl;
}
position_file.open(filename);
if (!position_file) {
cerr << "error:[NGSSimulator] failed to open position file: " << filename << endl;
exit(1);
logger.error() << "failed to open position file: " << filename << endl;
}
position_file << "read\tpath\toffset\treverse" << endl;
}
Expand Down
11 changes: 10 additions & 1 deletion src/sampler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ class Sampler: public AbstractReadSampler {
vector<string> source_paths;
vg::discrete_distribution<> path_sampler; // draw an index in source_paths
size_t total_seq_length = 0;
/// Error/warning logger
Logger logger;

/// Make a Sampler to sample from the given graph.
/// If sampling from particular paths, source_paths should contain their
Expand All @@ -117,7 +119,8 @@ class Sampler: public AbstractReadSampler {
forward_only(forward_only),
no_Ns(!allow_Ns),
nonce(0),
source_paths(source_paths) {
source_paths(source_paths),
logger(Logger("Sampler")) {
// sum seq lengths
graph.for_each_handle([&](const handle_t& handle) {
total_seq_length += graph.get_length(handle);
Expand Down Expand Up @@ -225,6 +228,7 @@ class NGSSimulator : public AbstractReadSampler {
double fragment_length_stdev = 50.0,
double error_multiplier = 1.0,
bool retry_on_Ns = true,
bool use_avg_length = false,
bool sample_unsheared_paths = false,
uint64_t seed = 0);

Expand Down Expand Up @@ -387,12 +391,17 @@ class NGSSimulator : public AbstractReadSampler {

/// Should we try again for a read without Ns of we get Ns?
const bool retry_on_Ns;
/// Should we use the mean read length instead of the mode?
const bool use_avg_length;
const bool sample_unsheared_paths;

/// Restrict reads to just these paths (path-only mode) if nonempty.
vector<string> source_paths;

ofstream position_file;

/// Error/warning logger
Logger logger;
};


Expand Down
19 changes: 18 additions & 1 deletion src/subcommand/sim_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ void help_sim(char** argv) {
<< " -v, --frag-std-dev FLOAT use this standard deviation" << endl
<< " for fragment length estimation" << endl
<< " -N, --allow-Ns allow reads to be sampled with Ns in them" << endl
<< " -L, --use-average-length with -F, auto-detect read length" << endl
<< " as mean instead of mode" << endl
<< " --max-tries N attempt sampling operations up to N times [100]" << endl
<< " -t, --threads N number of compute threads (only when using -F) [1]" << endl
<< "simulate from paths:" << endl
Expand Down Expand Up @@ -163,6 +165,7 @@ int main_sim(int argc, char** argv) {
int fragment_length = 0;
double fragment_std_dev = 0;
bool reads_may_contain_Ns = false;
bool use_avg_length = false;
size_t max_tries = 100;
bool strip_bonuses = false;
bool interleaved = false;
Expand Down Expand Up @@ -222,6 +225,7 @@ int main_sim(int argc, char** argv) {
{"json-out", no_argument, 0, 'J'},
{"multi-position", no_argument, 0, OPT_MULTI_POSITION},
{"allow-Ns", no_argument, 0, 'N'},
{"use-average-length", no_argument, 0, 'L'},
{"max-tries", required_argument, 0, OPT_MAX_TRIES},
{"unsheared", no_argument, 0, 'u'},
{"sub-rate", required_argument, 0, 'e'},
Expand All @@ -236,7 +240,7 @@ int main_sim(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "h?rl:n:s:e:i:fax:qJp:v:Nud:F:P:Am:R:g:T:H:S:It:E:",
c = getopt_long (argc, argv, "h?rl:n:s:e:i:fax:qJp:v:NLud:F:P:Am:R:g:T:H:S:It:E:",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -364,6 +368,10 @@ int main_sim(int argc, char** argv) {
case 'N':
reads_may_contain_Ns = true;
break;

case 'L':
use_avg_length = true;
break;

case OPT_MAX_TRIES:
max_tries = parse<size_t>(optarg);
Expand Down Expand Up @@ -463,6 +471,11 @@ int main_sim(int argc, char** argv) {
logger.error() << "unsheared fragment option only available "
<< "when simulating from FASTQ-trained errors" << std::endl;
}

if (fastq_name.empty() && use_avg_length) {
logger.error() << "average length option only available "
<< "when simulating from FASTQ-trained errors" << std::endl;
}

// Deal with path names. Do this before we create paths to represent threads.
if (any_path) {
Expand Down Expand Up @@ -801,6 +814,9 @@ int main_sim(int argc, char** argv) {
if (reads_may_contain_Ns) {
opt_info << "--allow-Ns" << std::endl;
}
if (use_avg_length) {
opt_info << "--use-average-length" << std::endl;
}
if (max_tries != 100) {
opt_info << "--max-tries" << max_tries << std::endl;
}
Expand Down Expand Up @@ -836,6 +852,7 @@ int main_sim(int argc, char** argv) {
fragment_std_dev ? fragment_std_dev : 0.000001,
error_scale_factor,
!reads_may_contain_Ns,
use_avg_length,
unsheared_fragments,
seed_val));
}
Expand Down
Loading