diff --git a/scripts/plot-pr.R b/scripts/plot-pr.R index 0fbda529781..2c6c9214bcc 100755 --- a/scripts/plot-pr.R +++ b/scripts/plot-pr.R @@ -10,11 +10,16 @@ require("tidyverse") require("ggrepel") # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { - # If the count column is not present, add i + # If the count column is not present, add it dat$count <- rep(1, nrow(dat)) } @@ -48,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names) name.lists <- name.lists[name.order] # Determine colors for aligners -bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b") -light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee") +bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1") +light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9") # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors. cursor <- 1 @@ -95,6 +100,25 @@ colors <- colors[aligner.names] # Add a bin "factor" to each row, binning float MAPQs into bins from 0 to 60 (and inclusing bins for out of range on each end) dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf)) +# We need to work out our scales +reads.per.condition <- sum(dat$count) / length(aligner.names) +# Start with small scale +labels <- c("1e-0","1e-1","1e-2","1e-3","1e-4") +breaks <- c(0,1,2,3,4) +limits <- c(0, 4) +if ( reads.per.condition > 10000 ) { + # Use big scale if there are a lot of reads + labels <- c(labels, "1e-5","1e-6") + breaks <- c(breaks, 5,6) + limits <- c(0, 6) +} +if ( reads.per.condition > 1000000 ) { + # Use big scale if there are a lot of reads + labels <- c(labels, "1e-7","1e-8","1e-9") + breaks <- c(breaks, 7,8,9) + limits <- c(0, 9) +} + # Now we break out the cool dplyr/magrittr/tidyverse tools like %>% pipe operators. dat.roc <- dat %>% # Make positive and negative count columns @@ -127,15 +151,15 @@ dat.plot <- dat.roc %>% # There will be points with variable sizes geom_point(aes(size=Positive+Negative)) + # We manually assign these selected colors - scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) + + scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) + # And we want a size legend scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) + # And we want a fake log Y axis - scale_y_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) + + scale_y_continuous(labels=labels, breaks=breaks, limits=limits) + # Label it ylab("1 - Precision") + # And we want a fake log X axis - scale_x_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) + + scale_x_continuous(labels=labels, breaks=breaks, limits=limits) + # Label it xlab("1 - Recall") + # And we want this cool theme diff --git a/scripts/plot-qq.R b/scripts/plot-qq.R index 0f5a8b7074a..434b9455805 100755 --- a/scripts/plot-qq.R +++ b/scripts/plot-qq.R @@ -2,15 +2,21 @@ # plot-qq.R [ [title]] -list.of.packages <- c("tidyverse", "ggrepel", "svglite") +list.of.packages <- c("tidyverse", "ggrepel", "svglite", "binom") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) require("tidyverse") require("ggrepel") +require("binom") # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { # If the count column is not present, add i @@ -47,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names) name.lists <- name.lists[name.order] # Determine colors for aligners -bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b") -light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee") +bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1") +light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9") # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors. cursor <- 1 @@ -93,14 +99,20 @@ colors <- colors[aligner.names] dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf)) -x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count))) +x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count), select(binom.confint(sum(correct * count), sum(count), conf.level=0.9, methods="lrt"), c("lower", "upper")))) + +print(names(x)) +print(x$ci) -dat.plot <- ggplot(x, aes(1-mapprob+1e-9, 1-observed+1e-9, color=aligner, size=N, weight=N, label=round(mapq,2))) + - scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) + - scale_y_log10("measured error", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + - scale_x_log10("error estimate", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + +# Now plot the points as different sizes, but the error bar line ranges as a consistent size +dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) + + scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) + + scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + + scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) + scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) + geom_point() + + # Only aesthetics that depend on each point need to be in the aes() mapping + geom_linerange(aes(x=1-mapprob+1e-7, ymin=1-upper+1e-7, ymax=1-lower+1e-7), linewidth=0.2, position=position_dodge(.05)) + geom_smooth() + geom_abline(intercept=0, slope=1, linetype=2) + theme_bw() diff --git a/scripts/plot-roc-log.R b/scripts/plot-roc-log.R index 54c3a653436..87321054a9f 100755 --- a/scripts/plot-roc-log.R +++ b/scripts/plot-roc-log.R @@ -20,8 +20,13 @@ require("tidyverse") require("ggrepel") # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { # If the count column is not present, add i diff --git a/scripts/plot-roc.R b/scripts/plot-roc.R index 3353f0c4d6b..657b4a9782e 100755 --- a/scripts/plot-roc.R +++ b/scripts/plot-roc.R @@ -21,8 +21,13 @@ require("ggrepel") require("scales") # For squish # Read in the combined toil-vg stats.tsv, listing: -# correct, mapq, aligner (really graph name), read name, count -dat <- read.table(commandArgs(TRUE)[1], header=T) +# correct, mapq, aligner (really graph name), read name, count, eligible +dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor")) + +if (("eligible" %in% names(dat))) { + # If the eligible column is present, remove ineligible reads + dat <- dat[dat$eligible == 1, ] +} if (! ("count" %in% names(dat))) { # If the count column is not present, add i diff --git a/src/subcommand/gamcompare_main.cpp b/src/subcommand/gamcompare_main.cpp index 7cd27270513..3000a096806 100644 --- a/src/subcommand/gamcompare_main.cpp +++ b/src/subcommand/gamcompare_main.cpp @@ -11,6 +11,7 @@ #include "subcommand.hpp" #include "../alignment.hpp" +#include "../annotation.hpp" #include "../snarl_distance_index.hpp" #include "../vg.hpp" #include @@ -257,7 +258,7 @@ int main_gamcompare(int argc, char** argv) { // Output TSV to standard out in the format plot-qq.R needs. if (!header_printed) { // It needs a header - cout << "correct\tmq\taligner\tread" << endl; + cout << "correct\tmq\taligner\tread\teligible" << endl; header_printed = true; } @@ -266,7 +267,8 @@ int main_gamcompare(int argc, char** argv) { cout << (aln.correctly_mapped() ? "1" : "0") << "\t"; cout << aln.mapping_quality() << "\t"; cout << aligner_name << "\t"; - cout << aln.name() << endl; + cout << aln.name() << "\t"; + cout << (has_annotation(aln, "no_truth") ? "0" : "1") << endl; } text_buffer.clear(); }; @@ -341,6 +343,8 @@ int main_gamcompare(int argc, char** argv) { // Annotate it as such aln.set_correctly_mapped(correctly_mapped); + // And make sure we say it was possible to get + clear_annotation(aln, "no_truth"); if (correctly_mapped) { correct_counts.at(omp_get_thread_num()) += 1; @@ -358,15 +362,25 @@ int main_gamcompare(int argc, char** argv) { correct_count_by_mapq_by_thread.at(omp_get_thread_num()).at(mapq) += 1; } } + } else if (range != -1) { + // We are flagging reads correct/incorrect, but this read has no truth position. + // Remember that it was impossible to get. + set_annotation(aln, "no_truth", true); } #pragma omp critical { if (output_tsv) { - text_buffer.emplace_back(std::move(aln)); + if (emitter) { + // Copy the alignment since we need it twice + text_buffer.emplace_back(aln); + } else { + text_buffer.emplace_back(std::move(aln)); + } if (text_buffer.size() > 1000) { flush_text_buffer(); } - } else { + } + if (emitter) { emitter->write(std::move(aln)); } } diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 44131e0b607..192d6d19839 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@64774cc76ef117aef705c1b5203450482c6a4b60" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"