Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add notion of eligible reads to GAM comparison #4220

Merged
merged 5 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 32 additions & 8 deletions scripts/plot-pr.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@ require("tidyverse")
require("ggrepel")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
# If the count column is not present, add it
dat$count <- rep(1, nrow(dat))
}

Expand Down Expand Up @@ -48,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
name.lists <- name.lists[name.order]

# Determine colors for aligners
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
# We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
cursor <- 1

Expand Down Expand Up @@ -95,6 +100,25 @@ colors <- colors[aligner.names]
# Add a bin "factor" to each row, binning float MAPQs into bins from 0 to 60 (and inclusing bins for out of range on each end)
dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))

# We need to work out our scales
reads.per.condition <- sum(dat$count) / length(aligner.names)
# Start with small scale
labels <- c("1e-0","1e-1","1e-2","1e-3","1e-4")
breaks <- c(0,1,2,3,4)
limits <- c(0, 4)
if ( reads.per.condition > 10000 ) {
# Use big scale if there are a lot of reads
labels <- c(labels, "1e-5","1e-6")
breaks <- c(breaks, 5,6)
limits <- c(0, 6)
}
if ( reads.per.condition > 1000000 ) {
# Use big scale if there are a lot of reads
labels <- c(labels, "1e-7","1e-8","1e-9")
breaks <- c(breaks, 7,8,9)
limits <- c(0, 9)
}

# Now we break out the cool dplyr/magrittr/tidyverse tools like %>% pipe operators.
dat.roc <- dat %>%
# Make positive and negative count columns
Expand Down Expand Up @@ -127,15 +151,15 @@ dat.plot <- dat.roc %>%
# There will be points with variable sizes
geom_point(aes(size=Positive+Negative)) +
# We manually assign these selected colors
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
# And we want a size legend
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
# And we want a fake log Y axis
scale_y_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
scale_y_continuous(labels=labels, breaks=breaks, limits=limits) +
# Label it
ylab("1 - Precision") +
# And we want a fake log X axis
scale_x_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
scale_x_continuous(labels=labels, breaks=breaks, limits=limits) +
# Label it
xlab("1 - Recall") +
# And we want this cool theme
Expand Down
32 changes: 22 additions & 10 deletions scripts/plot-qq.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@

# plot-qq.R <stats TSV> <destination image file> [<comma-separated "aligner" names to include> [title]]

list.of.packages <- c("tidyverse", "ggrepel", "svglite")
list.of.packages <- c("tidyverse", "ggrepel", "svglite", "binom")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
require("tidyverse")
require("ggrepel")
require("binom")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down Expand Up @@ -47,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
name.lists <- name.lists[name.order]

# Determine colors for aligners
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
# We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
cursor <- 1

Expand Down Expand Up @@ -93,14 +99,20 @@ colors <- colors[aligner.names]

dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))

x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count)))
x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count), select(binom.confint(sum(correct * count), sum(count), conf.level=0.9, methods="lrt"), c("lower", "upper"))))

print(names(x))
print(x$ci)

dat.plot <- ggplot(x, aes(1-mapprob+1e-9, 1-observed+1e-9, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_y_log10("measured error", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
# Now plot the points as different sizes, but the error bar line ranges as a consistent size
dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
geom_point() +
# Only aesthetics that depend on each point need to be in the aes() mapping
geom_linerange(aes(x=1-mapprob+1e-7, ymin=1-upper+1e-7, ymax=1-lower+1e-7), linewidth=0.2, position=position_dodge(.05)) +
geom_smooth() +
geom_abline(intercept=0, slope=1, linetype=2) +
theme_bw()
Expand Down
9 changes: 7 additions & 2 deletions scripts/plot-roc-log.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,13 @@ require("tidyverse")
require("ggrepel")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down
9 changes: 7 additions & 2 deletions scripts/plot-roc.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ require("ggrepel")
require("scales") # For squish

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down
22 changes: 18 additions & 4 deletions src/subcommand/gamcompare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "subcommand.hpp"

#include "../alignment.hpp"
#include "../annotation.hpp"
#include "../snarl_distance_index.hpp"
#include "../vg.hpp"
#include <vg/io/stream.hpp>
Expand Down Expand Up @@ -257,7 +258,7 @@ int main_gamcompare(int argc, char** argv) {
// Output TSV to standard out in the format plot-qq.R needs.
if (!header_printed) {
// It needs a header
cout << "correct\tmq\taligner\tread" << endl;
cout << "correct\tmq\taligner\tread\teligible" << endl;
header_printed = true;
}

Expand All @@ -266,7 +267,8 @@ int main_gamcompare(int argc, char** argv) {
cout << (aln.correctly_mapped() ? "1" : "0") << "\t";
cout << aln.mapping_quality() << "\t";
cout << aligner_name << "\t";
cout << aln.name() << endl;
cout << aln.name() << "\t";
cout << (has_annotation(aln, "no_truth") ? "0" : "1") << endl;
}
text_buffer.clear();
};
Expand Down Expand Up @@ -341,6 +343,8 @@ int main_gamcompare(int argc, char** argv) {

// Annotate it as such
aln.set_correctly_mapped(correctly_mapped);
// And make sure we say it was possible to get
clear_annotation(aln, "no_truth");

if (correctly_mapped) {
correct_counts.at(omp_get_thread_num()) += 1;
Expand All @@ -358,15 +362,25 @@ int main_gamcompare(int argc, char** argv) {
correct_count_by_mapq_by_thread.at(omp_get_thread_num()).at(mapq) += 1;
}
}
} else if (range != -1) {
// We are flagging reads correct/incorrect, but this read has no truth position.
// Remember that it was impossible to get.
set_annotation(aln, "no_truth", true);
}
#pragma omp critical
{
if (output_tsv) {
text_buffer.emplace_back(std::move(aln));
if (emitter) {
// Copy the alignment since we need it twice
text_buffer.emplace_back(aln);
} else {
text_buffer.emplace_back(std::move(aln));
}
if (text_buffer.size() > 1000) {
flush_text_buffer();
}
} else {
}
if (emitter) {
emitter->write(std::move(aln));
}
}
Expand Down
2 changes: 1 addition & 1 deletion vgci/vgci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0
# Should we show stdout and stderr from tests? If so, set to "-s".
SHOW_OPT=""
# What toil-vg should we install?
TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@64774cc76ef117aef705c1b5203450482c6a4b60"
TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6"
# What toil should we install?
# Could be something like "toil[aws,mesos]==3.20.0"
# or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"
Expand Down
Loading