Skip to content

Commit

Permalink
Merge branch 'vgteam:master' into annotate_subpath
Browse files Browse the repository at this point in the history
  • Loading branch information
parsaeskandar authored Mar 1, 2024
2 parents a03ff22 + 67c20e9 commit 512f201
Show file tree
Hide file tree
Showing 29 changed files with 976 additions and 472 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Please cite:
* [The VG Paper](https://doi.org/10.1038/nbt.4227) when using `vg`
* [The VG Giraffe Paper](https://doi.org/10.1126/science.abg8871) when using `vg giraffe`
* [The VG Call Paper](https://doi.org/10.1186/s13059-020-1941-7) when SV genotyping with `vg call`
* [The GBZ Paper](https://doi.org/10.1093/bioinformatics/btad097) when using GBZ
* [The GBZ Paper](https://doi.org/10.1093/bioinformatics/btac656) when using GBZ
* [The HPRC Paper](https://doi.org/10.1038/s41586-023-05896-x) when using `vg deconstruct`
* [The Snarls Paper](https://doi.org/10.1089/cmb.2017.0251) when using `vg snarls`
* [The Personalized Pangenome Paper](https://doi.org/10.1101/2023.12.13.571553) when using `vg haplotypes` and/or `vg giraffe --haplotype-name`
Expand Down Expand Up @@ -228,7 +228,7 @@ There are multiple read mappers in `vg`:

* `vg giraffe` is designed to be fast for highly accurate short reads, against graphs with haplotype information.
* `vg map` is a general-purpose read mapper.
* `vg mpmap` does "munti-path" mapping, to allow describing local alignment uncertainty. [This is useful for transcriptomics.](#Transcriptomic-analysis)
* `vg mpmap` does "multi-path" mapping, to allow describing local alignment uncertainty. [This is useful for transcriptomics.](#Transcriptomic-analysis)

#### Mapping with `vg giraffe`

Expand Down
40 changes: 32 additions & 8 deletions scripts/plot-pr.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@ require("tidyverse")
require("ggrepel")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
# If the count column is not present, add it
dat$count <- rep(1, nrow(dat))
}

Expand Down Expand Up @@ -48,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
name.lists <- name.lists[name.order]

# Determine colors for aligners
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
# We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
cursor <- 1

Expand Down Expand Up @@ -95,6 +100,25 @@ colors <- colors[aligner.names]
# Add a bin "factor" to each row, binning float MAPQs into bins from 0 to 60 (and inclusing bins for out of range on each end)
dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))

# We need to work out our scales
reads.per.condition <- sum(dat$count) / length(aligner.names)
# Start with small scale
labels <- c("1e-0","1e-1","1e-2","1e-3","1e-4")
breaks <- c(0,1,2,3,4)
limits <- c(0, 4)
if ( reads.per.condition > 10000 ) {
# Use big scale if there are a lot of reads
labels <- c(labels, "1e-5","1e-6")
breaks <- c(breaks, 5,6)
limits <- c(0, 6)
}
if ( reads.per.condition > 1000000 ) {
# Use big scale if there are a lot of reads
labels <- c(labels, "1e-7","1e-8","1e-9")
breaks <- c(breaks, 7,8,9)
limits <- c(0, 9)
}

# Now we break out the cool dplyr/magrittr/tidyverse tools like %>% pipe operators.
dat.roc <- dat %>%
# Make positive and negative count columns
Expand Down Expand Up @@ -127,15 +151,15 @@ dat.plot <- dat.roc %>%
# There will be points with variable sizes
geom_point(aes(size=Positive+Negative)) +
# We manually assign these selected colors
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
# And we want a size legend
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
# And we want a fake log Y axis
scale_y_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
scale_y_continuous(labels=labels, breaks=breaks, limits=limits) +
# Label it
ylab("1 - Precision") +
# And we want a fake log X axis
scale_x_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
scale_x_continuous(labels=labels, breaks=breaks, limits=limits) +
# Label it
xlab("1 - Recall") +
# And we want this cool theme
Expand Down
32 changes: 22 additions & 10 deletions scripts/plot-qq.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@

# plot-qq.R <stats TSV> <destination image file> [<comma-separated "aligner" names to include> [title]]

list.of.packages <- c("tidyverse", "ggrepel", "svglite")
list.of.packages <- c("tidyverse", "ggrepel", "svglite", "binom")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
require("tidyverse")
require("ggrepel")
require("binom")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down Expand Up @@ -47,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
name.lists <- name.lists[name.order]

# Determine colors for aligners
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
# We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
cursor <- 1

Expand Down Expand Up @@ -93,14 +99,20 @@ colors <- colors[aligner.names]

dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))

x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count)))
x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count), select(binom.confint(sum(correct * count), sum(count), conf.level=0.9, methods="lrt"), c("lower", "upper"))))

print(names(x))
print(x$ci)

dat.plot <- ggplot(x, aes(1-mapprob+1e-9, 1-observed+1e-9, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_y_log10("measured error", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
# Now plot the points as different sizes, but the error bar line ranges as a consistent size
dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
geom_point() +
# Only aesthetics that depend on each point need to be in the aes() mapping
geom_linerange(aes(x=1-mapprob+1e-7, ymin=1-upper+1e-7, ymax=1-lower+1e-7), linewidth=0.2, position=position_dodge(.05)) +
geom_smooth() +
geom_abline(intercept=0, slope=1, linetype=2) +
theme_bw()
Expand Down
9 changes: 7 additions & 2 deletions scripts/plot-roc-log.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,13 @@ require("tidyverse")
require("ggrepel")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down
9 changes: 7 additions & 2 deletions scripts/plot-roc.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ require("ggrepel")
require("scales") # For squish

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down
Loading

1 comment on commit 512f201

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch annotate_subpath. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 16967 seconds

Please sign in to comment.