Merge branch 'vgteam:master' into annotate_subpath

vgteam · Mar 1, 2024 · 512f201 · 512f201 · adamnovak · Apr 9, 2024
2 parents a03ff22 + 67c20e9
commit 512f201
Show file tree

Hide file tree

Showing 29 changed files with 976 additions and 472 deletions.
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ Please cite:
 * [The VG Paper](https://doi.org/10.1038/nbt.4227) when using `vg`
 * [The VG Giraffe Paper](https://doi.org/10.1126/science.abg8871) when using `vg giraffe`
 * [The VG Call Paper](https://doi.org/10.1186/s13059-020-1941-7) when SV genotyping with `vg call`
-* [The GBZ Paper](https://doi.org/10.1093/bioinformatics/btad097) when using GBZ
+* [The GBZ Paper](https://doi.org/10.1093/bioinformatics/btac656) when using GBZ
 * [The HPRC Paper](https://doi.org/10.1038/s41586-023-05896-x) when using `vg deconstruct`
 * [The Snarls Paper](https://doi.org/10.1089/cmb.2017.0251) when using `vg snarls`
 * [The Personalized Pangenome Paper](https://doi.org/10.1101/2023.12.13.571553) when using `vg haplotypes` and/or `vg giraffe --haplotype-name`
@@ -228,7 +228,7 @@ There are multiple read mappers in `vg`:
 
 * `vg giraffe` is designed to be fast for highly accurate short reads, against graphs with haplotype information.
 * `vg map` is a general-purpose read mapper.
-* `vg mpmap` does "munti-path" mapping, to allow describing local alignment uncertainty. [This is useful for transcriptomics.](#Transcriptomic-analysis)
+* `vg mpmap` does "multi-path" mapping, to allow describing local alignment uncertainty. [This is useful for transcriptomics.](#Transcriptomic-analysis)
 
 #### Mapping with `vg giraffe`
 

diff --git a/deps/gbwtgraph b/deps/gbwtgraph
diff --git a/scripts/plot-pr.R b/scripts/plot-pr.R
@@ -10,11 +10,16 @@ require("tidyverse")
 require("ggrepel")
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
-    # If the count column is not present, add i
+    # If the count column is not present, add it
     dat$count <- rep(1, nrow(dat))
 }
 
@@ -48,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
 name.lists <- name.lists[name.order]
 
 # Determine colors for aligners
-bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
-light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
+bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
+light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
 # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
 cursor <- 1
 
@@ -95,6 +100,25 @@ colors <- colors[aligner.names]
 # Add a bin "factor" to each row, binning float MAPQs into bins from 0 to 60 (and including bins for out of range on each end)
 dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))
 
+# We need to work out our scales
+reads.per.condition <- sum(dat$count) / length(aligner.names)
+# Start with small scale
+labels <- c("1e-0","1e-1","1e-2","1e-3","1e-4")
+breaks <- c(0,1,2,3,4)
+limits <- c(0, 4)
+if ( reads.per.condition > 10000 ) {
+    # Use big scale if there are a lot of reads
+    labels <- c(labels, "1e-5","1e-6")
+    breaks <- c(breaks, 5,6)
+    limits <- c(0, 6)
+}
+if ( reads.per.condition > 1000000 ) {
+    # Use an even bigger scale if there are more than a million reads per condition
+    labels <- c(labels, "1e-7","1e-8","1e-9")
+    breaks <- c(breaks, 7,8,9)
+    limits <- c(0, 9)
+}
+
 # Now we break out the cool dplyr/magrittr/tidyverse tools like %>% pipe operators.
 dat.roc <- dat %>%
     # Make positive and negative count columns
@@ -127,15 +151,15 @@ dat.plot <- dat.roc %>%
         # There will be points with variable sizes
         geom_point(aes(size=Positive+Negative)) +
         # We manually assign these selected colors
-        scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
+        scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
         # And we want a size legend
         scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
         # And we want a fake log Y axis
-        scale_y_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
+        scale_y_continuous(labels=labels, breaks=breaks, limits=limits) +
         # Label it
         ylab("1 - Precision") +
         # And we want a fake log X axis
-        scale_x_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
+        scale_x_continuous(labels=labels, breaks=breaks, limits=limits) +
         # Label it
         xlab("1 - Recall") +
         # And we want this cool theme

diff --git a/scripts/plot-qq.R b/scripts/plot-qq.R
@@ -2,15 +2,21 @@
 
 # plot-qq.R <stats TSV> <destination image file> [<comma-separated "aligner" names to include> [title]]
 
-list.of.packages <- c("tidyverse", "ggrepel", "svglite")
+list.of.packages <- c("tidyverse", "ggrepel", "svglite", "binom")
 new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
 if(length(new.packages)) install.packages(new.packages)
 require("tidyverse")
 require("ggrepel")
+require("binom")
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
     # If the count column is not present, add it
@@ -47,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
 name.lists <- name.lists[name.order]
 
 # Determine colors for aligners
-bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
-light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
+bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
+light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
 # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
 cursor <- 1
 
@@ -93,14 +99,20 @@ colors <- colors[aligner.names]
 
 dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))
 
-x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count)))
+x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count), select(binom.confint(sum(correct * count), sum(count), conf.level=0.9, methods="lrt"), c("lower", "upper"))))
+
+print(names(x))
+print(x$ci)
 
-dat.plot <- ggplot(x, aes(1-mapprob+1e-9, 1-observed+1e-9, color=aligner, size=N, weight=N, label=round(mapq,2))) +
-    scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
-    scale_y_log10("measured error", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
-    scale_x_log10("error estimate", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
+# Now plot the points as different sizes, but the error bar line ranges as a consistent size
+dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) +
+    scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
+    scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
+    scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
     scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
     geom_point() +
+    # Only aesthetics that depend on each point need to be in the aes() mapping
+    geom_linerange(aes(x=1-mapprob+1e-7, ymin=1-upper+1e-7, ymax=1-lower+1e-7), linewidth=0.2, position=position_dodge(.05)) +
     geom_smooth() +
     geom_abline(intercept=0, slope=1, linetype=2) +
     theme_bw()

diff --git a/scripts/plot-roc-log.R b/scripts/plot-roc-log.R
@@ -20,8 +20,13 @@ require("tidyverse")
 require("ggrepel")
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
     # If the count column is not present, add it

diff --git a/scripts/plot-roc.R b/scripts/plot-roc.R
@@ -21,8 +21,13 @@ require("ggrepel")
 require("scales") # For squish
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
     # If the count column is not present, add it
+1 −1		LICENSE
+4 −0		README.md
+5 −1		SERIALIZATION.md
+76 −7		include/gbwtgraph/algorithms.h
+10 −0		include/gbwtgraph/gbwtgraph.h
+0 −93		include/gbwtgraph/internal.h
+68 −28		include/gbwtgraph/path_cover.h
+108 −10		include/gbwtgraph/utils.h
+132 −12		src/algorithms.cpp
+42 −9		src/gbwtgraph.cpp
+36 −1		src/gbz_stats.cpp
+3 −3		src/gfa.cpp
+0 −341		src/internal.cpp
+207 −253		src/path_cover.cpp
+394 −1		src/utils.cpp
+32 −0		tests/gfas/components_ref.gfa
+202 −2		tests/test_algorithms.cpp
+51 −0		tests/test_gbwtgraph.cpp
+28 −0		tests/test_gfa.cpp
+441 −170		tests/test_path_cover.cpp
+262 −0		tests/test_utils.cpp