From 08ab5b749faa4169e9af41c3eb2784fc42561d5a Mon Sep 17 00:00:00 2001
From: Jean Monlong
Date: Wed, 27 Nov 2024 12:30:54 +0100
Subject: [PATCH 01/14] draft script to make markdown manpage

---
 doc/README.md       | 25 +++++++++++++++++++
 doc/vgmanmd.desc.md |  9 +++++++
 doc/vgmanmd.py      | 58 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+)
 create mode 100644 doc/README.md
 create mode 100644 doc/vgmanmd.desc.md
 create mode 100644 doc/vgmanmd.py

diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 00000000000..0a2626842bd
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,25 @@
+# Automated markdown manpage
+
+Make a markdown document with the usage messages of (selected) `vg` subcommands.
+It calls the `vg` command, so the output will match the version available on the command line.
+
+```sh
+python3 vgmanmd.py > man.md
+```
+
+Then copy the markdown content to a wiki page, for example, or move the markdown page somewhere in the vg repo.
+
+## Edit descriptions
+
+See the [`vgmanmd.desc.md`](vgmanmd.desc.md) file.
+This file must be in the current directory.
+Also, in the title `# <cmd>`, `<cmd>` should match the name of the command.
+The name `intro` is used for the introduction, to be placed after the table of contents.
+The names don't need to match a command, and not all commands have to be described: if available, a subcommand description will be added before its usage message.
+
+The goal of the descriptions is to be vague enough that we don't need to change them often, but informative enough that new users can get a good feel for the commands, plus pointers to other resources (e.g. Wiki pages).
+
+## Change list of selected subcommands
+
+At the top of [`vgmanmd.py`](vgmanmd.py), change the *cmds* list.
+
diff --git a/doc/vgmanmd.desc.md b/doc/vgmanmd.desc.md
new file mode 100644
index 00000000000..b9d87924d4b
--- /dev/null
+++ b/doc/vgmanmd.desc.md
@@ -0,0 +1,9 @@
+# intro
+
+For **mapping**: [`vg giraffe`](#giraffe), ... See more at [wiki page XX](https://github...)
+
+For ..."
+
+# giraffe
+
+Maps reads to a pangenome **blazingly fast**.
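The `# <cmd>` header convention described in the README above drives the description lookup in `vgmanmd.py` (added just below). As a minimal sketch of that mapping, assuming a couple of made-up sections rather than the real `vgmanmd.desc.md`:

```python
# Sketch of how vgmanmd.py turns "# <cmd>" headers into a {name: description} dict.
# The sample lines below are illustrative only; the real input is vgmanmd.desc.md.
sample_lines = [
    "# intro\n",
    "Overview text placed after the table of contents.\n",
    "# giraffe\n",
    "Maps reads to a pangenome.\n",
]

desc = {}
cur_header = ''
cur_desc = ''
for line in sample_lines:
    if line.startswith('#'):
        # a new "# <name>" header closes the previous description
        if cur_header != '':
            desc[cur_header] = cur_desc
        cur_header = line.rstrip().replace('# ', '')
        cur_desc = ''
    else:
        cur_desc += line
if cur_header != '':
    desc[cur_header] = cur_desc

print(sorted(desc))  # ['giraffe', 'intro']
```

A section named after a subcommand is printed before that subcommand's usage message, `intro` is printed at the top of the page, and any other section name is simply ignored by the generator.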
diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py
new file mode 100644
index 00000000000..92917abca54
--- /dev/null
+++ b/doc/vgmanmd.py
@@ -0,0 +1,58 @@
+import subprocess
+
+
+# commands to include
+cmds = ['index', 'view', 'autoindex', 'pack', 'giraffe', 'map', 'call',
+        'mpmap', 'rna', 'chunk', 'stats', 'gbwt', 'paths', 'find',
+        'construct', 'minimizer', 'haplotypes', 'deconstruct', 'convert',
+        'gamsort', 'surject']
+cmds.sort()
+
+# parse short descriptions
+desc_inf = open('vgmanmd.desc.md', 'rt')
+desc = {}
+cur_desc = ''
+cur_header = ''
+for line in desc_inf:
+    if line[0] == '#':
+        # new header
+        if cur_header != '':
+            desc[cur_header] = cur_desc
+        cur_header = line.rstrip().replace('# ', '')
+        cur_desc = ''
+    else:
+        cur_desc += line
+desc[cur_header] = cur_desc
+desc_inf.close()
+
+# start page
+print('# vg manpage')
+
+# get vg version
+ret = subprocess.run(['vg', 'version'], capture_output=True)
+vg_v = ret.stdout.decode().split('\n')[0]
+
+print('\n*Automatically made for ' + vg_v + '.*\n\n')
+
+# table of contents
+for cmd in cmds:
+    print(' - [vg {cmd}](#{cmd})'.format(cmd=cmd))
+
+print('\n\n')
+
+# add intro text
+if 'intro' in desc:
+    print(desc['intro'])
+    print('\n\n')
+
+# help for each cmd
+for cmd in cmds:
+    print('## {cmd}\n\n'.format(cmd=cmd))
+    if cmd in desc:
+        print(desc[cmd])
+        print('\n\n')
+    # run subcommand with -h
+    ret = subprocess.run(['vg', cmd, '-h'], capture_output=True)
+    print('```')
+    print(ret.stderr.decode())
+    print('```\n\n')

From 1f93e47ec0f5b3b8c9b672dc7232240ac4299b1d Mon Sep 17 00:00:00 2001
From: Xian
Date: Thu, 28 Nov 2024 15:25:55 +0100
Subject: [PATCH 02/14] Add intro/index to the man page

---
 doc/vgmanmd.desc.md | 101 ++++++++++++++++++++++++++++++++++++++++++--
 doc/vgmanmd.py      |  17 ++++----
 2 files changed, 106 insertions(+), 12 deletions(-)

diff --git a/doc/vgmanmd.desc.md b/doc/vgmanmd.desc.md
index b9d87924d4b..0e094da5f1c 100644
--- a/doc/vgmanmd.desc.md
+++ b/doc/vgmanmd.desc.md
@@ -1,9 +1,102 @@
+# file-info
+
+This file contains extra text that will be added to the man pages generated with doc/vgmanmd.py.
+The `# intro` section is added to the top of the page, and each `# subcommand` section will be added to the given subcommand.
+When adding a new subcommand, add it to the appropriate section(s) in the intro.
+
 # intro

-For **mapping**: [`vg giraffe`](#giraffe), ... See more at [wiki page XX](https://github...)
+This is a redundant and incomplete list of subcommands of vg, organized by common uses. For a complete list of subcommands, run `vg help`.
+
+For more in-depth explanations of tools and workflows, see the [vg wiki page](https://github.com/vgteam/vg/wiki)
+
+- **Graph construction and indexing**
+  See the [wiki page](https://github.com/vgteam/vg/wiki/Index-Types) for an overview of vg indexes.
+  - [`vg autoindex`](#autoindex) automatically construct a graph and indexes for a specific workflow (e.g. giraffe, rpvg). [wiki page](https://github.com/vgteam/vg/wiki/Automatic-indexing-for-read-mapping-and-downstream-inference)
+  - [`vg construct`](#construct) manually construct a graph from a reference and variants. [wiki page](https://github.com/vgteam/vg/wiki/Construction)
+  - [`vg index`](#index) manually build individual indexes (xg, distance, GCSA, etc). [wiki page](https://github.com/vgteam/vg/wiki/Index-Construction)
+  - [`vg gbwt`](#gbwt) manually build and manipulate GBWTs and indexes (GBWTgraph, GBZ, r-index).
[wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) + - [`vg minimizer`](#minimizer) manually build a minimizer index for mapping. + - [`vg haplotypes`](#haplotypes) haplotype sample a graph. Recommended for mapping with giraffe. [wiki page](https://github.com/vgteam/vg/wiki/Haplotype-Sampling) +- **Read mapping** + - [`vg giraffe`](#giraffe) fast haplotype-aware short read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Mapping-short-reads-with-Giraffe) + - [`vg mpmap`](#mpmap) splice-aware multipath alignment of short reads. [wiki page](https://github.com/vgteam/vg/wiki/Multipath-alignments-and-vg-mpmap) + - [`vg map`](#map) MEM-based read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Working-with-a-whole-genome-variation-graph) +- **Downstream analyses** + - [`vg pack`](#pack) convert alignments to a compact coverage index. Used with [vg call](#call) + - [`vg call`](#call) call or genotype VCF variants. Uses [vg pack](#pack). [wiki page](https://github.com/vgteam/vg/wiki/SV-Genotyping-and-variant-calling) + - [`vg rna`](#rna) transciptomic analyses. [wiki page](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). Also see [rpvg](https://github.com/jonassibbesen/rpvg) + - [`vg deconstruct`](#deconstruct) create a VCF from variation in the graph. [wiki page](https://github.com/vgteam/vg/wiki/VCF-export-with-vg-deconstruct) +- **Working with read alignments** + - [`vg gamsort`](#gamsort) sort a GAM/GAF file or index a sorted GAM file. + - [`vg filter`](#filter) filter alignments by properties. + - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). + - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). + - [`vg sim`](#sim) simulate reads from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Simulating-reads-with-vg-sim) +- **Graph and read statistics** + - [`vg stats`](#stats) get stats about the graph. + - [`vg paths`](#paths) get stats about the paths. [wiki page](https://github.com/vgteam/vg/wiki/Path-Metadata-Model) + - [`vg gbwt`](#gbwt) get stats about a GBWT. + - [`vg filter`](#filter) get stats about alignments (use `--tsv-out`). +- **Manipulating a graph** + - [`vg mod`](#mod) filter, transform, and edit the graph. + - [`vg prune`](#prune) prune the graph for GCSA2 indexing. + - [`vg ids`](#ids) manipulate graph node ids. + - [`vg paths`](#paths) manipulate paths in a graph. + - [`vg gbwt`](#gbwt) manipulate GBWTs and associated indexes. [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) + - [`vg annotate`](#annotate) annotate a graph or alignments. +- **Conversion between formats** + - [`vg convert`](#convert) convert between handle graph formats and GFA, and between alignment formats. + - [`vg view`](#view) convert between non-handle graph formats and alignment formats (dot, json, turtle...). + - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). + - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). + - [`vg paths`](#paths) extract a fasta from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Extracting-a-FASTA-from-a-Graph) +- **Subgraph extraction** + - [`vg chunk`](#chunk) split a graph and/or alignment into smaller chunks. + - [`vg find`](#find) use an index to find nodes, edges, kmers, paths, or positions. + +# annotate + +Annotate alignments with graphs and graphs with alignments. 
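The `vgmanmd.py` script shown earlier pairs each of these `# subcommand` descriptions with the tool's own usage text. A minimal sketch of that per-subcommand step, assuming a `vg` binary is on the PATH and using a one-entry stand-in for the dict parsed from `vgmanmd.desc.md`:

```python
# Sketch of the per-subcommand step in vgmanmd.py: print the optional description,
# then capture `vg <cmd> -h` into a fenced code block.
# Assumes `vg` is on PATH; the desc dict here is a stand-in for the parsed file.
import subprocess

desc = {'giraffe': 'Maps reads to a pangenome.\n'}

for cmd in ['giraffe']:
    print('## {cmd}\n\n'.format(cmd=cmd))
    if cmd in desc:
        print(desc[cmd])
        print('\n\n')
    ret = subprocess.run(['vg', cmd, '-h'], capture_output=True)
    print('```')
    print(ret.stderr.decode())  # vg writes its usage text to stderr
    print('```\n\n')
```

Because the usage text is taken from stderr, subcommands that do not accept `-h` contribute blocks that begin with lines like `vg: invalid option -- 'h'`, as seen in some sections of the generated page below.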
+ +# autoindex + +Mapping tool-oriented index construction from interchange formats. + +# convert + +Convert graphs between handle-graph compliant formats as well as GFA. + +# find + +Use an index to find nodes, edges, kmers, paths, or positions. + +# ids + +Manipulate node ids. + +# pack + +Convert alignments to a compact coverage index. + +# paths + +Traverse paths in the graph. + +# view + +format conversions for graphs and alignments + +# filter + +Filter alignments by properties. + +`vg filter --tsv-out` can be used to produce a TSV file of user-specified fields from the GAM file. For example, + +`vg filter --tsv-out "name;mapping_quality" ` -For ..." +is the equivalent of -# giraffe +`vg view -aj | jq -r '[.name,.mapping_quality] | @tsv'` -Maps reads to a pangenome **blazingly fast**. +To find which fields are stored in a GAM file, use [`vg view`](#view) to view the GAM as a JSON file. diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py index 92917abca54..984994e26e7 100644 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -2,10 +2,11 @@ # commands to include +######### If you add to this, also add it to the intro section of vgmandmd.desc.md cmds = ['index', 'view', 'autoindex', 'pack', 'giraffe', 'map', 'call', - 'mpmap', 'rna', 'chunk', 'stats', 'gbwt', 'paths', 'find', + 'mpmap', 'rna', 'chunk', 'stats', 'gbwt', 'paths', 'find', 'filter', 'construct', 'minimizer', 'haplotypes', 'deconstruct', 'convert', - 'gamsort', 'surject'] + 'gamsort', 'surject', 'mod', 'prune', 'ids', 'sim', 'annotate'] cmds.sort() # parse short descriptions @@ -34,17 +35,17 @@ print('\n*Automatically made for ' + vg_v + '.*\n\n') -# table of contents -for cmd in cmds: - print(' - [vg {cmd}](#{cmd})'.format(cmd=cmd)) - -print('\n\n') - # add intro text if 'intro' in desc: print(desc['intro']) print('\n\n') +# table of contents +#for cmd in cmds: +# print(' - [vg {cmd}](#{cmd})'.format(cmd=cmd)) +# +#print('\n\n') + # help for each cmd for cmd in cmds: print('## {cmd}\n\n'.format(cmd=cmd)) From aab756640131017653dd7a59ff121eecf038e92b Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 28 Nov 2024 15:47:58 +0100 Subject: [PATCH 03/14] Add the man page but idk where to put it --- doc/man.md | 1387 +++++++++++++++++++++++++++++++++++++++++++ doc/vgmanmd.desc.md | 2 +- 2 files changed, 1388 insertions(+), 1 deletion(-) create mode 100644 doc/man.md diff --git a/doc/man.md b/doc/man.md new file mode 100644 index 00000000000..e4a87c88ca0 --- /dev/null +++ b/doc/man.md @@ -0,0 +1,1387 @@ +# vg manpage + +*Automatically made for vg version v1.61.0-36-g64d7e82e0 "Plodio".* + + + +This is a redundant and incomplete list of subcommands of vg, organized by common uses. For a complete list of subcommands, run `vg help`. + +For more in-depth explanations of tools and workflows, see the [vg wiki page](https://github.com/vgteam/vg/wiki) + +- **Graph construction and indexing** + See the [wiki page](https://github.com/vgteam/vg/wiki/Index-Types) for an overview of vg indexes. + - [`vg autoindex`](#autoindex) automatically construct a graph and indexes for a specific workflow (e.g. giraffe, rpvg). [wiki page](https://github.com/vgteam/vg/wiki/Automatic-indexing-for-read-mapping-and-downstream-inference) + - [`vg construct`](#construct) manually construct a graph from a reference and variants. [wiki page](https://github.com/vgteam/vg/wiki/Construction) + - [`vg index`](#index) manually build individual indexes (xg, distance, GCSA, etc). 
[wiki page](https://github.com/vgteam/vg/wiki/Index-Construction) + - [`vg gbwt`](#gbwt) manually build and manipulate GBWTs and indexes (GBWTgraph, GBZ, r-index). [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) + - [`vg minimizer`](#minimizer) manually build a minimizer index for mapping. + - [`vg haplotypes`](#haplotypes) haplotype sample a graph. Recommended for mapping with giraffe. [wiki page](https://github.com/vgteam/vg/wiki/Haplotype-Sampling) +- **Read mapping** + - [`vg giraffe`](#giraffe) fast haplotype-aware short read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Mapping-short-reads-with-Giraffe) + - [`vg mpmap`](#mpmap) splice-aware multipath alignment of short reads. [wiki page](https://github.com/vgteam/vg/wiki/Multipath-alignments-and-vg-mpmap) + - [`vg map`](#map) MEM-based read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Working-with-a-whole-genome-variation-graph) +- **Downstream analyses** + - [`vg pack`](#pack) convert alignments to a compact coverage index. Used with [vg call](#call) + - [`vg call`](#call) call or genotype VCF variants. Uses [vg pack](#pack). [wiki page](https://github.com/vgteam/vg/wiki/SV-Genotyping-and-variant-calling) + - [`vg rna`](#rna) construct splicing graphs and pantranscriptomes. [wiki page](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). Also see [rpvg](https://github.com/jonassibbesen/rpvg) + - [`vg deconstruct`](#deconstruct) create a VCF from variation in the graph. [wiki page](https://github.com/vgteam/vg/wiki/VCF-export-with-vg-deconstruct) +- **Working with read alignments** + - [`vg gamsort`](#gamsort) sort a GAM/GAF file or index a sorted GAM file. + - [`vg filter`](#filter) filter alignments by properties. + - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). + - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). + - [`vg sim`](#sim) simulate reads from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Simulating-reads-with-vg-sim) +- **Graph and read statistics** + - [`vg stats`](#stats) get stats about the graph. + - [`vg paths`](#paths) get stats about the paths. [wiki page](https://github.com/vgteam/vg/wiki/Path-Metadata-Model) + - [`vg gbwt`](#gbwt) get stats about a GBWT. + - [`vg filter`](#filter) get stats about alignments (use `--tsv-out`). +- **Manipulating a graph** + - [`vg mod`](#mod) filter, transform, and edit the graph. + - [`vg prune`](#prune) prune the graph for GCSA2 indexing. + - [`vg ids`](#ids) manipulate graph node ids. + - [`vg paths`](#paths) manipulate paths in a graph. + - [`vg gbwt`](#gbwt) manipulate GBWTs and associated indexes. [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) + - [`vg annotate`](#annotate) annotate a graph or alignments. +- **Conversion between formats** + - [`vg convert`](#convert) convert between handle graph formats and GFA, and between alignment formats. + - [`vg view`](#view) convert between non-handle graph formats and alignment formats (dot, json, turtle...). + - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). + - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). + - [`vg paths`](#paths) extract a fasta from a graph. 
[wiki page](https://github.com/vgteam/vg/wiki/Extracting-a-FASTA-from-a-Graph) +- **Subgraph extraction** + - [`vg chunk`](#chunk) split a graph and/or alignment into smaller chunks. + - [`vg find`](#find) use an index to find nodes, edges, kmers, paths, or positions. + + + + + +## annotate + + + +Annotate alignments with graphs and graphs with alignments. + + + + + +``` +usage: vg annotate [options] >output.{gam,vg,tsv} +graph annotation options: + -x, --xg-name FILE xg index or graph to annotate (required) + -b, --bed-name FILE a BED file to convert to GAM. May repeat. + -f, --gff-name FILE a GFF3 file to convert to GAM. May repeat. + -g, --ggff output at GGFF subgraph annotation file instead of GAM (requires -s) + -F, --gaf-output output in GAF format rather than GAM + -s, --snarls FILE file containing snarls to expand GFF intervals into +alignment annotation options: + -a, --gam FILE file of Alignments to annotate (required) + -x, --xg-name FILE xg index of the graph against which the Alignments are aligned (required) + -p, --positions annotate alignments with reference positions + -m, --multi-position annotate alignments with multiple reference positions + -l, --search-limit N when annotating with positions, search this far for paths (default: read length) + -b, --bed-name FILE annotate alignments with overlapping region names from this BED. May repeat. + -n, --novelty output TSV table with header describing how much of each Alignment is novel + -t, --threads use the specified number of threads + +``` + + +## autoindex + + + +Mapping tool-oriented index construction from interchange formats. + + + + + +``` +usage: vg autoindex [options] +options: + output: + -p, --prefix PREFIX prefix to use for all output (default: index) + -w, --workflow NAME workflow to produce indexes for, can be provided multiple + times. options: map, mpmap, rpvg, giraffe (default: map) + input data: + -r, --ref-fasta FILE FASTA file containing the reference sequence (may repeat) + -v, --vcf FILE VCF file with sequence names matching -r (may repeat) + -i, --ins-fasta FILE FASTA file with sequences of INS variants from -v + -g, --gfa FILE GFA file to make a graph from + -x, --tx-gff FILE GTF/GFF file with transcript annotations (may repeat) + -H, --hap-tx-gff FILE GTF/GFF file with transcript annotations of a named haplotype (may repeat) + configuration: + -f, --gff-feature STR GTF/GFF feature type (col. 3) to add to graph (default: exon) + -a, --gff-tx-tag STR GTF/GFF tag (in col. 
9) for transcript ID (default: transcript_id) + logging and computation: + -T, --tmp-dir DIR temporary directory to use for intermediate files + -M, --target-mem MEM target max memory usage (not exact, formatted INT[kMG]) + (default: 1/2 of available) + -t, --threads NUM number of threads (default: all available) + -V, --verbosity NUM log to stderr (0 = none, 1 = basic, 2 = debug; default 1) + -h, --help print this help message to stderr and exit + +``` + + +## call + + +``` +usage: vg call [options] > output.vcf +Call variants or genotype known variants + +support calling options: + -k, --pack FILE Supports created from vg pack for given input graph + -m, --min-support M,N Minimum allele support (M) and minimum site support (N) for call [default = 2,4] + -e, --baseline-error X,Y Baseline error rates for Poisson model for small (X) and large (Y) variants [default= 0.005,0.01] + -B, --bias-mode Use old ratio-based genotyping algorithm as opposed to porbablistic model + -b, --het-bias M,N Homozygous alt/ref allele must have >= M/N times more support than the next best allele [default = 6,6] +GAF options: + -G, --gaf Output GAF genotypes instead of VCF + -T, --traversals Output all candidate traversals in GAF without doing any genotyping + -M, --trav-padding N Extend each flank of traversals (from -T) with reference path by N bases if possible +general options: + -v, --vcf FILE VCF file to genotype (must have been used to construct input graph with -a) + -a, --genotype-snarls Genotype every snarl, including reference calls (use to compare multiple samples) + -A, --all-snarls Genotype all snarls, including nested child snarls (like deconstruct -a) + -c, --min-length N Genotype only snarls with at least one traversal of length >= N + -C, --max-length N Genotype only snarls where all traversals have length <= N + -f, --ref-fasta FILE Reference fasta (required if VCF contains symbolic deletions or inversions) + -i, --ins-fasta FILE Insertions fasta (required if VCF contains symbolic insertions) + -s, --sample NAME Sample name [default=SAMPLE] + -r, --snarls FILE Snarls (from vg snarls) to avoid recomputing. + -g, --gbwt FILE Only call genotypes that are present in given GBWT index. + -z, --gbz Only call genotypes that are present in GBZ index (applies only if input graph is GBZ). + -N, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names in output + -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output + -p, --ref-path NAME Reference path to call on (multipile allowed. defaults to all paths) + -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p) + -o, --ref-offset N Offset in reference path (multiple allowed, 1 per path) + -l, --ref-length N Override length of reference in the contig field of output VCF + -d, --ploidy N Ploidy of sample. Only 1 and 2 supported. (default: 2) + -R, --ploidy-regex RULES use the given comma-separated list of colon-delimited REGEX:PLOIDY rules to assign + ploidies to contigs not visited by the selected samples, or to all contigs simulated + from if no samples are used. Unmatched contigs get ploidy 2 (or that from -d). 
+ -n, --nested Activate nested calling mode (experimental) + -I, --chains Call chains instead of snarls (experimental) + --progress Show progress + -t, --threads N number of threads to use + +``` + + +## chunk + + +``` +usage: vg chunk [options] > [chunk.vg] +Splits a graph and/or alignment into smaller chunks + +Graph chunks are saved to .vg files, read chunks are saved to .gam files, and haplotype annotations are +saved to .annotate.txt files, of the form ----.. +The BASENAME is specified with -b and defaults to "./chunks". +For a single-range chunk (-p or -r), the graph data is sent to standard output instead of a file. + +options: + -x, --xg-name FILE use this graph or xg index to chunk subgraphs + -G, --gbwt-name FILE use this GBWT haplotype index for haplotype extraction (for -T) + -a, --gam-name FILE chunk this gam file instead of the graph (multiple allowed) + -g, --gam-and-graph when used in combination with -a, both gam and graph will be chunked + -F, --in-gaf input alignment is a sorted bgzipped GAF +path chunking: + -p, --path TARGET write the chunk in the specified (0-based inclusive, multiple allowed) + path range TARGET=path[:pos1[-pos2]] to standard output + -P, --path-list FILE write chunks for all path regions in (line - separated file). format + for each as in -p (all paths chunked unless otherwise specified) + -e, --input-bed FILE write chunks for all (0-based end-exclusive) bed regions + -S, --snarls FILE write given path-range(s) and all snarls fully contained in them, as alternative to -c +id range chunking: + -r, --node-range N:M write the chunk for the specified node range to standard output + -R, --node-ranges FILE write the chunk for each node range in (newline or whitespace separated) file + -n, --n-chunks N generate this many id-range chunks, which are determined using the xg index +simple gam chunking: + -m, --gam-split-size N split gam (specified with -a, sort/index not required) up into chunks with at most N reads each +component chunking: + -C, --components create a chunk for each connected component. if a targets given with (-p, -P, -r, -R), limit to components containing them + -M, --path-components create a chunk for each path in the graph's connected component +general: + -s, --chunk-size N create chunks spanning N bases (or nodes with -r/-R) for all input regions. + -o, --overlap N overlap between chunks when using -s [0] + -E, --output-bed FILE write all created chunks to a bed file + -b, --prefix BASENAME write output chunk files with the given base name. Files for chunk i will + be named: ----. [./chunk] + -c, --context-steps N expand the context of the chunk this many node steps [1] + -l, --context-length N expand the context of the chunk by this many bp [0] + -T, --trace trace haplotype threads in chunks (and only expand forward from input coordinates). + Produces a .annotate.txt file with haplotype frequencies for each chunk. + --no-embedded-haplotypes Don't load haplotypes from the graph. It is possible to -T without any haplotypes available. + -f, --fully-contained only return GAM alignments that are fully contained within chunk + -O, --output-fmt Specify output format (vg, pg, hg, gfa). 
[pg (vg with -T)] + -t, --threads N for tasks that can be done in parallel, use this many threads [1] + -h, --help + +``` + + +## construct + + +``` +usage: vg construct [options] >new.vg +options: +construct from a reference and variant calls: + -r, --reference FILE input FASTA reference (may repeat) + -v, --vcf FILE input VCF (may repeat) + -n, --rename V=F match contig V in the VCFs to contig F in the FASTAs (may repeat) + -a, --alt-paths save paths for alts of variants by SHA1 hash + -A, --alt-paths-plain save paths for alts of variants by variant ID if possible, otherwise SHA1 + (IDs must be unique across all input VCFs) + -R, --region REGION specify a VCF contig name or 1-based inclusive region (may repeat, if on different contigs) + -C, --region-is-chrom don't attempt to parse the regions (use when the reference + sequence name could be inadvertently parsed as a region) + -z, --region-size N variants per region to parallelize (default: 1024) + -t, --threads N use N threads to construct graph (defaults to numCPUs) + -S, --handle-sv include structural variants in construction of graph. + -I, --insertions FILE a FASTA file containing insertion sequences + (referred to in VCF) to add to graph. + -f, --flat-alts N don't chop up alternate alleles from input VCF + -l, --parse-max N don't chop up alternate alleles from input VCF longer than N (default: 100) + -i, --no-trim-indels don't remove the 1bp reference base from alt alleles of indels. + -N, --in-memory construct the entire graph in memory before outputting it. +construct from a multiple sequence alignment: + -M, --msa FILE input multiple sequence alignment + -F, --msa-format format of the MSA file (options: fasta, clustal; default fasta) + -d, --drop-msa-paths don't add paths for the MSA sequences into the graph +shared construction options: + -m, --node-max N limit the maximum allowable node sequence size (default: 32) + nodes greater than this threshold will be divided + Note: nodes larger than ~1024 bp can't be GCSA2-indexed + -p, --progress show progress + +``` + + +## convert + + + +Convert graphs between handle-graph compliant formats as well as GFA. + + + + + +``` +usage: vg convert [options] +input options: + -g, --gfa-in input in GFA format + -r, --in-rgfa-rank N import rgfa tags with rank <= N as paths [default=0] + -b, --gbwt-in FILE input graph is a GBWTGraph using the GBWT in FILE + --ref-sample STR change haplotypes for this sample to reference paths (may repeat) +gfa input options (use with -g): + -T, --gfa-trans FILE write gfa id conversions to FILE +output options: + -v, --vg-out output in VG's original Protobuf format [DEPRECATED: use -p instead]. + -a, --hash-out output in HashGraph format + -p, --packed-out output in PackedGraph format [default] + -x, --xg-out output in XG format + -f, --gfa-out output in GFA format + -H, --drop-haplotypes do not include haplotype paths in the output + (useful with GBWTGraph / GBZ inputs) +gfa output options (use with -f): + -P, --rgfa-path STR write given path as rGFA tags instead of lines + (multiple allowed, only rank-0 supported) + -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines + (multiple allowed, only rank-0 supported) + -B, --rgfa-pline paths written as rGFA tags also written as lines + -W, --no-wline Write all paths as GFA P-lines instead of W-lines. + Allows handling multiple phase blocks and subranges used together. + --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. 
+ Not compatible with other GFA output options or non-GBWT graphs. + --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, + but can't preserve original GFA coordinates. + --no-translation When using the GBWTGraph algorithm, convert the graph directly to GFA. + Do not use the translation to preserve original coordinates. +alignment options: + -G, --gam-to-gaf FILE convert GAM FILE to GAF + -F, --gaf-to-gam FILE convert GAF FILE to GAM +general options: + -t, --threads N use N threads (defaults to numCPUs) + +``` + + +## deconstruct + + +``` +usage: vg deconstruct [options] [-p|-P] +Outputs VCF records for Snarls present in a graph (relative to a chosen reference path). +options: + -p, --path NAME A reference path to deconstruct against (multiple allowed). + -P, --path-prefix NAME All paths [excluding GBWT threads / non-reference GBZ paths] beginning with NAME used as reference (multiple allowed). + Other non-ref paths not considered as samples. + -r, --snarls FILE Snarls file (from vg snarls) to avoid recomputing. + -g, --gbwt FILE consider alt traversals that correspond to GBWT haplotypes in FILE (not needed for GBZ graph input). + -T, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names and AT fields in output + -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output + -a, --all-snarls Process all snarls, including nested snarls (by default only top-level snarls reported). + -c, --context-jaccard N Set context mapping size used to disambiguate alleles at sites with multiple reference traversals (default: 10000). + -u, --untangle-travs Use context mapping to determine the reference-relative positions of each step in allele traversals (AP INFO field). + -K, --keep-conflicted Retain conflicted genotypes in output. + -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input). + -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample). + -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental] + -n, --nested Write a nested VCF, including special tags. [experimental] + -R, --star-allele Use *-alleles to denote alleles that span but do not cross the site. Only works with -n + -t, --threads N Use N threads + -v, --verbose Print some status messages + + +``` + + +## filter + + + +Filter alignments by properties. + +`vg filter --tsv-out` can be used to produce a TSV file of user-specified fields from the GAM file. For example, + +`vg filter --tsv-out "name;mapping_quality" ` + +is the equivalent of + +`vg view -aj | jq -r '[.name,.mapping_quality] | @tsv'` + +To find which fields are stored in a GAM file, use [`vg view`](#view) to view the GAM as a JSON file. + + + + +``` +vg: invalid option -- 'h' +usage: vg filter [options] > out.gam +Filter alignments by properties. 
+ +options: + -M, --input-mp-alns input is multipath alignments (GAMP) rather than GAM + -n, --name-prefix NAME keep only reads with this prefix in their names [default=''] + -N, --name-prefixes FILE keep reads with names with one of many prefixes, one per nonempty line + -e, --exact-name match read names exactly instead of by prefix + -a, --subsequence NAME keep reads that contain this subsequence + -A, --subsequences FILE keep reads that contain one of these subsequences, one per nonempty line + -p, --proper-pairs keep reads that are annotated as being properly paired + -P, --only-mapped keep reads that are mapped + -X, --exclude-contig REGEX drop reads with refpos annotations on contigs matching the given regex (may repeat) + -F, --exclude-feature NAME drop reads with the given feature in the "features" annotation (may repeat) + -s, --min-secondary N minimum score to keep secondary alignment + -r, --min-primary N minimum score to keep primary alignment + -O, --rescore re-score reads using default parameters and only alignment information + -f, --frac-score normalize score based on length + -u, --substitutions use substitution count instead of score + -o, --max-overhang N filter reads whose alignments begin or end with an insert > N [default=99999] + -m, --min-end-matches N filter reads that don't begin with at least N matches on each end + -S, --drop-split remove split reads taking nonexistent edges + -x, --xg-name FILE use this xg index or graph (required for -S and -D) + -v, --verbose print out statistics on numbers of reads filtered by what. + -V, --no-output print out statistics (as above) but do not write out filtered GAM. + -T, --tsv-out FIELD[;FIELD] do not write filtered gam but a tsv of the given fields + -q, --min-mapq N filter alignments with mapping quality < N + -E, --repeat-ends N filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end + -D, --defray-ends N clip back the ends of reads that are ambiguously aligned, up to N bases + -C, --defray-count N stop defraying after N nodes visited (used to keep runtime in check) [default=99999] + -d, --downsample S.P filter out all but the given portion 0.P of the reads. S may be an integer seed as in SAMtools + -i, --interleaved assume interleaved input. both ends will be filtered out if either fails filter + -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters + -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q. + -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the values are equal + similar to running jq 'select(.annotation.K==V)' on the json + -c, --correctly-mapped keep only reads that are marked as correctly-mapped + -U, --complement apply the complement of the filter implied by the other arguments. + -t, --threads N number of threads [1] + +``` + + +## find + + + +Use an index to find nodes, edges, kmers, paths, or positions. 
+ + + + + +``` +usage: vg find [options] >sub.vg +options: +graph features: + -x, --xg-name FILE use this xg index or graph (instead of rocksdb db) + -n, --node ID find node(s), return 1-hop context as graph + -N, --node-list FILE a white space or line delimited list of nodes to collect + --mapping FILE also include nodes that map to the selected node ids + -e, --edges-end ID return edges on end of node with ID + -s, --edges-start ID return edges on start of node with ID + -c, --context STEPS expand the context of the subgraph this many steps + -L, --use-length treat STEPS in -c or M in -r as a length in bases + -P, --position-in PATH find the position of the node (specified by -n) in the given path + -I, --list-paths write out the path names in the index + -r, --node-range N:M get nodes from N to M + -G, --gam GAM accumulate the graph touched by the alignments in the GAM + --connecting-start POS find the graph connecting from POS (node ID, + or -, node offset) to --connecting-end + --connecting-end POS find the graph connecting to POS (node ID, + or -, node offset) from --connecting-start + --connecting-range INT traverse up to INT bases when going from --connecting-start to --connecting-end (default: 100) +subgraphs by path range: + -p, --path TARGET find the node(s) in the specified path range(s) TARGET=path[:pos1[-pos2]] + -R, --path-bed FILE read our targets from the given BED FILE + -E, --path-dag with -p or -R, gets any node in the partial order from pos1 to pos2, assumes id sorted DAG + -W, --save-to PREFIX instead of writing target subgraphs to stdout, + write one per given target to a separate file named PREFIX[path]:[start]-[end].vg + -K, --subgraph-k K instead of graphs, write kmers from the subgraphs + -H, --gbwt FILE when enumerating kmers from subgraphs, determine their frequencies in this GBWT haplotype index +alignments: + -l, --sorted-gam FILE use this sorted, indexed GAM file + -F, --sorted-gaf FILE use this sorted, indexed GAF file + -o, --alns-on N:M write alignments which align to any of the nodes between N and M (inclusive) + -A, --to-graph VG get alignments to the provided subgraph +sequences: + -g, --gcsa FILE use this GCSA2 index of the sequence space of the graph (required for sequence queries) + -S, --sequence STR search for sequence STR using + -M, --mems STR describe the super-maximal exact matches of the STR (gcsa2) in JSON + -B, --reseed-length N find non-super-maximal MEMs inside SMEMs of length at least N + -f, --fast-reseed use fast SMEM reseeding algorithm + -Y, --max-mem N the maximum length of the MEM (default: GCSA2 order) + -Z, --min-mem N the minimum length of the MEM (default: 1) + -D, --distance return distance on path between pair of nodes (-n). if -P not used, best path chosen heurstically + -Q, --paths-named S return all paths whose names are prefixed with S (multiple allowed) + +``` + + +## gamsort + + +``` +gamsort: sort a GAM/GAF file, or index a sorted GAM file +Usage: gamsort [Options] gamfile +Options: + -i / --index FILE produce an index of the sorted GAM file + -d / --dumb-sort use naive sorting algorithm (no tmp files, faster for small GAMs) + -p / --progress Show progress. + -G / --gaf-input Input is a GAF file. + -c / --chunk-size Number of reads per chunk when sorting GAFs. + -t / --threads Use the specified number of threads. + + +``` + + +## gbwt + + +``` +usage: vg gbwt [options] [args] + +Manipulate GBWTs. Input GBWTs are loaded from input args or built in earlier steps. 
+The input graph is provided with one of -x, -G, or -Z + +General options: + -x, --xg-name FILE read the graph from FILE + -o, --output FILE write output GBWT to FILE + -d, --temp-dir DIR use directory DIR for temporary files + -p, --progress show progress and statistics + +GBWT construction parameters (for steps 1 and 4): + --buffer-size N GBWT construction buffer size in millions of nodes (default 100) + --id-interval N store path ids at one out of N positions (default 1024) + +Multithreading: + --num-jobs N use at most N parallel build jobs (for -v, -G, -A, -l, -P; default 4) + --num-threads N use N parallel search threads (for -b and -r; default 8) + +Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }): + -v, --vcf-input index the haplotypes in the VCF files specified in input args in parallel + (inputs must be over different contigs; requires -x, implies -f) + (does not store graph contigs in the GBWT) + --preset X use preset X (available: 1000gp) + --inputs-as-jobs create one build job for each input instead of using first-fit heuristic + --parse-only store the VCF parses without building GBWTs + (use -o for the file name prefix; skips subsequent steps) + --ignore-missing do not warn when variants are missing from the graph + --actual-phasing do not interpret unphased homozygous genotypes as phased + --force-phasing replace unphased genotypes with randomly phased ones + --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved + instead of creating a phase break + --batch-size N index the haplotypes in batches of N samples (default 200) + --sample-range X-Y index samples X to Y (inclusive, 0-based) + --rename V=P VCF contig V matches path P in the graph (may repeat) + --vcf-variants variants in the graph use VCF contig names instead of path names + --vcf-region C:X-Y restrict VCF contig C to coordinates X to Y (inclusive, 1-based; may repeat) + --exclude-sample X do not index the sample with name X (faster than -R; may repeat) + -G, --gfa-input index the walks or paths in the GFA file (one input arg) + --max-node N chop long segments into nodes of at most N bp (default 1024, use 0 to disable) + --path-regex X parse metadata as haplotypes from path names using regex X instead of vg-parser-compatible rules + --path-fields X parse metadata as haplotypes, mapping regex submatches to these fields instead of using vg-parser-compatible rules + --translation FILE write the segment to node translation table to FILE + -Z, --gbz-input extract GBWT and GBWTGraph from GBZ input (one input arg) + --translation FILE write the segment to node translation table to FILE + -I, --gg-in FILE load GBWTGraph from FILE and GBWT from input (one input arg) + -E, --index-paths index the embedded non-alt paths in the graph (requires -x, no input args) + -A, --alignment-input index the alignments in the GAF files specified in input args (requires -x) + --gam-format the input files are in GAM format instead of GAF format + +Step 2: Merge multiple input GBWTs (requires -o): + -m, --merge use the insertion algorithm + -f, --fast fast merging algorithm (node ids must not overlap) + -b, --parallel use the parallel algorithm + --chunk-size N search in chunks of N sequences (default 1) + --pos-buffer N use N MiB position buffers for each search thread (default 64) + --thread-buffer N use N MiB thread buffers for each search thread (default 256) + --merge-buffers N merge 2^N thread buffers into one file per merge job (default 6) + --merge-jobs N run N parallel merge 
jobs (default 4) + +Step 3: Alter GBWT (requires -o and one input GBWT): + -R, --remove-sample X remove the sample with name X from the index (may repeat) + --set-tag K=V set a GBWT tag (may repeat) + --set-reference X set sample X as the reference (may repeat) + +Step 4: Path cover GBWT construction (requires an input graph, -o, and one of { -a, -l, -P }): + -a, --augment-gbwt add a path cover of missing components (one input GBWT) + -l, --local-haplotypes sample local haplotypes (one input GBWT) + -P, --path-cover build a greedy path cover (no input GBWTs) + -n, --num-paths N find N paths per component (default 64 for -l, 16 otherwise) + -k, --context-length N use N-node contexts (default 4) + --pass-paths include named graph paths in local haplotype or greedy path cover GBWT + +Step 5: GBWTGraph construction (requires an input graph and one input GBWT): + -g, --graph-name FILE build GBWTGraph and store it in FILE + --gbz-format serialize both GBWT and GBWTGraph in GBZ format (makes -o unnecessary) + +Step 6: R-index construction (one input GBWT): + -r, --r-index FILE build an r-index and store it in FILE + +Step 7: Metadata (one input GBWT): + -M, --metadata print basic metadata + -C, --contigs print the number of contigs + -H, --haplotypes print the number of haplotypes + -S, --samples print the number of samples + -L, --list-names list contig/sample names (use with -C or -S) + -T, --path-names list path names + --tags list GBWT tags + +Step 8: Paths (one input GBWT): + -c, --count-paths print the number of paths + -e, --extract FILE extract paths in SDSL format to FILE + + +``` + + +## giraffe + + +``` +usage: + vg giraffe -Z graph.gbz [-d graph.dist -m graph.min] [other options] > output.gam + vg giraffe -Z graph.gbz --haplotype-name graph.hapl --kff-name sample.kff [other options] > output.gam + +Fast haplotype-aware short read mapper. 
+ +basic options: + -Z, --gbz-name FILE map to this GBZ graph + -d, --dist-name FILE cluster using this distance index + -m, --minimizer-name FILE use this minimizer index + -p, --progress show progress + -t, --threads INT number of mapping threads to use + -b, --parameter-preset NAME set computational parameters (fast / default) [default] + -h, --help print full help with all available options +input options: + -G, --gam-in FILE read and realign GAM-format reads from FILE + -f, --fastq-in FILE read and align FASTQ-format reads from FILE (two are allowed, one for each mate) + -i, --interleaved GAM/FASTQ input is interleaved pairs, for paired-end alignment +haplotype sampling: + --haplotype-name FILE sample from haplotype information in FILE + --kff-name FILE sample according to kmer counts in FILE + --index-basename STR name prefix for generated graph/index files (default: from graph name) +alternate graphs: + -x, --xg-name FILE map to this graph (if no -Z / -g), or use this graph for HTSLib output + -g, --graph-name FILE map to this GBWTGraph (if no -Z) + -H, --gbwt-name FILE use this GBWT index (when mapping to -x / -g) +output options: + -N, --sample NAME add this sample name + -R, --read-group NAME add this read group + -o, --output-format NAME output the alignments in NAME format (gam / gaf / json / tsv / SAM / BAM / CRAM) [gam] + --ref-paths FILE ordered list of paths in the graph, one per line or HTSlib .dict, for HTSLib @SQ headers + --named-coordinates produce GAM/GAF outputs in named-segment (GFA) space + -P, --prune-low-cplx prune short and low complexity anchors during linear format realignment + -n, --discard discard all output alignments (for profiling) + --output-basename NAME write output to a GAM file beginning with the given prefix for each setting combination + --report-name NAME write a TSV of output file and mapping speed to the given file + --show-work log how the mapper comes to its conclusions about mapping locations +Giraffe parameters: + -A, --rescue-algorithm NAME use algorithm NAME for rescue (none / dozeu / gssw) [dozeu] + --fragment-mean FLOAT force the fragment length distribution to have this mean (requires --fragment-stdev) + --fragment-stdev FLOAT force the fragment length distribution to have this standard deviation (requires --fragment-mean) + --track-provenance track how internal intermediate alignment candidates were arrived at + --track-correctness track if internal intermediate alignment candidates are correct (implies --track-provenance) + -B, --batch-size INT number of reads or pairs per batch to distribute to threads [512] +program options: + --watchdog-timeout INT complain after INT seconds working on a read or read pair [10] +scoring options: + --match INT use this match score [1] + --mismatch INT use this mismatch penalty [4] + --gap-open INT use this gap open penalty [6] + --gap-extend INT use this gap extension penalty [1] + --full-l-bonus INT the full-length alignment bonus [5] +result options: + -M, --max-multimaps INT produce up to INT alignments for each read [1] +computational parameters: + -c, --hit-cap INT use all minimizers with at most INT hits [10] + -C, --hard-hit-cap INT ignore all minimizers with more than INT hits [500] + -F, --score-fraction FLOAT select minimizers between hit caps until score is FLOAT of total [0.9] + -U, --max-min INT use at most INT minimizers [500] + --num-bp-per-min INT use maximum of number minimizers calculated by READ_LENGTH / INT and --max-min [1000] + -D, --distance-limit INT cluster using this 
distance limit [200] + -e, --max-extensions INT extend up to INT clusters [800] + -a, --max-alignments INT align up to INT extensions [8] + -s, --cluster-score FLOAT only extend clusters if they are within INT of the best score [50] + -S, --pad-cluster-score FLOAT also extend clusters within INT of above threshold to get a second-best cluster [20] + -u, --cluster-coverage FLOAT only extend clusters if they are within FLOAT of the best read coverage [0.3] + -v, --extension-score INT only align extensions if their score is within INT of the best score [1] + -w, --extension-set FLOAT only align extension sets if their score is within INT of the best score [20] + -O, --no-dp disable all gapped alignment + -r, --rescue-attempts INT attempt up to INT rescues per read in a pair [15] + -L, --max-fragment-length INT assume that fragment lengths should be smaller than INT when estimating the fragment length distribution [2000] + --exclude-overlapping-min exclude overlapping minimizers + --paired-distance-limit FLOAT cluster pairs of read using a distance limit FLOAT standard deviations greater than the mean [2] + --rescue-subgraph-size FLOAT search for rescued alignments FLOAT standard deviations greater than the mean [4] + --rescue-seed-limit INT attempt rescue with at most INT seeds [100] +long-read/chaining parameters: + --align-from-chains chain up extensions to create alignments, instead of doing each separately + --chaining-cluster-distance INT maximum distance to cluster over before chaining [100] + --precluster-connection-coverage-threshold FLOAT threshold of precluster pair coverage below the base, after which to stop reseeding between preclusters [0.3] + --min-precluster-connections INT minimum number of precluster connections to reseed over [10] + --max-precluster-connections INT maximum number of precluster connections to reseed over [50] + --max-lookback-bases INT maximum distance to look back when chaining [100] + --min-lookback-items INT minimum items to consider coming from when chaining [1] + --lookback-item-hard-cap INT maximum items to consider coming from when chaining [15] + --chain-score-threshold FLOAT only align chains if their score is within this many points of the best score [100] + --min-chains INT ignore score threshold to get this many chains aligned [1] + --chain-min-score INT do not align chains with less than this score [100] + --max-chain-connection INT maximum distance across which to connect seeds when aligning a chain [100] + --max-tail-length INT maximum length of a tail to align before forcing softclipping when aligning a chain [100] + --max-dp-cells INT maximum number of alignment cells to allow in a tail with GSSW [16777216] + +``` + + +## haplotypes + + +``` +Usage: + vg haplotypes [options] -k kmers.kff -g output.gbz graph.gbz + vg haplotypes [options] -H output.hapl graph.gbz + vg haplotypes [options] -i graph.hapl -k kmers.kff -g output.gbz graph.gbz + vg haplotypes [options] -i graph.hapl --vcf-input variants.vcf graph.gbz > output.tsv + vg haplotypes [options] -i graph.hapl -k kmers.kff --extract M:N graph.gbz > output.fa + +Haplotype sampling based on kmer counts. 
+ +Output files: + -g, --gbz-output X write the output GBZ to X + -H, --haplotype-output X write haplotype information to X + +Input files: + -d, --distance-index X use this distance index (default: .dist) + -r, --r-index X use this r-index (default: .ri) + -i, --haplotype-input X use this haplotype information (default: generate) + -k, --kmer-input X use kmer counts from this KFF file (required for --gbz-output) + +Options for generating haplotype information: + --kmer-length N kmer length for building the minimizer index (default: 29) + --window-length N window length for building the minimizer index (default: 11) + --subchain-length N target length (in bp) for subchains (default: 10000) + --linear-structure extend subchains to avoid haplotypes visiting them multiple times + +Options for sampling haplotypes: + --preset X use preset X (default, haploid, diploid) + --coverage N kmer coverage in the KFF file (default: estimate) + --num-haplotypes N generate N haplotypes (default: 4) + sample from N candidates (with --diploid-sampling; default: 32) + --present-discount F discount scores for present kmers by factor F (default: 0.9) + --het-adjustment F adjust scores for heterozygous kmers by F (default: 0.05) + --absent-score F score absent kmers -F/+F (default: 0.8) + --haploid-scoring use a scoring model without heterozygous kmers + --diploid-sampling choose the best pair from the sampled haplotypes + --include-reference include named and reference paths in the output + +Other options: + -v, --verbosity N verbosity level (0 = silent, 1 = basic, 2 = detailed, 3 = debug; default: 0) + -t, --threads N approximate number of threads (default: 8 on this system) + +Developer options: + --validate validate the generated information (may be slow) + --vcf-input X map the variants in VCF file X to subchains + --contig-prefix X a prefix for transforming VCF contig names into GBWT contig names + --extract M:N extract haplotypes in chain M, subchain N in FASTA format + --score-output X write haplotype scores to X + --classify X classify kmers and write output to X + + +``` + + +## ids + + + +Manipulate node ids. + + + + + +``` +usage: vg ids [options] [graph2.vg ...] >new.vg +options: + -c, --compact minimize the space of integers used by the ids + -i, --increment N increase ids by N + -d, --decrement N decrease ids by N + -j, --join make a joint id space for all the graphs that are supplied + by iterating through the supplied graphs and incrementing + their ids to be non-conflicting (modifies original files) + -m, --mapping FILE create an empty node mapping for vg prune + -s, --sort assign new node IDs in (generalized) topological sort order + +``` + + +## index + + +``` +usage: vg index [options] [graph2.vg ...] +Creates an index on the specified graph or graphs. All graphs indexed must +already be in a joint ID space. 
+general options: + -b, --temp-dir DIR use DIR for temporary files + -t, --threads N number of threads to use + -p, --progress show progress +xg options: + -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing + -L, --xg-alts include alt paths in xg +gcsa options: + -g, --gcsa-out FILE output a GCSA2 index to the given file + -f, --mapping FILE use this node mapping in GCSA2 construction + -k, --kmer-size N index kmers of size N in the graph (default 16) + -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default 4) + -Z, --size-limit N limit temporary disk space usage to N gigabytes (default 2048) + -V, --verify-index validate the GCSA2 index using the input kmers (important for testing) +gam indexing options: + -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai +vg in-place indexing options: + --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi +snarl distance index options + -j --dist-name FILE use this file to store a snarl-based distance index + --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000) + if N is 0 then don't store distances, only the snarl tree + --no-nested-distance only store distances along the top-level chain + +``` + + +## map + + +``` +vg: invalid option -- 'h' +usage: vg map [options] -d idxbase -f in1.fq [-f in2.fq] >aln.gam +Align reads to a graph. + +graph/index: + -d, --base-name BASE use BASE.xg and BASE.gcsa as the input index pair + -x, --xg-name FILE use this xg index or graph (defaults to .vg.xg) + -g, --gcsa-name FILE use this GCSA2 index (defaults to .gcsa) + -1, --gbwt-name FILE use this GBWT haplotype index (defaults to .gbwt) +algorithm: + -t, --threads N number of compute threads to use + -k, --min-mem INT minimum MEM length (if 0 estimate via -e) [0] + -e, --mem-chance FLOAT set {-k} such that this fraction of {-k} length hits will by chance [5e-4] + -c, --hit-max N ignore MEMs who have >N hits in our index (0 for no limit) [2048] + -Y, --max-mem INT ignore mems longer than this length (unset if 0) [0] + -r, --reseed-x FLOAT look for internal seeds inside a seed longer than FLOAT*--min-seed [1.5] + -u, --try-up-to INT attempt to align up to the INT best candidate chains of seeds (1/2 for paired) [128] + -l, --try-at-least INT attempt to align at least the INT best candidate chains of seeds [1] + -E, --approx-mq-cap INT weight MQ by suffix tree based estimate when estimate less than FLOAT [0] + --id-mq-weight N scale mapping quality by the alignment score identity to this power [2] + -W, --min-chain INT discard a chain if seeded bases shorter than INT [0] + -C, --drop-chain FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [0.45] + -n, --mq-overlap FLOAT scale MQ by count of alignments with this overlap in the query with the primary [0] + -P, --min-ident FLOAT accept alignment only if the alignment identity is >= FLOAT [0] + -H, --max-target-x N skip cluster subgraphs with length > N*read_length [100] + -w, --band-width INT band width for long read alignment [256] + -O, --band-overlap INT band overlap for long read alignment [{-w}/8] + -J, --band-jump INT the maximum number of bands of insertion we consider in the alignment chain model [128] + -B, --band-multi INT consider this many alignments of each band in banded alignment [16] + -Z, --band-min-mq INT 
treat bands with less than this MQ as unaligned [0] + -I, --fragment STR fragment length distribution specification STR=m:μ:σ:o:d [5000:0:0:0:1] + max, mean, stdev, orientation (1=same, 0=flip), direction (1=forward, 0=backward) + -U, --fixed-frag-model don't learn the pair fragment model online, use {-I} without update + -p, --print-frag-model suppress alignment output and print the fragment model on stdout as per {-I} format + --frag-calc INT update the fragment model every INT perfect pairs [10] + --fragment-x FLOAT calculate max fragment size as frag_mean+frag_sd*FLOAT [10] + --mate-rescues INT attempt up to INT mate rescues per pair [64] + -S, --unpaired-cost INT penalty for an unpaired read pair [17] + --no-patch-aln do not patch banded alignments by locally aligning unaligned regions + --xdrop-alignment use X-drop heuristic (much faster for long-read alignment) + --max-gap-length maximum gap length allowed in each contiguous alignment (for X-drop alignment) [40] +scoring: + -q, --match INT use this match score [1] + -z, --mismatch INT use this mismatch penalty [4] + --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file + -o, --gap-open INT use this gap open penalty [6] + -y, --gap-extend INT use this gap extension penalty [1] + -L, --full-l-bonus INT the full-length alignment bonus [5] + --drop-full-l-bonus remove the full length bonus from the score before sorting and MQ calculation + -a, --hap-exp FLOAT the exponent for haplotype consistency likelihood in alignment score [1] + --recombination-penalty FLOAT use this log recombination penalty for GBWT haplotype scoring [20.7] + -A, --qual-adjust perform base quality adjusted alignments (requires base quality input) +preset: + -m, --alignment-model STR use a preset alignment scoring model, either "short" (default) or "long" (for ONT/PacBio) + "long" is equivalent to `-u 2 -L 63 -q 1 -z 2 -o 2 -y 1 -w 128 -O 32` +input: + -s, --sequence STR align a string to the graph in graph.vg using partial order alignment + -V, --seq-name STR name the sequence using this value (for graph modification with new named paths) + -T, --reads FILE take reads (one per line) from FILE, write alignments to stdout + -b, --hts-input FILE align reads from htslib-compatible FILE (BAM/CRAM/SAM) stdin (-), alignments to stdout + -G, --gam-input FILE realign GAM input + -f, --fastq FILE input fastq or (2-line format) fasta, possibly compressed, two are allowed, one for each mate + -F, --fasta FILE align the sequences in a FASTA file that may have multiple lines per reference sequence + -i, --interleaved fastq or GAM is interleaved paired-ended + -N, --sample NAME for --reads input, add this sample + -R, --read-group NAME for --reads input, add this read group +output: + -j, --output-json output JSON rather than an alignment stream (helpful for debugging) + -%, --gaf output alignments in GAF format + --surject-to TYPE surject the output into the graph's paths, writing TYPE := bam |sam | cram + --ref-paths FILE ordered list of paths in the graph, one per line or HTSlib .dict, for HTSLib @SQ headers + --buffer-size INT buffer this many alignments together before outputting in GAM [512] + -X, --compare realign GAM input (-G), writing alignment with "correct" field set to overlap with input + -v, --refpos-table for efficient testing output a table of name, chr, pos, mq, score + -K, --keep-secondary produce alignments for secondary input alignments in addition to primary ones + -M, --max-multimaps INT produce up to INT alignments for each read 
[1] + -Q, --mq-max INT cap the mapping quality at INT [60] + --exclude-unaligned exclude reads with no alignment + -D, --debug print debugging information about alignment to stderr + --log-time print runtime to stderr + +``` + + +## minimizer + + +``` +usage: vg minimizer [options] -d graph.dist -o graph.min graph + +Builds a (w, k)-minimizer index or a (k, s)-syncmer index of the threads in the GBWT +index. The graph can be any HandleGraph, which will be transformed into a GBWTGraph. +The transformation can be avoided by providing a GBWTGraph or a GBZ graph. + +Required options: + -d, --distance-index X annotate the hits with positions in this distance index + -o, --output-name X store the index to file X + +Minimizer options: + -k, --kmer-length N length of the kmers in the index (default 29, max 31) + -w, --window-length N choose the minimizer from a window of N kmers (default 11) + -c, --closed-syncmers index closed syncmers instead of minimizers + -s, --smer-length N use smers of length N in closed syncmers (default 18) + +Weighted minimizers: + -W, --weighted use weighted minimizers + --threshold N downweight kmers with more than N hits (default 500) + --iterations N downweight frequent kmers by N iterations (default 3) + --fast-counting use the fast kmer counting algorithm (default) + --save-memory use the space-efficient kmer counting algorithm + --hash-table N use 2^N-cell hash tables for kmer counting (default: guess) + +Other options: + -l, --load-index X load the index from file X and insert the new kmers into it + (overrides minimizer / weighted minimizer options) + -g, --gbwt-name X use the GBWT index in file X (required with a non-GBZ graph) + -p, --progress show progress information + -t, --threads N use N threads for index construction (default 8) + (using more than 16 threads rarely helps) + --no-dist build the index without distance index annotations (not recommended) + + +``` + + +## mod + + +``` +usage: vg mod [options] >[mod.vg] +Modifies graph, outputs modified on stdout. + +options: + -P, --label-paths don't edit with -i alignments, just use them for labeling the graph + -c, --compact-ids should we sort and compact the id space? 
(default false) + -b, --break-cycles use an approximate topological sort to break cycles in the graph + -n, --normalize normalize the graph so that edges are always non-redundant + (nodes have unique starting and ending bases relative to neighbors, + and edges that do not introduce new paths are removed and neighboring + nodes are merged) + -U, --until-normal N iterate normalization until convergence, or at most N times + -z, --nomerge-pre STR do not let normalize (-n, -U) zip up any pair of nodes that both belong to path with prefix STR + -E, --unreverse-edges flip doubly-reversing edges so that they are represented on the + forward strand of the graph + -s, --simplify remove redundancy from the graph that will not change its path space + -d, --dagify-step N copy strongly connected components of the graph N times, forwarding + edges from old to new copies to convert the graph into a DAG + -w, --dagify-to N copy strongly connected components of the graph forwarding + edges from old to new copies to convert the graph into a DAG + until the shortest path through each SCC is N bases long + -L, --dagify-len-max N stop a dagification step if the unrolling component has this much sequence + -f, --unfold N represent inversions accessible up to N from the forward + component of the graph + -O, --orient-forward orient the nodes in the graph forward + -N, --remove-non-path keep only nodes and edges which are part of paths + -A, --remove-path keep only nodes and edges which are not part of any path + -k, --keep-path NAME keep only nodes and edges in the path + -R, --remove-null removes nodes that have no sequence, forwarding their edges + -g, --subgraph ID gets the subgraph rooted at node ID, multiple allowed + -x, --context N steps the subgraph out by N steps (default: 1) + -p, --prune-complex remove nodes that are reached by paths of --length which + cross more than --edge-max edges + -S, --prune-subgraphs remove subgraphs which are shorter than --length + -l, --length N for pruning complex regions and short subgraphs + -X, --chop N chop nodes in the graph so they are not more than N bp long + -u, --unchop where two nodes are only connected to each other and by one edge + replace the pair with a single node that is the concatenation of their labels + -e, --edge-max N only consider paths which make edge choices at <= this many points + -M, --max-degree N unlink nodes that have edge degree greater than N + -m, --markers join all head and tails nodes to marker nodes + ('###' starts and '$$$' ends) of --length, for debugging + -y, --destroy-node ID remove node with given id + -a, --cactus convert to cactus graph representation + -v, --sample-vcf FILE for a graph with allele paths, compute the sample graph from the given VCF + -G, --sample-graph FILE subset an augmented graph to a sample graph using a Locus file + -t, --threads N for tasks that can be done in parallel, use this many threads + +``` + + +## mpmap + + +``` +usage: vg mpmap [options] -x graph.xg -g index.gcsa [-f reads1.fq [-f reads2.fq] | -G reads.gam] > aln.gamp +Multipath align reads to a graph. 
+ +basic options: +graph/index: + -x, --graph-name FILE graph (required; XG format recommended but other formats are valid, see `vg convert`) + -g, --gcsa-name FILE use this GCSA2/LCP index pair for MEMs (required; both FILE and FILE.lcp, see `vg index`) + -d, --dist-name FILE use this snarl distance index for clustering (recommended, see `vg index`) + -s, --snarls FILE align to alternate paths in these snarls (unnecessary if providing -d, see `vg snarls`) +input: + -f, --fastq FILE input FASTQ (possibly gzipped), can be given twice for paired ends (for stdin use -) + -i, --interleaved input contains interleaved paired ends +algorithm presets: + -n, --nt-type TYPE sequence type preset: 'DNA' for genomic data, 'RNA' for transcriptomic data [RNA] + -l, --read-length TYPE read length preset: 'very-short', 'short', or 'long' (approx. <50bp, 50-500bp, and >500bp) [short] + -e, --error-rate TYPE error rate preset: 'low' or 'high' (approx. PHRED >20 and <20) [low] +output: + -F, --output-fmt TYPE format to output alignments in: 'GAMP for' multipath alignments, 'GAM' or 'GAF' for single-path + alignments, 'SAM', 'BAM', or 'CRAM' for linear reference alignments (may also require -S) [GAMP] + -S, --ref-paths FILE paths in the graph either 1) one per line in a text file, or 2) in an HTSlib .dict, to treat as + reference sequences for HTSlib formats (see -F) [all paths] + -N, --sample NAME add this sample name to output + -R, --read-group NAME add this read group to output + -p, --suppress-progress do not report progress to stderr +computational parameters: + -t, --threads INT number of compute threads to use [all available] + +advanced options: +algorithm: + -X, --not-spliced do not form spliced alignments, even if aligning with --nt-type 'rna' + -M, --max-multimaps INT report (up to) this many mappings per read [10 rna / 1 dna] + -a, --agglomerate-alns combine separate multipath alignments into one (possibly disconnected) alignment + -r, --intron-distr FILE intron length distribution (from scripts/intron_length_distribution.py) + -Q, --mq-max INT cap mapping quality estimates at this much [60] + -b, --frag-sample INT look for this many unambiguous mappings to estimate the fragment length distribution [1000] + -I, --frag-mean FLOAT mean for a pre-determined fragment length distribution (also requires -D) + -D, --frag-stddev FLOAT standard deviation for a pre-determined fragment length distribution (also requires -I) + -G, --gam-input FILE input GAM (for stdin, use -) + -u, --map-attempts INT perform (up to) this many mappings per read (0 for no limit) [24 paired / 64 unpaired] + -c, --hit-max INT use at most this many hits for any match seeds (0 for no limit) [1024 DNA / 100 RNA] +scoring: + -A, --no-qual-adjust do not perform base quality adjusted alignments even when base qualities are available + -q, --match INT use this match score [1] + -z, --mismatch INT use this mismatch penalty [4 low error, 1 high error] + -o, --gap-open INT use this gap open penalty [6 low error, 1 high error] + -y, --gap-extend INT use this gap extension penalty [1] + -L, --full-l-bonus INT add this score to alignments that align each end of the read [mismatch+1 short, 0 long] + -w, --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file (in the order ACGT) + -m, --remove-bonuses remove full length alignment bonuses in reported scores + +``` + + +## pack + + + +Convert alignments to a compact coverage index. 
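A minimal usage sketch (hypothetical file names; only flags documented in this section and in [`vg call`](#call) are used): build a coverage index from aligned reads, then hand it to `vg call` for genotyping.

```sh
# Build a compressed coverage index (.pack) from reads aligned to graph.xg,
# ignoring mappings with MAPQ < 5, then genotype variants with vg call.
vg pack -x graph.xg -g aln.gam -Q 5 -t 8 -o aln.pack
vg call graph.xg -k aln.pack > calls.vcf
```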
+ + + + + +``` +usage: vg pack [options] +options: + -x, --xg FILE use this basis graph (any format accepted, does not have to be xg) + -o, --packs-out FILE write compressed coverage packs to this output file + -i, --packs-in FILE begin by summing coverage packs from each provided FILE + -g, --gam FILE read alignments from this GAM file (could be '-' for stdin) + -a, --gaf FILE read alignments from this GAF file (could be '-' for stdin) + -d, --as-table write table on stdout representing packs + -D, --as-edge-table write table on stdout representing edge coverage + -u, --as-qual-table write table on stdout representing average node mapqs + -e, --with-edits record and write edits rather than only recording graph-matching coverage + -b, --bin-size N number of sequence bases per CSA bin [default: inf] + -n, --node ID write table for only specified node(s) + -N, --node-list FILE a white space or line delimited list of nodes to collect + -Q, --min-mapq N ignore reads with MAPQ < N and positions with base quality < N [default: 0] + -c, --expected-cov N expected coverage. used only for memory tuning [default : 128] + -s, --trim-ends N ignore the first and last N bases of each read + -t, --threads N use N threads (defaults to numCPUs) + +``` + + +## paths + + + +Traverse paths in the graph. + + + + + +``` +usage: vg paths [options] +options: + input: + -x, --xg FILE use the paths and haplotypes in this graph FILE. Supports GBZ haplotypes. + (Also accepts -v, --vg) + -g, --gbwt FILE use the threads in the GBWT index in FILE + (graph also required for most output options; -g takes priority over -x) + output graph (.vg format) + -V, --extract-vg output a path-only graph covering the selected paths + -d, --drop-paths output a graph with the selected paths removed + -r, --retain-paths output a graph with only the selected paths retained + -n, --normalize-paths output a graph where all equivalent paths in a site a merged (using selected paths to snap to if possible) + output path data: + -X, --extract-gam print (as GAM alignments) the stored paths in the graph + -A, --extract-gaf print (as GAF alignments) the stored paths in the graph + -L, --list print (as a list of names, one per line) the path (or thread) names + -E, --lengths print a list of path names (as with -L) but paired with their lengths + -M, --metadata print a table of path names and their metadata + -C, --cyclicity print a list of path names (as with -L) but paired with flag denoting the cyclicity + -F, --extract-fasta print the paths in FASTA format + -c, --coverage print the coverage stats for selected paths (not including cylces) + path selection: + -p, --paths-file FILE select the paths named in a file (one per line) + -Q, --paths-by STR select the paths with the given name prefix + -S, --sample STR select the haplotypes or reference paths for this sample + -a, --variant-paths select the variant paths added by 'vg construct -a' + -G, --generic-paths select the generic, non-reference, non-haplotype paths + -R, --reference-paths select the reference paths + -H, --haplotype-paths select the haplotype paths paths + configuration: + -o, --overlay apply a ReferencePathOverlayHelper to the graph + -t, --threads N number of threads to use [all available]. applies only to snarl finding within -n + +``` + + +## prune + + +``` +usage: vg prune [options] >[output.vg] + +Prunes the complex regions of the graph for GCSA2 indexing. Pruning the graph +removes embedded paths. 
+ +Pruning parameters: + -k, --kmer-length N kmer length used for pruning + defaults: 24 with -P; 24 with -r; 24 with -u + -e, --edge-max N remove the edges on kmers making > N edge choices + defaults: 3 with -P; 3 with -r; 3 with -u + -s, --subgraph-min N remove subgraphs of < N bases + defaults: 33 with -P; 33 with -r; 33 with -u + -M, --max-degree N if N > 0, remove nodes with degree > N before pruning + defaults: 0 with -P; 0 with -r; 0 with -u + +Pruning modes (-P, -r, and -u are mutually exclusive): + -P, --prune simply prune the graph (default) + -r, --restore-paths restore the edges on non-alt paths + -u, --unfold-paths unfold non-alt paths and GBWT threads + -v, --verify-paths verify that the paths exist after pruning + (potentially very slow) + +Unfolding options: + -g, --gbwt-name FILE unfold the threads from this GBWT index + -m, --mapping FILE store the node mapping for duplicates in this file (required with -u) + -a, --append-mapping append to the existing node mapping + +Other options: + -p, --progress show progress + -t, --threads N use N threads (default: 8) + -d, --dry-run determine the validity of the combination of options + + +``` + + +## rna + + +``` + +usage: vg rna [options] graph.[vg|pg|hg|gbz] > splicing_graph.[vg|pg|hg] + +General options: + -t, --threads INT number of compute threads to use [1] + -p, --progress show progress + -h, --help print help message + +Input options: + -n, --transcripts FILE transcript file(s) in gtf/gff format; may repeat + -m, --introns FILE intron file(s) in bed format; may repeat + -y, --feature-type NAME parse only this feature type in the gtf/gff (parses all if empty) [exon] + -s, --transcript-tag NAME use this attribute tag in the gtf/gff file(s) as id [transcript_id] + -l, --haplotypes FILE project transcripts onto haplotypes in GBWT index file + -z, --gbz-format input graph is in GBZ format (contains both a graph and haplotypes (GBWT index)) + +Construction options: + -j, --use-hap-ref use haplotype paths in GBWT index as reference sequences (disables projection) + -e, --proj-embed-paths project transcripts onto embedded haplotype paths + -c, --path-collapse TYPE collapse identical transcript paths across no|haplotype|all paths [haplotype] + -k, --max-node-length INT chop nodes longer than maximum node length (0 disables chopping) [0] + -d, --remove-non-gene remove intergenic and intronic regions (deletes all paths in the graph) + -o, --do-not-sort do not topological sort and compact the graph + -r, --add-ref-paths add reference transcripts as embedded paths in the graph + -a, --add-hap-paths add projected transcripts as embedded paths in the graph + +Output options: + -b, --write-gbwt FILE write pantranscriptome transcript paths as GBWT index file + -v, --write-hap-gbwt FILE write input haplotypes as a GBWT with node IDs matching the output graph + -f, --write-fasta FILE write pantranscriptome transcript sequences as fasta file + -i, --write-info FILE write pantranscriptome transcript info table as tsv file + -q, --out-exclude-ref exclude reference transcripts from pantranscriptome output + -g, --gbwt-bidirectional use bidirectional paths in GBWT index construction + + +``` + + +## sim + + +``` +usage: vg sim [options] +Samples sequences from the xg-indexed graph. 
+ +basic options: + -x, --xg-name FILE use the graph in FILE (required) + -n, --num-reads N simulate N reads or read pairs + -l, --read-length N simulate reads of length N + -r, --progress show progress information +output options: + -a, --align-out write alignments in GAM-format + -J, --json-out write alignments in json + --multi-position annotate alignments with multiple reference positions +simulation parameters: + -F, --fastq FILE match the error profile of NGS reads in FILE, repeat for paired reads (ignores -l,-f) + -I, --interleaved reads in FASTQ (-F) are interleaved read pairs + -s, --random-seed N use this specific seed for the PRNG + -e, --sub-rate FLOAT base substitution rate (default 0.0) + -i, --indel-rate FLOAT indel rate (default 0.0) + -d, --indel-err-prop FLOAT proportion of trained errors from -F that are indels (default 0.01) + -S, --scale-err FLOAT scale trained error probabilities from -F by this much (default 1.0) + -f, --forward-only don't simulate from the reverse strand + -p, --frag-len N make paired end reads with given fragment length N + -v, --frag-std-dev FLOAT use this standard deviation for fragment length estimation + -N, --allow-Ns allow reads to be sampled from the graph with Ns in them + --max-tries N attempt sampling operations up to N times before giving up [100] + -t, --threads number of compute threads (only when using FASTQ with -F) [1] +simulate from paths: + -P, --path PATH simulate from this path (may repeat; cannot also give -T) + -A, --any-path simulate from any path (overrides -P) + -m, --sample-name NAME simulate from this sample (may repeat; requires -g) + -R, --ploidy-regex RULES use the given comma-separated list of colon-delimited REGEX:PLOIDY rules to assign + ploidies to contigs not visited by the selected samples, or to all contigs simulated + from if no samples are used. Unmatched contigs get ploidy 2. 
+ -g, --gbwt-name FILE use samples from this GBWT index + -T, --tx-expr-file FILE simulate from an expression profile formatted as RSEM output (cannot also give -P) + -H, --haplo-tx-file FILE transcript origin info table from vg rna -i (required for -T on haplotype transcripts) + -u, --unsheared sample from unsheared fragments + -E, --path-pos-file FILE output a TSV with sampled position on path of each read (requires -F) + +``` + + +## stats + + +``` +usage: vg stats [options] [] +options: + -z, --size size of graph + -N, --node-count number of nodes in graph + -E, --edge-count number of edges in graph + -l, --length length of sequences in graph + -L, --self-loops number of self-loops + -s, --subgraphs describe subgraphs of graph + -H, --heads list the head nodes of the graph + -T, --tails list the tail nodes of the graph + -e, --nondeterm list the nondeterministic edge sets + -c, --components print the strongly connected components of the graph + -A, --is-acyclic print if the graph is acyclic or not + -n, --node ID consider node with the given id + -d, --to-head show distance to head for each provided node + -t, --to-tail show distance to head for each provided node + -a, --alignments FILE compute stats for reads aligned to the graph + -r, --node-id-range X:Y where X and Y are the smallest and largest node id in the graph, respectively + -o, --overlap PATH for each overlapping path mapping in the graph write a table: + PATH, other_path, rank1, rank2 + multiple allowed; limit comparison to those provided + -O, --overlap-all print overlap table for the cartesian product of paths + -R, --snarls print statistics for each snarl + --snarl-contents print out a table of + -C, --chains print statistics for each chain + -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. Can't detect Protobuf if graph read from stdin + -D, --degree-dist print degree distribution of the graph. + -b, --dist-snarls FILE print the sizes and depths of the snarls in a given distance index. + -p, --threads N number of threads to use [all available] + -v, --verbose output longer reports + +``` + + +## surject + + +``` +usage: vg surject [options] >[proj.cram] +Transforms alignments to be relative to particular paths. 
+ +options: + -x, --xg-name FILE use this graph or xg index (required) + -t, --threads N number of threads to use + -p, --into-path NAME surject into this path or its subpaths (many allowed, default: reference, then non-alt generic) + -F, --into-paths FILE surject into path names listed in HTSlib sequence dictionary or path list FILE + -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads + -M, --multimap include secondary alignments to all overlapping paths instead of just primary + -G, --gaf-input input file is GAF instead of GAM + -m, --gamp-input input file is GAMP instead of GAM + -c, --cram-output write CRAM to stdout + -b, --bam-output write BAM to stdout + -s, --sam-output write SAM to stdout + -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments + -T, --max-tail-len N only align up to N bases of read tails (default: 10000) + -P, --prune-low-cplx prune short and low complexity anchors during realignment + -a, --max-anchors N use no more than N anchors per target path (default: unlimited) + -S, --spliced interpret long deletions against paths as spliced alignments + -A, --qual-adj adjust scoring for base qualities, if they are available + -N, --sample NAME set this sample name for all reads + -R, --read-group NAME set this read group for all reads + -f, --max-frag-len N reads with fragment lengths greater than N will not be marked properly paired in SAM/BAM/CRAM + -L, --list-all-paths annotate SAM records with a list of all attempted re-alignments to paths in SS tag + -C, --compression N level for compression [0-9] + -V, --no-validate skip checking whether alignments plausibly are against the provided graph + -w, --watchdog-timeout N warn when reads take more than the given number of seconds to surject + +``` + + +## view + + + +format conversions for graphs and alignments + + + + + +``` +usage: vg view [options] [ | | | [] ] +options: + -g, --gfa output GFA format (default) + -F, --gfa-in input GFA format, reducing overlaps if they occur + -v, --vg output VG format [DEPRECATED, use vg convert instead] + -V, --vg-in input VG format only + -j, --json output JSON format + -J, --json-in input JSON format + -c, --json-stream streaming conversion of a VG format graph in line delimited JSON format + (this cannot be loaded directly via -J) + -G, --gam output GAM format (vg alignment format: Graph Alignment/Map) + -Z, --translation-in input is a graph translation description + -t, --turtle output RDF/turtle format (can not be loaded by VG) + -T, --turtle-in input turtle format. 
+ -r, --rdf_base_uri set base uri for the RDF output + -a, --align-in input GAM format + -A, --aln-graph GAM add alignments from GAM to the graph + -q, --locus-in input stream is Locus format + -z, --locus-out output stream Locus format + -Q, --loci FILE input is Locus format for use by dot output + -d, --dot output dot format + -S, --simple-dot simplify the dot output; remove node labels, simplify alignments + -u, --noseq-dot shows size information instead of sequence in the dot output + -e, --ascii-labels use labels for paths or superbubbles with char/colors rather than emoji + -Y, --ultra-label label nodes with emoji/colors that correspond to ultrabubbles + -m, --skip-missing skip mappings to nodes not in the graph when drawing alignments + -C, --color color nodes that are not in the reference path (DOT OUTPUT ONLY) + -p, --show-paths show paths in dot output + -w, --walk-paths add labeled edges to represent paths in dot output + -n, --annotate-paths add labels to normal edges to represent paths in dot output + -M, --show-mappings with -p print the mappings in each path in JSON + -I, --invert-ports invert the edge ports in dot so that ne->nw is reversed + -s, --random-seed N use this seed when assigning path symbols in dot output + -b, --bam input BAM or other htslib-parseable alignments + -f, --fastq-in input fastq (output defaults to GAM). Takes two + positional file arguments if paired + -X, --fastq-out output fastq (input defaults to GAM) + -i, --interleaved fastq is interleaved paired-ended + -L, --pileup output VG Pileup format + -l, --pileup-in input VG Pileup format + -B, --distance-in input distance index + -R, --snarl-in input VG Snarl format + -E, --snarl-traversal-in input VG SnarlTraversal format + -K, --multipath-in input VG MultipathAlignment format (GAMP) + -k, --multipath output VG MultipathAlignment format (GAMP) + -D, --expect-duplicates don't warn if encountering the same node or edge multiple times + -x, --extract-tag TAG extract and concatenate messages with the given tag + --verbose explain the file being read with --extract-tag + --threads N for parallel operations use this many threads [1] + +``` + + diff --git a/doc/vgmanmd.desc.md b/doc/vgmanmd.desc.md index 0e094da5f1c..7bb6227a166 100644 --- a/doc/vgmanmd.desc.md +++ b/doc/vgmanmd.desc.md @@ -25,7 +25,7 @@ For more in-depth explanations of tools and workflows, see the [vg wiki page](ht - **Downstream analyses** - [`vg pack`](#pack) convert alignments to a compact coverage index. Used with [vg call](#call) - [`vg call`](#call) call or genotype VCF variants. Uses [vg pack](#pack). [wiki page](https://github.com/vgteam/vg/wiki/SV-Genotyping-and-variant-calling) - - [`vg rna`](#rna) transciptomic analyses. [wiki page](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). Also see [rpvg](https://github.com/jonassibbesen/rpvg) + - [`vg rna`](#rna) construct splicing graphs and pantranscriptomes. [wiki page](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). Also see [rpvg](https://github.com/jonassibbesen/rpvg) - [`vg deconstruct`](#deconstruct) create a VCF from variation in the graph. [wiki page](https://github.com/vgteam/vg/wiki/VCF-export-with-vg-deconstruct) - **Working with read alignments** - [`vg gamsort`](#gamsort) sort a GAM/GAF file or index a sorted GAM file. 
From 7e52da447dd3bc4248a34ae6db5c2f50cb21e7c8 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 28 Nov 2024 15:50:33 +0100 Subject: [PATCH 04/14] Add the man page to the readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 200ecff311c..20e8f959aac 100644 --- a/README.md +++ b/README.md @@ -501,6 +501,8 @@ Most commands allow the streaming of graphs into and out of `vg`. ### Command line interface +See the [man-page](https://github.com/vgteam/vg/blob/docmd/doc/man.md) + A variety of commands are available: - *autoindex*: construct graphs and indexes for other tools from common interchange file formats From af691b46e7cc1e0df6fed2c2ca3aaf7c3e04b284 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 28 Nov 2024 16:05:42 +0100 Subject: [PATCH 05/14] Trim down man page --- doc/man.md | 24 ++++++++++++++---------- doc/vgmanmd.desc.md | 10 ---------- doc/vgmanmd.py | 2 +- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/doc/man.md b/doc/man.md index e4a87c88ca0..d97aa2b4c23 100644 --- a/doc/man.md +++ b/doc/man.md @@ -351,16 +351,6 @@ options: Filter alignments by properties. -`vg filter --tsv-out` can be used to produce a TSV file of user-specified fields from the GAM file. For example, - -`vg filter --tsv-out "name;mapping_quality" ` - -is the equivalent of - -`vg view -aj | jq -r '[.name,.mapping_quality] | @tsv'` - -To find which fields are stored in a GAM file, use [`vg view`](#view) to view the GAM as a JSON file. - @@ -798,6 +788,20 @@ snarl distance index options ``` +## inject + + +``` +usage: vg inject -x graph.xg [options] input.[bam|sam|cram] >output.gam + +options: + -x, --xg-name FILE use this graph or xg index (required, non-XG formats also accepted) + -o, --output-format NAME output the alignments in NAME format (gam / gaf / json) [gam] + -t, --threads N number of threads to use + +``` + + ## map diff --git a/doc/vgmanmd.desc.md b/doc/vgmanmd.desc.md index 7bb6227a166..3b769e9cf85 100644 --- a/doc/vgmanmd.desc.md +++ b/doc/vgmanmd.desc.md @@ -90,13 +90,3 @@ format conversions for graphs and alignments # filter Filter alignments by properties. - -`vg filter --tsv-out` can be used to produce a TSV file of user-specified fields from the GAM file. For example, - -`vg filter --tsv-out "name;mapping_quality" ` - -is the equivalent of - -`vg view -aj | jq -r '[.name,.mapping_quality] | @tsv'` - -To find which fields are stored in a GAM file, use [`vg view`](#view) to view the GAM as a JSON file. 
diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py index 984994e26e7..833aa1b6a04 100644 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -6,7 +6,7 @@ cmds = ['index', 'view', 'autoindex', 'pack', 'giraffe', 'map', 'call', 'mpmap', 'rna', 'chunk', 'stats', 'gbwt', 'paths', 'find', 'filter', 'construct', 'minimizer', 'haplotypes', 'deconstruct', 'convert', - 'gamsort', 'surject', 'mod', 'prune', 'ids', 'sim', 'annotate'] + 'gamsort', 'inject', 'surject', 'mod', 'prune', 'ids', 'sim', 'annotate'] cmds.sort() # parse short descriptions From e5aa032652846f044f242c84791f9cf4ae16233a Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 29 Nov 2024 19:14:40 +0100 Subject: [PATCH 06/14] Make man page a shield pointing to the wiki page --- README.md | 5 +- doc/man.md | 1391 ---------------------------------------------------- 2 files changed, 3 insertions(+), 1393 deletions(-) delete mode 100644 doc/man.md diff --git a/README.md b/README.md index 38912a4cf8d..82ea7562dbf 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ # vg [![Join the chat at https://gitter.im/vgteam/vg](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/vgteam/vg?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Latest Release](https://img.shields.io/github/release/vgteam/vg.svg)](https://github.com/vgteam/vg/releases/latest) -[![Doxygen API Documentation](https://img.shields.io/badge/doxygen-docs-brightgreen.svg)](https://vgteam.github.io/vg/) +[![Doxygen API Documentation](https://img.shields.io/badge/doxygen-docs-firebrick.svg)](https://vgteam.github.io/vg/) +[![vg man page](https://img.shields.io/badge/manpage-seagreen.svg)](https://github.com/vgteam/vg/wiki/vg-manpage) ## variation graph data structures, interchange formats, alignment, genotyping, and variant calling methods @@ -501,7 +502,7 @@ Most commands allow the streaming of graphs into and out of `vg`. ### Command line interface -See the [man-page](https://github.com/vgteam/vg/blob/docmd/doc/man.md) +See the [man-page](https://github.com/vgteam/vg/wiki/vg-manpage) A variety of commands are available: diff --git a/doc/man.md b/doc/man.md deleted file mode 100644 index d97aa2b4c23..00000000000 --- a/doc/man.md +++ /dev/null @@ -1,1391 +0,0 @@ -# vg manpage - -*Automatically made for vg version v1.61.0-36-g64d7e82e0 "Plodio".* - - - -This is a redundant and incomplete list of subcommands of vg, organized by common uses. For a complete list of subcommands, run `vg help`. - -For more in-depth explanations of tools and workflows, see the [vg wiki page](https://github.com/vgteam/vg/wiki) - -- **Graph construction and indexing** - See the [wiki page](https://github.com/vgteam/vg/wiki/Index-Types) for an overview of vg indexes. - - [`vg autoindex`](#autoindex) automatically construct a graph and indexes for a specific workflow (e.g. giraffe, rpvg). [wiki page](https://github.com/vgteam/vg/wiki/Automatic-indexing-for-read-mapping-and-downstream-inference) - - [`vg construct`](#construct) manually construct a graph from a reference and variants. [wiki page](https://github.com/vgteam/vg/wiki/Construction) - - [`vg index`](#index) manually build individual indexes (xg, distance, GCSA, etc). [wiki page](https://github.com/vgteam/vg/wiki/Index-Construction) - - [`vg gbwt`](#gbwt) manually build and manipulate GBWTs and indexes (GBWTgraph, GBZ, r-index). [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) - - [`vg minimizer`](#minimizer) manually build a minimizer index for mapping. - - [`vg haplotypes`](#haplotypes) haplotype sample a graph. 
Recommended for mapping with giraffe. [wiki page](https://github.com/vgteam/vg/wiki/Haplotype-Sampling) -- **Read mapping** - - [`vg giraffe`](#giraffe) fast haplotype-aware short read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Mapping-short-reads-with-Giraffe) - - [`vg mpmap`](#mpmap) splice-aware multipath alignment of short reads. [wiki page](https://github.com/vgteam/vg/wiki/Multipath-alignments-and-vg-mpmap) - - [`vg map`](#map) MEM-based read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Working-with-a-whole-genome-variation-graph) -- **Downstream analyses** - - [`vg pack`](#pack) convert alignments to a compact coverage index. Used with [vg call](#call) - - [`vg call`](#call) call or genotype VCF variants. Uses [vg pack](#pack). [wiki page](https://github.com/vgteam/vg/wiki/SV-Genotyping-and-variant-calling) - - [`vg rna`](#rna) construct splicing graphs and pantranscriptomes. [wiki page](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). Also see [rpvg](https://github.com/jonassibbesen/rpvg) - - [`vg deconstruct`](#deconstruct) create a VCF from variation in the graph. [wiki page](https://github.com/vgteam/vg/wiki/VCF-export-with-vg-deconstruct) -- **Working with read alignments** - - [`vg gamsort`](#gamsort) sort a GAM/GAF file or index a sorted GAM file. - - [`vg filter`](#filter) filter alignments by properties. - - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). - - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). - - [`vg sim`](#sim) simulate reads from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Simulating-reads-with-vg-sim) -- **Graph and read statistics** - - [`vg stats`](#stats) get stats about the graph. - - [`vg paths`](#paths) get stats about the paths. [wiki page](https://github.com/vgteam/vg/wiki/Path-Metadata-Model) - - [`vg gbwt`](#gbwt) get stats about a GBWT. - - [`vg filter`](#filter) get stats about alignments (use `--tsv-out`). -- **Manipulating a graph** - - [`vg mod`](#mod) filter, transform, and edit the graph. - - [`vg prune`](#prune) prune the graph for GCSA2 indexing. - - [`vg ids`](#ids) manipulate graph node ids. - - [`vg paths`](#paths) manipulate paths in a graph. - - [`vg gbwt`](#gbwt) manipulate GBWTs and associated indexes. [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) - - [`vg annotate`](#annotate) annotate a graph or alignments. -- **Conversion between formats** - - [`vg convert`](#convert) convert between handle graph formats and GFA, and between alignment formats. - - [`vg view`](#view) convert between non-handle graph formats and alignment formats (dot, json, turtle...). - - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). - - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). - - [`vg paths`](#paths) extract a fasta from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Extracting-a-FASTA-from-a-Graph) -- **Subgraph extraction** - - [`vg chunk`](#chunk) split a graph and/or alignment into smaller chunks. - - [`vg find`](#find) use an index to find nodes, edges, kmers, paths, or positions. - - - - - -## annotate - - - -Annotate alignments with graphs and graphs with alignments. 
- - - - - -``` -usage: vg annotate [options] >output.{gam,vg,tsv} -graph annotation options: - -x, --xg-name FILE xg index or graph to annotate (required) - -b, --bed-name FILE a BED file to convert to GAM. May repeat. - -f, --gff-name FILE a GFF3 file to convert to GAM. May repeat. - -g, --ggff output at GGFF subgraph annotation file instead of GAM (requires -s) - -F, --gaf-output output in GAF format rather than GAM - -s, --snarls FILE file containing snarls to expand GFF intervals into -alignment annotation options: - -a, --gam FILE file of Alignments to annotate (required) - -x, --xg-name FILE xg index of the graph against which the Alignments are aligned (required) - -p, --positions annotate alignments with reference positions - -m, --multi-position annotate alignments with multiple reference positions - -l, --search-limit N when annotating with positions, search this far for paths (default: read length) - -b, --bed-name FILE annotate alignments with overlapping region names from this BED. May repeat. - -n, --novelty output TSV table with header describing how much of each Alignment is novel - -t, --threads use the specified number of threads - -``` - - -## autoindex - - - -Mapping tool-oriented index construction from interchange formats. - - - - - -``` -usage: vg autoindex [options] -options: - output: - -p, --prefix PREFIX prefix to use for all output (default: index) - -w, --workflow NAME workflow to produce indexes for, can be provided multiple - times. options: map, mpmap, rpvg, giraffe (default: map) - input data: - -r, --ref-fasta FILE FASTA file containing the reference sequence (may repeat) - -v, --vcf FILE VCF file with sequence names matching -r (may repeat) - -i, --ins-fasta FILE FASTA file with sequences of INS variants from -v - -g, --gfa FILE GFA file to make a graph from - -x, --tx-gff FILE GTF/GFF file with transcript annotations (may repeat) - -H, --hap-tx-gff FILE GTF/GFF file with transcript annotations of a named haplotype (may repeat) - configuration: - -f, --gff-feature STR GTF/GFF feature type (col. 3) to add to graph (default: exon) - -a, --gff-tx-tag STR GTF/GFF tag (in col. 
9) for transcript ID (default: transcript_id) - logging and computation: - -T, --tmp-dir DIR temporary directory to use for intermediate files - -M, --target-mem MEM target max memory usage (not exact, formatted INT[kMG]) - (default: 1/2 of available) - -t, --threads NUM number of threads (default: all available) - -V, --verbosity NUM log to stderr (0 = none, 1 = basic, 2 = debug; default 1) - -h, --help print this help message to stderr and exit - -``` - - -## call - - -``` -usage: vg call [options] > output.vcf -Call variants or genotype known variants - -support calling options: - -k, --pack FILE Supports created from vg pack for given input graph - -m, --min-support M,N Minimum allele support (M) and minimum site support (N) for call [default = 2,4] - -e, --baseline-error X,Y Baseline error rates for Poisson model for small (X) and large (Y) variants [default= 0.005,0.01] - -B, --bias-mode Use old ratio-based genotyping algorithm as opposed to porbablistic model - -b, --het-bias M,N Homozygous alt/ref allele must have >= M/N times more support than the next best allele [default = 6,6] -GAF options: - -G, --gaf Output GAF genotypes instead of VCF - -T, --traversals Output all candidate traversals in GAF without doing any genotyping - -M, --trav-padding N Extend each flank of traversals (from -T) with reference path by N bases if possible -general options: - -v, --vcf FILE VCF file to genotype (must have been used to construct input graph with -a) - -a, --genotype-snarls Genotype every snarl, including reference calls (use to compare multiple samples) - -A, --all-snarls Genotype all snarls, including nested child snarls (like deconstruct -a) - -c, --min-length N Genotype only snarls with at least one traversal of length >= N - -C, --max-length N Genotype only snarls where all traversals have length <= N - -f, --ref-fasta FILE Reference fasta (required if VCF contains symbolic deletions or inversions) - -i, --ins-fasta FILE Insertions fasta (required if VCF contains symbolic insertions) - -s, --sample NAME Sample name [default=SAMPLE] - -r, --snarls FILE Snarls (from vg snarls) to avoid recomputing. - -g, --gbwt FILE Only call genotypes that are present in given GBWT index. - -z, --gbz Only call genotypes that are present in GBZ index (applies only if input graph is GBZ). - -N, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names in output - -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output - -p, --ref-path NAME Reference path to call on (multipile allowed. defaults to all paths) - -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p) - -o, --ref-offset N Offset in reference path (multiple allowed, 1 per path) - -l, --ref-length N Override length of reference in the contig field of output VCF - -d, --ploidy N Ploidy of sample. Only 1 and 2 supported. (default: 2) - -R, --ploidy-regex RULES use the given comma-separated list of colon-delimited REGEX:PLOIDY rules to assign - ploidies to contigs not visited by the selected samples, or to all contigs simulated - from if no samples are used. Unmatched contigs get ploidy 2 (or that from -d). 
- -n, --nested Activate nested calling mode (experimental) - -I, --chains Call chains instead of snarls (experimental) - --progress Show progress - -t, --threads N number of threads to use - -``` - - -## chunk - - -``` -usage: vg chunk [options] > [chunk.vg] -Splits a graph and/or alignment into smaller chunks - -Graph chunks are saved to .vg files, read chunks are saved to .gam files, and haplotype annotations are -saved to .annotate.txt files, of the form ----.. -The BASENAME is specified with -b and defaults to "./chunks". -For a single-range chunk (-p or -r), the graph data is sent to standard output instead of a file. - -options: - -x, --xg-name FILE use this graph or xg index to chunk subgraphs - -G, --gbwt-name FILE use this GBWT haplotype index for haplotype extraction (for -T) - -a, --gam-name FILE chunk this gam file instead of the graph (multiple allowed) - -g, --gam-and-graph when used in combination with -a, both gam and graph will be chunked - -F, --in-gaf input alignment is a sorted bgzipped GAF -path chunking: - -p, --path TARGET write the chunk in the specified (0-based inclusive, multiple allowed) - path range TARGET=path[:pos1[-pos2]] to standard output - -P, --path-list FILE write chunks for all path regions in (line - separated file). format - for each as in -p (all paths chunked unless otherwise specified) - -e, --input-bed FILE write chunks for all (0-based end-exclusive) bed regions - -S, --snarls FILE write given path-range(s) and all snarls fully contained in them, as alternative to -c -id range chunking: - -r, --node-range N:M write the chunk for the specified node range to standard output - -R, --node-ranges FILE write the chunk for each node range in (newline or whitespace separated) file - -n, --n-chunks N generate this many id-range chunks, which are determined using the xg index -simple gam chunking: - -m, --gam-split-size N split gam (specified with -a, sort/index not required) up into chunks with at most N reads each -component chunking: - -C, --components create a chunk for each connected component. if a targets given with (-p, -P, -r, -R), limit to components containing them - -M, --path-components create a chunk for each path in the graph's connected component -general: - -s, --chunk-size N create chunks spanning N bases (or nodes with -r/-R) for all input regions. - -o, --overlap N overlap between chunks when using -s [0] - -E, --output-bed FILE write all created chunks to a bed file - -b, --prefix BASENAME write output chunk files with the given base name. Files for chunk i will - be named: ----. [./chunk] - -c, --context-steps N expand the context of the chunk this many node steps [1] - -l, --context-length N expand the context of the chunk by this many bp [0] - -T, --trace trace haplotype threads in chunks (and only expand forward from input coordinates). - Produces a .annotate.txt file with haplotype frequencies for each chunk. - --no-embedded-haplotypes Don't load haplotypes from the graph. It is possible to -T without any haplotypes available. - -f, --fully-contained only return GAM alignments that are fully contained within chunk - -O, --output-fmt Specify output format (vg, pg, hg, gfa). 
[pg (vg with -T)] - -t, --threads N for tasks that can be done in parallel, use this many threads [1] - -h, --help - -``` - - -## construct - - -``` -usage: vg construct [options] >new.vg -options: -construct from a reference and variant calls: - -r, --reference FILE input FASTA reference (may repeat) - -v, --vcf FILE input VCF (may repeat) - -n, --rename V=F match contig V in the VCFs to contig F in the FASTAs (may repeat) - -a, --alt-paths save paths for alts of variants by SHA1 hash - -A, --alt-paths-plain save paths for alts of variants by variant ID if possible, otherwise SHA1 - (IDs must be unique across all input VCFs) - -R, --region REGION specify a VCF contig name or 1-based inclusive region (may repeat, if on different contigs) - -C, --region-is-chrom don't attempt to parse the regions (use when the reference - sequence name could be inadvertently parsed as a region) - -z, --region-size N variants per region to parallelize (default: 1024) - -t, --threads N use N threads to construct graph (defaults to numCPUs) - -S, --handle-sv include structural variants in construction of graph. - -I, --insertions FILE a FASTA file containing insertion sequences - (referred to in VCF) to add to graph. - -f, --flat-alts N don't chop up alternate alleles from input VCF - -l, --parse-max N don't chop up alternate alleles from input VCF longer than N (default: 100) - -i, --no-trim-indels don't remove the 1bp reference base from alt alleles of indels. - -N, --in-memory construct the entire graph in memory before outputting it. -construct from a multiple sequence alignment: - -M, --msa FILE input multiple sequence alignment - -F, --msa-format format of the MSA file (options: fasta, clustal; default fasta) - -d, --drop-msa-paths don't add paths for the MSA sequences into the graph -shared construction options: - -m, --node-max N limit the maximum allowable node sequence size (default: 32) - nodes greater than this threshold will be divided - Note: nodes larger than ~1024 bp can't be GCSA2-indexed - -p, --progress show progress - -``` - - -## convert - - - -Convert graphs between handle-graph compliant formats as well as GFA. - - - - - -``` -usage: vg convert [options] -input options: - -g, --gfa-in input in GFA format - -r, --in-rgfa-rank N import rgfa tags with rank <= N as paths [default=0] - -b, --gbwt-in FILE input graph is a GBWTGraph using the GBWT in FILE - --ref-sample STR change haplotypes for this sample to reference paths (may repeat) -gfa input options (use with -g): - -T, --gfa-trans FILE write gfa id conversions to FILE -output options: - -v, --vg-out output in VG's original Protobuf format [DEPRECATED: use -p instead]. - -a, --hash-out output in HashGraph format - -p, --packed-out output in PackedGraph format [default] - -x, --xg-out output in XG format - -f, --gfa-out output in GFA format - -H, --drop-haplotypes do not include haplotype paths in the output - (useful with GBWTGraph / GBZ inputs) -gfa output options (use with -f): - -P, --rgfa-path STR write given path as rGFA tags instead of lines - (multiple allowed, only rank-0 supported) - -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines - (multiple allowed, only rank-0 supported) - -B, --rgfa-pline paths written as rGFA tags also written as lines - -W, --no-wline Write all paths as GFA P-lines instead of W-lines. - Allows handling multiple phase blocks and subranges used together. - --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. 
- Not compatible with other GFA output options or non-GBWT graphs. - --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, - but can't preserve original GFA coordinates. - --no-translation When using the GBWTGraph algorithm, convert the graph directly to GFA. - Do not use the translation to preserve original coordinates. -alignment options: - -G, --gam-to-gaf FILE convert GAM FILE to GAF - -F, --gaf-to-gam FILE convert GAF FILE to GAM -general options: - -t, --threads N use N threads (defaults to numCPUs) - -``` - - -## deconstruct - - -``` -usage: vg deconstruct [options] [-p|-P] -Outputs VCF records for Snarls present in a graph (relative to a chosen reference path). -options: - -p, --path NAME A reference path to deconstruct against (multiple allowed). - -P, --path-prefix NAME All paths [excluding GBWT threads / non-reference GBZ paths] beginning with NAME used as reference (multiple allowed). - Other non-ref paths not considered as samples. - -r, --snarls FILE Snarls file (from vg snarls) to avoid recomputing. - -g, --gbwt FILE consider alt traversals that correspond to GBWT haplotypes in FILE (not needed for GBZ graph input). - -T, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names and AT fields in output - -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output - -a, --all-snarls Process all snarls, including nested snarls (by default only top-level snarls reported). - -c, --context-jaccard N Set context mapping size used to disambiguate alleles at sites with multiple reference traversals (default: 10000). - -u, --untangle-travs Use context mapping to determine the reference-relative positions of each step in allele traversals (AP INFO field). - -K, --keep-conflicted Retain conflicted genotypes in output. - -S, --strict-conflicts Drop genotypes when we have more than one haplotype for any given phase (set by default when using GBWT input). - -C, --contig-only-ref Only use the CONTIG name (and not SAMPLE#CONTIG#HAPLOTYPE etc) for the reference if possible (ie there is only one reference sample). - -L, --cluster F Cluster traversals whose (handle) Jaccard coefficient is >= F together (default: 1.0) [experimental] - -n, --nested Write a nested VCF, including special tags. [experimental] - -R, --star-allele Use *-alleles to denote alleles that span but do not cross the site. Only works with -n - -t, --threads N Use N threads - -v, --verbose Print some status messages - - -``` - - -## filter - - - -Filter alignments by properties. - - - - -``` -vg: invalid option -- 'h' -usage: vg filter [options] > out.gam -Filter alignments by properties. 
- -options: - -M, --input-mp-alns input is multipath alignments (GAMP) rather than GAM - -n, --name-prefix NAME keep only reads with this prefix in their names [default=''] - -N, --name-prefixes FILE keep reads with names with one of many prefixes, one per nonempty line - -e, --exact-name match read names exactly instead of by prefix - -a, --subsequence NAME keep reads that contain this subsequence - -A, --subsequences FILE keep reads that contain one of these subsequences, one per nonempty line - -p, --proper-pairs keep reads that are annotated as being properly paired - -P, --only-mapped keep reads that are mapped - -X, --exclude-contig REGEX drop reads with refpos annotations on contigs matching the given regex (may repeat) - -F, --exclude-feature NAME drop reads with the given feature in the "features" annotation (may repeat) - -s, --min-secondary N minimum score to keep secondary alignment - -r, --min-primary N minimum score to keep primary alignment - -O, --rescore re-score reads using default parameters and only alignment information - -f, --frac-score normalize score based on length - -u, --substitutions use substitution count instead of score - -o, --max-overhang N filter reads whose alignments begin or end with an insert > N [default=99999] - -m, --min-end-matches N filter reads that don't begin with at least N matches on each end - -S, --drop-split remove split reads taking nonexistent edges - -x, --xg-name FILE use this xg index or graph (required for -S and -D) - -v, --verbose print out statistics on numbers of reads filtered by what. - -V, --no-output print out statistics (as above) but do not write out filtered GAM. - -T, --tsv-out FIELD[;FIELD] do not write filtered gam but a tsv of the given fields - -q, --min-mapq N filter alignments with mapping quality < N - -E, --repeat-ends N filter reads with tandem repeat (motif size <= 2N, spanning >= N bases) at either end - -D, --defray-ends N clip back the ends of reads that are ambiguously aligned, up to N bases - -C, --defray-count N stop defraying after N nodes visited (used to keep runtime in check) [default=99999] - -d, --downsample S.P filter out all but the given portion 0.P of the reads. S may be an integer seed as in SAMtools - -i, --interleaved assume interleaved input. both ends will be filtered out if either fails filter - -I, --interleaved-all assume interleaved input. both ends will be filtered out if *both* fail filters - -b, --min-base-quality Q:F filter reads with where fewer than fraction F bases have base quality >= PHRED score Q. - -B, --annotation K[:V] keep reads if the annotation is present. If a value is given, keep reads if the values are equal - similar to running jq 'select(.annotation.K==V)' on the json - -c, --correctly-mapped keep only reads that are marked as correctly-mapped - -U, --complement apply the complement of the filter implied by the other arguments. - -t, --threads N number of threads [1] - -``` - - -## find - - - -Use an index to find nodes, edges, kmers, paths, or positions. 
- - - - - -``` -usage: vg find [options] >sub.vg -options: -graph features: - -x, --xg-name FILE use this xg index or graph (instead of rocksdb db) - -n, --node ID find node(s), return 1-hop context as graph - -N, --node-list FILE a white space or line delimited list of nodes to collect - --mapping FILE also include nodes that map to the selected node ids - -e, --edges-end ID return edges on end of node with ID - -s, --edges-start ID return edges on start of node with ID - -c, --context STEPS expand the context of the subgraph this many steps - -L, --use-length treat STEPS in -c or M in -r as a length in bases - -P, --position-in PATH find the position of the node (specified by -n) in the given path - -I, --list-paths write out the path names in the index - -r, --node-range N:M get nodes from N to M - -G, --gam GAM accumulate the graph touched by the alignments in the GAM - --connecting-start POS find the graph connecting from POS (node ID, + or -, node offset) to --connecting-end - --connecting-end POS find the graph connecting to POS (node ID, + or -, node offset) from --connecting-start - --connecting-range INT traverse up to INT bases when going from --connecting-start to --connecting-end (default: 100) -subgraphs by path range: - -p, --path TARGET find the node(s) in the specified path range(s) TARGET=path[:pos1[-pos2]] - -R, --path-bed FILE read our targets from the given BED FILE - -E, --path-dag with -p or -R, gets any node in the partial order from pos1 to pos2, assumes id sorted DAG - -W, --save-to PREFIX instead of writing target subgraphs to stdout, - write one per given target to a separate file named PREFIX[path]:[start]-[end].vg - -K, --subgraph-k K instead of graphs, write kmers from the subgraphs - -H, --gbwt FILE when enumerating kmers from subgraphs, determine their frequencies in this GBWT haplotype index -alignments: - -l, --sorted-gam FILE use this sorted, indexed GAM file - -F, --sorted-gaf FILE use this sorted, indexed GAF file - -o, --alns-on N:M write alignments which align to any of the nodes between N and M (inclusive) - -A, --to-graph VG get alignments to the provided subgraph -sequences: - -g, --gcsa FILE use this GCSA2 index of the sequence space of the graph (required for sequence queries) - -S, --sequence STR search for sequence STR using - -M, --mems STR describe the super-maximal exact matches of the STR (gcsa2) in JSON - -B, --reseed-length N find non-super-maximal MEMs inside SMEMs of length at least N - -f, --fast-reseed use fast SMEM reseeding algorithm - -Y, --max-mem N the maximum length of the MEM (default: GCSA2 order) - -Z, --min-mem N the minimum length of the MEM (default: 1) - -D, --distance return distance on path between pair of nodes (-n). if -P not used, best path chosen heurstically - -Q, --paths-named S return all paths whose names are prefixed with S (multiple allowed) - -``` - - -## gamsort - - -``` -gamsort: sort a GAM/GAF file, or index a sorted GAM file -Usage: gamsort [Options] gamfile -Options: - -i / --index FILE produce an index of the sorted GAM file - -d / --dumb-sort use naive sorting algorithm (no tmp files, faster for small GAMs) - -p / --progress Show progress. - -G / --gaf-input Input is a GAF file. - -c / --chunk-size Number of reads per chunk when sorting GAFs. - -t / --threads Use the specified number of threads. - - -``` - - -## gbwt - - -``` -usage: vg gbwt [options] [args] - -Manipulate GBWTs. Input GBWTs are loaded from input args or built in earlier steps. 
-The input graph is provided with one of -x, -G, or -Z - -General options: - -x, --xg-name FILE read the graph from FILE - -o, --output FILE write output GBWT to FILE - -d, --temp-dir DIR use directory DIR for temporary files - -p, --progress show progress and statistics - -GBWT construction parameters (for steps 1 and 4): - --buffer-size N GBWT construction buffer size in millions of nodes (default 100) - --id-interval N store path ids at one out of N positions (default 1024) - -Multithreading: - --num-jobs N use at most N parallel build jobs (for -v, -G, -A, -l, -P; default 4) - --num-threads N use N parallel search threads (for -b and -r; default 8) - -Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }): - -v, --vcf-input index the haplotypes in the VCF files specified in input args in parallel - (inputs must be over different contigs; requires -x, implies -f) - (does not store graph contigs in the GBWT) - --preset X use preset X (available: 1000gp) - --inputs-as-jobs create one build job for each input instead of using first-fit heuristic - --parse-only store the VCF parses without building GBWTs - (use -o for the file name prefix; skips subsequent steps) - --ignore-missing do not warn when variants are missing from the graph - --actual-phasing do not interpret unphased homozygous genotypes as phased - --force-phasing replace unphased genotypes with randomly phased ones - --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved - instead of creating a phase break - --batch-size N index the haplotypes in batches of N samples (default 200) - --sample-range X-Y index samples X to Y (inclusive, 0-based) - --rename V=P VCF contig V matches path P in the graph (may repeat) - --vcf-variants variants in the graph use VCF contig names instead of path names - --vcf-region C:X-Y restrict VCF contig C to coordinates X to Y (inclusive, 1-based; may repeat) - --exclude-sample X do not index the sample with name X (faster than -R; may repeat) - -G, --gfa-input index the walks or paths in the GFA file (one input arg) - --max-node N chop long segments into nodes of at most N bp (default 1024, use 0 to disable) - --path-regex X parse metadata as haplotypes from path names using regex X instead of vg-parser-compatible rules - --path-fields X parse metadata as haplotypes, mapping regex submatches to these fields instead of using vg-parser-compatible rules - --translation FILE write the segment to node translation table to FILE - -Z, --gbz-input extract GBWT and GBWTGraph from GBZ input (one input arg) - --translation FILE write the segment to node translation table to FILE - -I, --gg-in FILE load GBWTGraph from FILE and GBWT from input (one input arg) - -E, --index-paths index the embedded non-alt paths in the graph (requires -x, no input args) - -A, --alignment-input index the alignments in the GAF files specified in input args (requires -x) - --gam-format the input files are in GAM format instead of GAF format - -Step 2: Merge multiple input GBWTs (requires -o): - -m, --merge use the insertion algorithm - -f, --fast fast merging algorithm (node ids must not overlap) - -b, --parallel use the parallel algorithm - --chunk-size N search in chunks of N sequences (default 1) - --pos-buffer N use N MiB position buffers for each search thread (default 64) - --thread-buffer N use N MiB thread buffers for each search thread (default 256) - --merge-buffers N merge 2^N thread buffers into one file per merge job (default 6) - --merge-jobs N run N parallel merge 
jobs (default 4) - -Step 3: Alter GBWT (requires -o and one input GBWT): - -R, --remove-sample X remove the sample with name X from the index (may repeat) - --set-tag K=V set a GBWT tag (may repeat) - --set-reference X set sample X as the reference (may repeat) - -Step 4: Path cover GBWT construction (requires an input graph, -o, and one of { -a, -l, -P }): - -a, --augment-gbwt add a path cover of missing components (one input GBWT) - -l, --local-haplotypes sample local haplotypes (one input GBWT) - -P, --path-cover build a greedy path cover (no input GBWTs) - -n, --num-paths N find N paths per component (default 64 for -l, 16 otherwise) - -k, --context-length N use N-node contexts (default 4) - --pass-paths include named graph paths in local haplotype or greedy path cover GBWT - -Step 5: GBWTGraph construction (requires an input graph and one input GBWT): - -g, --graph-name FILE build GBWTGraph and store it in FILE - --gbz-format serialize both GBWT and GBWTGraph in GBZ format (makes -o unnecessary) - -Step 6: R-index construction (one input GBWT): - -r, --r-index FILE build an r-index and store it in FILE - -Step 7: Metadata (one input GBWT): - -M, --metadata print basic metadata - -C, --contigs print the number of contigs - -H, --haplotypes print the number of haplotypes - -S, --samples print the number of samples - -L, --list-names list contig/sample names (use with -C or -S) - -T, --path-names list path names - --tags list GBWT tags - -Step 8: Paths (one input GBWT): - -c, --count-paths print the number of paths - -e, --extract FILE extract paths in SDSL format to FILE - - -``` - - -## giraffe - - -``` -usage: - vg giraffe -Z graph.gbz [-d graph.dist -m graph.min] [other options] > output.gam - vg giraffe -Z graph.gbz --haplotype-name graph.hapl --kff-name sample.kff [other options] > output.gam - -Fast haplotype-aware short read mapper. 
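# Illustrative sketch using only `vg gbwt` flags listed above: import a GFA and
# serialize the GBWT plus GBWTGraph as a single GBZ, then print basic metadata.
# File names are hypothetical placeholders.
vg gbwt -p -G graph.gfa --gbz-format -g graph.gbz
vg gbwt -M graph.gbwt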
- -basic options: - -Z, --gbz-name FILE map to this GBZ graph - -d, --dist-name FILE cluster using this distance index - -m, --minimizer-name FILE use this minimizer index - -p, --progress show progress - -t, --threads INT number of mapping threads to use - -b, --parameter-preset NAME set computational parameters (fast / default) [default] - -h, --help print full help with all available options -input options: - -G, --gam-in FILE read and realign GAM-format reads from FILE - -f, --fastq-in FILE read and align FASTQ-format reads from FILE (two are allowed, one for each mate) - -i, --interleaved GAM/FASTQ input is interleaved pairs, for paired-end alignment -haplotype sampling: - --haplotype-name FILE sample from haplotype information in FILE - --kff-name FILE sample according to kmer counts in FILE - --index-basename STR name prefix for generated graph/index files (default: from graph name) -alternate graphs: - -x, --xg-name FILE map to this graph (if no -Z / -g), or use this graph for HTSLib output - -g, --graph-name FILE map to this GBWTGraph (if no -Z) - -H, --gbwt-name FILE use this GBWT index (when mapping to -x / -g) -output options: - -N, --sample NAME add this sample name - -R, --read-group NAME add this read group - -o, --output-format NAME output the alignments in NAME format (gam / gaf / json / tsv / SAM / BAM / CRAM) [gam] - --ref-paths FILE ordered list of paths in the graph, one per line or HTSlib .dict, for HTSLib @SQ headers - --named-coordinates produce GAM/GAF outputs in named-segment (GFA) space - -P, --prune-low-cplx prune short and low complexity anchors during linear format realignment - -n, --discard discard all output alignments (for profiling) - --output-basename NAME write output to a GAM file beginning with the given prefix for each setting combination - --report-name NAME write a TSV of output file and mapping speed to the given file - --show-work log how the mapper comes to its conclusions about mapping locations -Giraffe parameters: - -A, --rescue-algorithm NAME use algorithm NAME for rescue (none / dozeu / gssw) [dozeu] - --fragment-mean FLOAT force the fragment length distribution to have this mean (requires --fragment-stdev) - --fragment-stdev FLOAT force the fragment length distribution to have this standard deviation (requires --fragment-mean) - --track-provenance track how internal intermediate alignment candidates were arrived at - --track-correctness track if internal intermediate alignment candidates are correct (implies --track-provenance) - -B, --batch-size INT number of reads or pairs per batch to distribute to threads [512] -program options: - --watchdog-timeout INT complain after INT seconds working on a read or read pair [10] -scoring options: - --match INT use this match score [1] - --mismatch INT use this mismatch penalty [4] - --gap-open INT use this gap open penalty [6] - --gap-extend INT use this gap extension penalty [1] - --full-l-bonus INT the full-length alignment bonus [5] -result options: - -M, --max-multimaps INT produce up to INT alignments for each read [1] -computational parameters: - -c, --hit-cap INT use all minimizers with at most INT hits [10] - -C, --hard-hit-cap INT ignore all minimizers with more than INT hits [500] - -F, --score-fraction FLOAT select minimizers between hit caps until score is FLOAT of total [0.9] - -U, --max-min INT use at most INT minimizers [500] - --num-bp-per-min INT use maximum of number minimizers calculated by READ_LENGTH / INT and --max-min [1000] - -D, --distance-limit INT cluster using this 
distance limit [200] - -e, --max-extensions INT extend up to INT clusters [800] - -a, --max-alignments INT align up to INT extensions [8] - -s, --cluster-score FLOAT only extend clusters if they are within INT of the best score [50] - -S, --pad-cluster-score FLOAT also extend clusters within INT of above threshold to get a second-best cluster [20] - -u, --cluster-coverage FLOAT only extend clusters if they are within FLOAT of the best read coverage [0.3] - -v, --extension-score INT only align extensions if their score is within INT of the best score [1] - -w, --extension-set FLOAT only align extension sets if their score is within INT of the best score [20] - -O, --no-dp disable all gapped alignment - -r, --rescue-attempts INT attempt up to INT rescues per read in a pair [15] - -L, --max-fragment-length INT assume that fragment lengths should be smaller than INT when estimating the fragment length distribution [2000] - --exclude-overlapping-min exclude overlapping minimizers - --paired-distance-limit FLOAT cluster pairs of read using a distance limit FLOAT standard deviations greater than the mean [2] - --rescue-subgraph-size FLOAT search for rescued alignments FLOAT standard deviations greater than the mean [4] - --rescue-seed-limit INT attempt rescue with at most INT seeds [100] -long-read/chaining parameters: - --align-from-chains chain up extensions to create alignments, instead of doing each separately - --chaining-cluster-distance INT maximum distance to cluster over before chaining [100] - --precluster-connection-coverage-threshold FLOAT threshold of precluster pair coverage below the base, after which to stop reseeding between preclusters [0.3] - --min-precluster-connections INT minimum number of precluster connections to reseed over [10] - --max-precluster-connections INT maximum number of precluster connections to reseed over [50] - --max-lookback-bases INT maximum distance to look back when chaining [100] - --min-lookback-items INT minimum items to consider coming from when chaining [1] - --lookback-item-hard-cap INT maximum items to consider coming from when chaining [15] - --chain-score-threshold FLOAT only align chains if their score is within this many points of the best score [100] - --min-chains INT ignore score threshold to get this many chains aligned [1] - --chain-min-score INT do not align chains with less than this score [100] - --max-chain-connection INT maximum distance across which to connect seeds when aligning a chain [100] - --max-tail-length INT maximum length of a tail to align before forcing softclipping when aligning a chain [100] - --max-dp-cells INT maximum number of alignment cells to allow in a tail with GSSW [16777216] - -``` - - -## haplotypes - - -``` -Usage: - vg haplotypes [options] -k kmers.kff -g output.gbz graph.gbz - vg haplotypes [options] -H output.hapl graph.gbz - vg haplotypes [options] -i graph.hapl -k kmers.kff -g output.gbz graph.gbz - vg haplotypes [options] -i graph.hapl --vcf-input variants.vcf graph.gbz > output.tsv - vg haplotypes [options] -i graph.hapl -k kmers.kff --extract M:N graph.gbz > output.fa - -Haplotype sampling based on kmer counts. 
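# Illustrative sketch following the `vg giraffe` synopsis above; file names are
# hypothetical placeholders.
vg giraffe -Z graph.gbz -f reads_1.fq -f reads_2.fq -p -t 16 > mapped.gam
# With haplotype sampling, as in the second synopsis line:
vg giraffe -Z graph.gbz --haplotype-name graph.hapl --kff-name sample.kff -f reads.fq > mapped.gam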
- -Output files: - -g, --gbz-output X write the output GBZ to X - -H, --haplotype-output X write haplotype information to X - -Input files: - -d, --distance-index X use this distance index (default: .dist) - -r, --r-index X use this r-index (default: .ri) - -i, --haplotype-input X use this haplotype information (default: generate) - -k, --kmer-input X use kmer counts from this KFF file (required for --gbz-output) - -Options for generating haplotype information: - --kmer-length N kmer length for building the minimizer index (default: 29) - --window-length N window length for building the minimizer index (default: 11) - --subchain-length N target length (in bp) for subchains (default: 10000) - --linear-structure extend subchains to avoid haplotypes visiting them multiple times - -Options for sampling haplotypes: - --preset X use preset X (default, haploid, diploid) - --coverage N kmer coverage in the KFF file (default: estimate) - --num-haplotypes N generate N haplotypes (default: 4) - sample from N candidates (with --diploid-sampling; default: 32) - --present-discount F discount scores for present kmers by factor F (default: 0.9) - --het-adjustment F adjust scores for heterozygous kmers by F (default: 0.05) - --absent-score F score absent kmers -F/+F (default: 0.8) - --haploid-scoring use a scoring model without heterozygous kmers - --diploid-sampling choose the best pair from the sampled haplotypes - --include-reference include named and reference paths in the output - -Other options: - -v, --verbosity N verbosity level (0 = silent, 1 = basic, 2 = detailed, 3 = debug; default: 0) - -t, --threads N approximate number of threads (default: 8 on this system) - -Developer options: - --validate validate the generated information (may be slow) - --vcf-input X map the variants in VCF file X to subchains - --contig-prefix X a prefix for transforming VCF contig names into GBWT contig names - --extract M:N extract haplotypes in chain M, subchain N in FASTA format - --score-output X write haplotype scores to X - --classify X classify kmers and write output to X - - -``` - - -## ids - - - -Manipulate node ids. - - - - - -``` -usage: vg ids [options] [graph2.vg ...] >new.vg -options: - -c, --compact minimize the space of integers used by the ids - -i, --increment N increase ids by N - -d, --decrement N decrease ids by N - -j, --join make a joint id space for all the graphs that are supplied - by iterating through the supplied graphs and incrementing - their ids to be non-conflicting (modifies original files) - -m, --mapping FILE create an empty node mapping for vg prune - -s, --sort assign new node IDs in (generalized) topological sort order - -``` - - -## index - - -``` -usage: vg index [options] [graph2.vg ...] -Creates an index on the specified graph or graphs. All graphs indexed must -already be in a joint ID space. 
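# Illustrative sketch: haplotype sampling as in the first `vg haplotypes` synopsis
# line above, plus a simple `vg ids` ID-compaction call. File names are hypothetical
# placeholders.
vg haplotypes -k sample.kff -g sampled.gbz graph.gbz
vg ids -c graph.vg > compacted.vg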
-general options: - -b, --temp-dir DIR use DIR for temporary files - -t, --threads N number of threads to use - -p, --progress show progress -xg options: - -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing - -L, --xg-alts include alt paths in xg -gcsa options: - -g, --gcsa-out FILE output a GCSA2 index to the given file - -f, --mapping FILE use this node mapping in GCSA2 construction - -k, --kmer-size N index kmers of size N in the graph (default 16) - -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default 4) - -Z, --size-limit N limit temporary disk space usage to N gigabytes (default 2048) - -V, --verify-index validate the GCSA2 index using the input kmers (important for testing) -gam indexing options: - -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai -vg in-place indexing options: - --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi -snarl distance index options - -j --dist-name FILE use this file to store a snarl-based distance index - --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000) - if N is 0 then don't store distances, only the snarl tree - --no-nested-distance only store distances along the top-level chain - -``` - - -## inject - - -``` -usage: vg inject -x graph.xg [options] input.[bam|sam|cram] >output.gam - -options: - -x, --xg-name FILE use this graph or xg index (required, non-XG formats also accepted) - -o, --output-format NAME output the alignments in NAME format (gam / gaf / json) [gam] - -t, --threads N number of threads to use - -``` - - -## map - - -``` -vg: invalid option -- 'h' -usage: vg map [options] -d idxbase -f in1.fq [-f in2.fq] >aln.gam -Align reads to a graph. 
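# Illustrative sketch using only flags shown above: build an XG index with `vg index`,
# then lift BAM reads into the graph with `vg inject`. File names are hypothetical
# placeholders.
vg index -x graph.xg -p graph.vg
vg inject -x graph.xg reads.bam > reads.gam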
- -graph/index: - -d, --base-name BASE use BASE.xg and BASE.gcsa as the input index pair - -x, --xg-name FILE use this xg index or graph (defaults to .vg.xg) - -g, --gcsa-name FILE use this GCSA2 index (defaults to .gcsa) - -1, --gbwt-name FILE use this GBWT haplotype index (defaults to .gbwt) -algorithm: - -t, --threads N number of compute threads to use - -k, --min-mem INT minimum MEM length (if 0 estimate via -e) [0] - -e, --mem-chance FLOAT set {-k} such that this fraction of {-k} length hits will by chance [5e-4] - -c, --hit-max N ignore MEMs who have >N hits in our index (0 for no limit) [2048] - -Y, --max-mem INT ignore mems longer than this length (unset if 0) [0] - -r, --reseed-x FLOAT look for internal seeds inside a seed longer than FLOAT*--min-seed [1.5] - -u, --try-up-to INT attempt to align up to the INT best candidate chains of seeds (1/2 for paired) [128] - -l, --try-at-least INT attempt to align at least the INT best candidate chains of seeds [1] - -E, --approx-mq-cap INT weight MQ by suffix tree based estimate when estimate less than FLOAT [0] - --id-mq-weight N scale mapping quality by the alignment score identity to this power [2] - -W, --min-chain INT discard a chain if seeded bases shorter than INT [0] - -C, --drop-chain FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [0.45] - -n, --mq-overlap FLOAT scale MQ by count of alignments with this overlap in the query with the primary [0] - -P, --min-ident FLOAT accept alignment only if the alignment identity is >= FLOAT [0] - -H, --max-target-x N skip cluster subgraphs with length > N*read_length [100] - -w, --band-width INT band width for long read alignment [256] - -O, --band-overlap INT band overlap for long read alignment [{-w}/8] - -J, --band-jump INT the maximum number of bands of insertion we consider in the alignment chain model [128] - -B, --band-multi INT consider this many alignments of each band in banded alignment [16] - -Z, --band-min-mq INT treat bands with less than this MQ as unaligned [0] - -I, --fragment STR fragment length distribution specification STR=m:μ:σ:o:d [5000:0:0:0:1] - max, mean, stdev, orientation (1=same, 0=flip), direction (1=forward, 0=backward) - -U, --fixed-frag-model don't learn the pair fragment model online, use {-I} without update - -p, --print-frag-model suppress alignment output and print the fragment model on stdout as per {-I} format - --frag-calc INT update the fragment model every INT perfect pairs [10] - --fragment-x FLOAT calculate max fragment size as frag_mean+frag_sd*FLOAT [10] - --mate-rescues INT attempt up to INT mate rescues per pair [64] - -S, --unpaired-cost INT penalty for an unpaired read pair [17] - --no-patch-aln do not patch banded alignments by locally aligning unaligned regions - --xdrop-alignment use X-drop heuristic (much faster for long-read alignment) - --max-gap-length maximum gap length allowed in each contiguous alignment (for X-drop alignment) [40] -scoring: - -q, --match INT use this match score [1] - -z, --mismatch INT use this mismatch penalty [4] - --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file - -o, --gap-open INT use this gap open penalty [6] - -y, --gap-extend INT use this gap extension penalty [1] - -L, --full-l-bonus INT the full-length alignment bonus [5] - --drop-full-l-bonus remove the full length bonus from the score before sorting and MQ calculation - -a, --hap-exp FLOAT the exponent for haplotype consistency likelihood in alignment score [1] - --recombination-penalty FLOAT 
use this log recombination penalty for GBWT haplotype scoring [20.7] - -A, --qual-adjust perform base quality adjusted alignments (requires base quality input) -preset: - -m, --alignment-model STR use a preset alignment scoring model, either "short" (default) or "long" (for ONT/PacBio) - "long" is equivalent to `-u 2 -L 63 -q 1 -z 2 -o 2 -y 1 -w 128 -O 32` -input: - -s, --sequence STR align a string to the graph in graph.vg using partial order alignment - -V, --seq-name STR name the sequence using this value (for graph modification with new named paths) - -T, --reads FILE take reads (one per line) from FILE, write alignments to stdout - -b, --hts-input FILE align reads from htslib-compatible FILE (BAM/CRAM/SAM) stdin (-), alignments to stdout - -G, --gam-input FILE realign GAM input - -f, --fastq FILE input fastq or (2-line format) fasta, possibly compressed, two are allowed, one for each mate - -F, --fasta FILE align the sequences in a FASTA file that may have multiple lines per reference sequence - -i, --interleaved fastq or GAM is interleaved paired-ended - -N, --sample NAME for --reads input, add this sample - -R, --read-group NAME for --reads input, add this read group -output: - -j, --output-json output JSON rather than an alignment stream (helpful for debugging) - -%, --gaf output alignments in GAF format - --surject-to TYPE surject the output into the graph's paths, writing TYPE := bam |sam | cram - --ref-paths FILE ordered list of paths in the graph, one per line or HTSlib .dict, for HTSLib @SQ headers - --buffer-size INT buffer this many alignments together before outputting in GAM [512] - -X, --compare realign GAM input (-G), writing alignment with "correct" field set to overlap with input - -v, --refpos-table for efficient testing output a table of name, chr, pos, mq, score - -K, --keep-secondary produce alignments for secondary input alignments in addition to primary ones - -M, --max-multimaps INT produce up to INT alignments for each read [1] - -Q, --mq-max INT cap the mapping quality at INT [60] - --exclude-unaligned exclude reads with no alignment - -D, --debug print debugging information about alignment to stderr - --log-time print runtime to stderr - -``` - - -## minimizer - - -``` -usage: vg minimizer [options] -d graph.dist -o graph.min graph - -Builds a (w, k)-minimizer index or a (k, s)-syncmer index of the threads in the GBWT -index. The graph can be any HandleGraph, which will be transformed into a GBWTGraph. -The transformation can be avoided by providing a GBWTGraph or a GBZ graph. 
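# Illustrative sketch of a paired-end `vg map` run using only flags shown above;
# file names are hypothetical placeholders.
vg map -x graph.xg -g graph.gcsa -f reads_1.fq -f reads_2.fq -t 8 > mapped.gam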
- -Required options: - -d, --distance-index X annotate the hits with positions in this distance index - -o, --output-name X store the index to file X - -Minimizer options: - -k, --kmer-length N length of the kmers in the index (default 29, max 31) - -w, --window-length N choose the minimizer from a window of N kmers (default 11) - -c, --closed-syncmers index closed syncmers instead of minimizers - -s, --smer-length N use smers of length N in closed syncmers (default 18) - -Weighted minimizers: - -W, --weighted use weighted minimizers - --threshold N downweight kmers with more than N hits (default 500) - --iterations N downweight frequent kmers by N iterations (default 3) - --fast-counting use the fast kmer counting algorithm (default) - --save-memory use the space-efficient kmer counting algorithm - --hash-table N use 2^N-cell hash tables for kmer counting (default: guess) - -Other options: - -l, --load-index X load the index from file X and insert the new kmers into it - (overrides minimizer / weighted minimizer options) - -g, --gbwt-name X use the GBWT index in file X (required with a non-GBZ graph) - -p, --progress show progress information - -t, --threads N use N threads for index construction (default 8) - (using more than 16 threads rarely helps) - --no-dist build the index without distance index annotations (not recommended) - - -``` - - -## mod - - -``` -usage: vg mod [options] >[mod.vg] -Modifies graph, outputs modified on stdout. - -options: - -P, --label-paths don't edit with -i alignments, just use them for labeling the graph - -c, --compact-ids should we sort and compact the id space? (default false) - -b, --break-cycles use an approximate topological sort to break cycles in the graph - -n, --normalize normalize the graph so that edges are always non-redundant - (nodes have unique starting and ending bases relative to neighbors, - and edges that do not introduce new paths are removed and neighboring - nodes are merged) - -U, --until-normal N iterate normalization until convergence, or at most N times - -z, --nomerge-pre STR do not let normalize (-n, -U) zip up any pair of nodes that both belong to path with prefix STR - -E, --unreverse-edges flip doubly-reversing edges so that they are represented on the - forward strand of the graph - -s, --simplify remove redundancy from the graph that will not change its path space - -d, --dagify-step N copy strongly connected components of the graph N times, forwarding - edges from old to new copies to convert the graph into a DAG - -w, --dagify-to N copy strongly connected components of the graph forwarding - edges from old to new copies to convert the graph into a DAG - until the shortest path through each SCC is N bases long - -L, --dagify-len-max N stop a dagification step if the unrolling component has this much sequence - -f, --unfold N represent inversions accessible up to N from the forward - component of the graph - -O, --orient-forward orient the nodes in the graph forward - -N, --remove-non-path keep only nodes and edges which are part of paths - -A, --remove-path keep only nodes and edges which are not part of any path - -k, --keep-path NAME keep only nodes and edges in the path - -R, --remove-null removes nodes that have no sequence, forwarding their edges - -g, --subgraph ID gets the subgraph rooted at node ID, multiple allowed - -x, --context N steps the subgraph out by N steps (default: 1) - -p, --prune-complex remove nodes that are reached by paths of --length which - cross more than --edge-max edges - -S, 
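# Illustrative sketch following the `vg minimizer` synopsis above; file names are
# hypothetical placeholders.
vg minimizer -d graph.dist -o graph.min -p -t 8 graph.gbz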
--prune-subgraphs remove subgraphs which are shorter than --length - -l, --length N for pruning complex regions and short subgraphs - -X, --chop N chop nodes in the graph so they are not more than N bp long - -u, --unchop where two nodes are only connected to each other and by one edge - replace the pair with a single node that is the concatenation of their labels - -e, --edge-max N only consider paths which make edge choices at <= this many points - -M, --max-degree N unlink nodes that have edge degree greater than N - -m, --markers join all head and tails nodes to marker nodes - ('###' starts and '$$$' ends) of --length, for debugging - -y, --destroy-node ID remove node with given id - -a, --cactus convert to cactus graph representation - -v, --sample-vcf FILE for a graph with allele paths, compute the sample graph from the given VCF - -G, --sample-graph FILE subset an augmented graph to a sample graph using a Locus file - -t, --threads N for tasks that can be done in parallel, use this many threads - -``` - - -## mpmap - - -``` -usage: vg mpmap [options] -x graph.xg -g index.gcsa [-f reads1.fq [-f reads2.fq] | -G reads.gam] > aln.gamp -Multipath align reads to a graph. - -basic options: -graph/index: - -x, --graph-name FILE graph (required; XG format recommended but other formats are valid, see `vg convert`) - -g, --gcsa-name FILE use this GCSA2/LCP index pair for MEMs (required; both FILE and FILE.lcp, see `vg index`) - -d, --dist-name FILE use this snarl distance index for clustering (recommended, see `vg index`) - -s, --snarls FILE align to alternate paths in these snarls (unnecessary if providing -d, see `vg snarls`) -input: - -f, --fastq FILE input FASTQ (possibly gzipped), can be given twice for paired ends (for stdin use -) - -i, --interleaved input contains interleaved paired ends -algorithm presets: - -n, --nt-type TYPE sequence type preset: 'DNA' for genomic data, 'RNA' for transcriptomic data [RNA] - -l, --read-length TYPE read length preset: 'very-short', 'short', or 'long' (approx. <50bp, 50-500bp, and >500bp) [short] - -e, --error-rate TYPE error rate preset: 'low' or 'high' (approx. 
PHRED >20 and <20) [low] -output: - -F, --output-fmt TYPE format to output alignments in: 'GAMP for' multipath alignments, 'GAM' or 'GAF' for single-path - alignments, 'SAM', 'BAM', or 'CRAM' for linear reference alignments (may also require -S) [GAMP] - -S, --ref-paths FILE paths in the graph either 1) one per line in a text file, or 2) in an HTSlib .dict, to treat as - reference sequences for HTSlib formats (see -F) [all paths] - -N, --sample NAME add this sample name to output - -R, --read-group NAME add this read group to output - -p, --suppress-progress do not report progress to stderr -computational parameters: - -t, --threads INT number of compute threads to use [all available] - -advanced options: -algorithm: - -X, --not-spliced do not form spliced alignments, even if aligning with --nt-type 'rna' - -M, --max-multimaps INT report (up to) this many mappings per read [10 rna / 1 dna] - -a, --agglomerate-alns combine separate multipath alignments into one (possibly disconnected) alignment - -r, --intron-distr FILE intron length distribution (from scripts/intron_length_distribution.py) - -Q, --mq-max INT cap mapping quality estimates at this much [60] - -b, --frag-sample INT look for this many unambiguous mappings to estimate the fragment length distribution [1000] - -I, --frag-mean FLOAT mean for a pre-determined fragment length distribution (also requires -D) - -D, --frag-stddev FLOAT standard deviation for a pre-determined fragment length distribution (also requires -I) - -G, --gam-input FILE input GAM (for stdin, use -) - -u, --map-attempts INT perform (up to) this many mappings per read (0 for no limit) [24 paired / 64 unpaired] - -c, --hit-max INT use at most this many hits for any match seeds (0 for no limit) [1024 DNA / 100 RNA] -scoring: - -A, --no-qual-adjust do not perform base quality adjusted alignments even when base qualities are available - -q, --match INT use this match score [1] - -z, --mismatch INT use this mismatch penalty [4 low error, 1 high error] - -o, --gap-open INT use this gap open penalty [6 low error, 1 high error] - -y, --gap-extend INT use this gap extension penalty [1] - -L, --full-l-bonus INT add this score to alignments that align each end of the read [mismatch+1 short, 0 long] - -w, --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file (in the order ACGT) - -m, --remove-bonuses remove full length alignment bonuses in reported scores - -``` - - -## pack - - - -Convert alignments to a compact coverage index. 
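For orientation, a hypothetical `vg mpmap` invocation assembled from the synopsis and options listed above (file names are placeholders, and `-d` is optional but recommended per the help text):

```sh
vg mpmap -x graph.xg -g index.gcsa -d graph.dist -f reads_1.fq -f reads_2.fq > aln.gamp
```

The `-n DNA` preset can be added when mapping genomic rather than transcriptomic reads, as noted in the presets above.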
- - - - - -``` -usage: vg pack [options] -options: - -x, --xg FILE use this basis graph (any format accepted, does not have to be xg) - -o, --packs-out FILE write compressed coverage packs to this output file - -i, --packs-in FILE begin by summing coverage packs from each provided FILE - -g, --gam FILE read alignments from this GAM file (could be '-' for stdin) - -a, --gaf FILE read alignments from this GAF file (could be '-' for stdin) - -d, --as-table write table on stdout representing packs - -D, --as-edge-table write table on stdout representing edge coverage - -u, --as-qual-table write table on stdout representing average node mapqs - -e, --with-edits record and write edits rather than only recording graph-matching coverage - -b, --bin-size N number of sequence bases per CSA bin [default: inf] - -n, --node ID write table for only specified node(s) - -N, --node-list FILE a white space or line delimited list of nodes to collect - -Q, --min-mapq N ignore reads with MAPQ < N and positions with base quality < N [default: 0] - -c, --expected-cov N expected coverage. used only for memory tuning [default : 128] - -s, --trim-ends N ignore the first and last N bases of each read - -t, --threads N use N threads (defaults to numCPUs) - -``` - - -## paths - - - -Traverse paths in the graph. - - - - - -``` -usage: vg paths [options] -options: - input: - -x, --xg FILE use the paths and haplotypes in this graph FILE. Supports GBZ haplotypes. - (Also accepts -v, --vg) - -g, --gbwt FILE use the threads in the GBWT index in FILE - (graph also required for most output options; -g takes priority over -x) - output graph (.vg format) - -V, --extract-vg output a path-only graph covering the selected paths - -d, --drop-paths output a graph with the selected paths removed - -r, --retain-paths output a graph with only the selected paths retained - -n, --normalize-paths output a graph where all equivalent paths in a site a merged (using selected paths to snap to if possible) - output path data: - -X, --extract-gam print (as GAM alignments) the stored paths in the graph - -A, --extract-gaf print (as GAF alignments) the stored paths in the graph - -L, --list print (as a list of names, one per line) the path (or thread) names - -E, --lengths print a list of path names (as with -L) but paired with their lengths - -M, --metadata print a table of path names and their metadata - -C, --cyclicity print a list of path names (as with -L) but paired with flag denoting the cyclicity - -F, --extract-fasta print the paths in FASTA format - -c, --coverage print the coverage stats for selected paths (not including cylces) - path selection: - -p, --paths-file FILE select the paths named in a file (one per line) - -Q, --paths-by STR select the paths with the given name prefix - -S, --sample STR select the haplotypes or reference paths for this sample - -a, --variant-paths select the variant paths added by 'vg construct -a' - -G, --generic-paths select the generic, non-reference, non-haplotype paths - -R, --reference-paths select the reference paths - -H, --haplotype-paths select the haplotype paths paths - configuration: - -o, --overlay apply a ReferencePathOverlayHelper to the graph - -t, --threads N number of threads to use [all available]. applies only to snarl finding within -n - -``` - - -## prune - - -``` -usage: vg prune [options] >[output.vg] - -Prunes the complex regions of the graph for GCSA2 indexing. Pruning the graph -removes embedded paths. 
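# Illustrative sketch using only flags shown above: compute read support with
# `vg pack` and list path names with `vg paths`. File names are hypothetical
# placeholders.
vg pack -x graph.xg -g mapped.gam -o mapped.pack -t 8
vg paths -x graph.xg -L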
- -Pruning parameters: - -k, --kmer-length N kmer length used for pruning - defaults: 24 with -P; 24 with -r; 24 with -u - -e, --edge-max N remove the edges on kmers making > N edge choices - defaults: 3 with -P; 3 with -r; 3 with -u - -s, --subgraph-min N remove subgraphs of < N bases - defaults: 33 with -P; 33 with -r; 33 with -u - -M, --max-degree N if N > 0, remove nodes with degree > N before pruning - defaults: 0 with -P; 0 with -r; 0 with -u - -Pruning modes (-P, -r, and -u are mutually exclusive): - -P, --prune simply prune the graph (default) - -r, --restore-paths restore the edges on non-alt paths - -u, --unfold-paths unfold non-alt paths and GBWT threads - -v, --verify-paths verify that the paths exist after pruning - (potentially very slow) - -Unfolding options: - -g, --gbwt-name FILE unfold the threads from this GBWT index - -m, --mapping FILE store the node mapping for duplicates in this file (required with -u) - -a, --append-mapping append to the existing node mapping - -Other options: - -p, --progress show progress - -t, --threads N use N threads (default: 8) - -d, --dry-run determine the validity of the combination of options - - -``` - - -## rna - - -``` - -usage: vg rna [options] graph.[vg|pg|hg|gbz] > splicing_graph.[vg|pg|hg] - -General options: - -t, --threads INT number of compute threads to use [1] - -p, --progress show progress - -h, --help print help message - -Input options: - -n, --transcripts FILE transcript file(s) in gtf/gff format; may repeat - -m, --introns FILE intron file(s) in bed format; may repeat - -y, --feature-type NAME parse only this feature type in the gtf/gff (parses all if empty) [exon] - -s, --transcript-tag NAME use this attribute tag in the gtf/gff file(s) as id [transcript_id] - -l, --haplotypes FILE project transcripts onto haplotypes in GBWT index file - -z, --gbz-format input graph is in GBZ format (contains both a graph and haplotypes (GBWT index)) - -Construction options: - -j, --use-hap-ref use haplotype paths in GBWT index as reference sequences (disables projection) - -e, --proj-embed-paths project transcripts onto embedded haplotype paths - -c, --path-collapse TYPE collapse identical transcript paths across no|haplotype|all paths [haplotype] - -k, --max-node-length INT chop nodes longer than maximum node length (0 disables chopping) [0] - -d, --remove-non-gene remove intergenic and intronic regions (deletes all paths in the graph) - -o, --do-not-sort do not topological sort and compact the graph - -r, --add-ref-paths add reference transcripts as embedded paths in the graph - -a, --add-hap-paths add projected transcripts as embedded paths in the graph - -Output options: - -b, --write-gbwt FILE write pantranscriptome transcript paths as GBWT index file - -v, --write-hap-gbwt FILE write input haplotypes as a GBWT with node IDs matching the output graph - -f, --write-fasta FILE write pantranscriptome transcript sequences as fasta file - -i, --write-info FILE write pantranscriptome transcript info table as tsv file - -q, --out-exclude-ref exclude reference transcripts from pantranscriptome output - -g, --gbwt-bidirectional use bidirectional paths in GBWT index construction - - -``` - - -## sim - - -``` -usage: vg sim [options] -Samples sequences from the xg-indexed graph. 
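# Illustrative sketch using only flags shown above: add transcripts with `vg rna`,
# and prune complex regions before GCSA2 indexing with `vg prune`. File names are
# hypothetical placeholders; the positional input to `vg prune` is an assumption,
# since the usage line above elides it.
vg rna -n transcripts.gtf -p -t 4 graph.pg > spliced.pg
vg prune -p -t 8 graph.vg > pruned.vg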
- -basic options: - -x, --xg-name FILE use the graph in FILE (required) - -n, --num-reads N simulate N reads or read pairs - -l, --read-length N simulate reads of length N - -r, --progress show progress information -output options: - -a, --align-out write alignments in GAM-format - -J, --json-out write alignments in json - --multi-position annotate alignments with multiple reference positions -simulation parameters: - -F, --fastq FILE match the error profile of NGS reads in FILE, repeat for paired reads (ignores -l,-f) - -I, --interleaved reads in FASTQ (-F) are interleaved read pairs - -s, --random-seed N use this specific seed for the PRNG - -e, --sub-rate FLOAT base substitution rate (default 0.0) - -i, --indel-rate FLOAT indel rate (default 0.0) - -d, --indel-err-prop FLOAT proportion of trained errors from -F that are indels (default 0.01) - -S, --scale-err FLOAT scale trained error probabilities from -F by this much (default 1.0) - -f, --forward-only don't simulate from the reverse strand - -p, --frag-len N make paired end reads with given fragment length N - -v, --frag-std-dev FLOAT use this standard deviation for fragment length estimation - -N, --allow-Ns allow reads to be sampled from the graph with Ns in them - --max-tries N attempt sampling operations up to N times before giving up [100] - -t, --threads number of compute threads (only when using FASTQ with -F) [1] -simulate from paths: - -P, --path PATH simulate from this path (may repeat; cannot also give -T) - -A, --any-path simulate from any path (overrides -P) - -m, --sample-name NAME simulate from this sample (may repeat; requires -g) - -R, --ploidy-regex RULES use the given comma-separated list of colon-delimited REGEX:PLOIDY rules to assign - ploidies to contigs not visited by the selected samples, or to all contigs simulated - from if no samples are used. Unmatched contigs get ploidy 2. 
- -g, --gbwt-name FILE use samples from this GBWT index - -T, --tx-expr-file FILE simulate from an expression profile formatted as RSEM output (cannot also give -P) - -H, --haplo-tx-file FILE transcript origin info table from vg rna -i (required for -T on haplotype transcripts) - -u, --unsheared sample from unsheared fragments - -E, --path-pos-file FILE output a TSV with sampled position on path of each read (requires -F) - -``` - - -## stats - - -``` -usage: vg stats [options] [] -options: - -z, --size size of graph - -N, --node-count number of nodes in graph - -E, --edge-count number of edges in graph - -l, --length length of sequences in graph - -L, --self-loops number of self-loops - -s, --subgraphs describe subgraphs of graph - -H, --heads list the head nodes of the graph - -T, --tails list the tail nodes of the graph - -e, --nondeterm list the nondeterministic edge sets - -c, --components print the strongly connected components of the graph - -A, --is-acyclic print if the graph is acyclic or not - -n, --node ID consider node with the given id - -d, --to-head show distance to head for each provided node - -t, --to-tail show distance to head for each provided node - -a, --alignments FILE compute stats for reads aligned to the graph - -r, --node-id-range X:Y where X and Y are the smallest and largest node id in the graph, respectively - -o, --overlap PATH for each overlapping path mapping in the graph write a table: - PATH, other_path, rank1, rank2 - multiple allowed; limit comparison to those provided - -O, --overlap-all print overlap table for the cartesian product of paths - -R, --snarls print statistics for each snarl - --snarl-contents print out a table of - -C, --chains print statistics for each chain - -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. Can't detect Protobuf if graph read from stdin - -D, --degree-dist print degree distribution of the graph. - -b, --dist-snarls FILE print the sizes and depths of the snarls in a given distance index. - -p, --threads N number of threads to use [all available] - -v, --verbose output longer reports - -``` - - -## surject - - -``` -usage: vg surject [options] >[proj.cram] -Transforms alignments to be relative to particular paths. 
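# Illustrative sketch using only flags shown above: simulate GAM reads with `vg sim`
# and report basic graph statistics with `vg stats`. File names are hypothetical
# placeholders; `vg sim` writing to stdout is an assumption.
vg sim -x graph.xg -n 1000 -l 150 -a > sim.gam
vg stats -z -l graph.vg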
- -options: - -x, --xg-name FILE use this graph or xg index (required) - -t, --threads N number of threads to use - -p, --into-path NAME surject into this path or its subpaths (many allowed, default: reference, then non-alt generic) - -F, --into-paths FILE surject into path names listed in HTSlib sequence dictionary or path list FILE - -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads - -M, --multimap include secondary alignments to all overlapping paths instead of just primary - -G, --gaf-input input file is GAF instead of GAM - -m, --gamp-input input file is GAMP instead of GAM - -c, --cram-output write CRAM to stdout - -b, --bam-output write BAM to stdout - -s, --sam-output write SAM to stdout - -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments - -T, --max-tail-len N only align up to N bases of read tails (default: 10000) - -P, --prune-low-cplx prune short and low complexity anchors during realignment - -a, --max-anchors N use no more than N anchors per target path (default: unlimited) - -S, --spliced interpret long deletions against paths as spliced alignments - -A, --qual-adj adjust scoring for base qualities, if they are available - -N, --sample NAME set this sample name for all reads - -R, --read-group NAME set this read group for all reads - -f, --max-frag-len N reads with fragment lengths greater than N will not be marked properly paired in SAM/BAM/CRAM - -L, --list-all-paths annotate SAM records with a list of all attempted re-alignments to paths in SS tag - -C, --compression N level for compression [0-9] - -V, --no-validate skip checking whether alignments plausibly are against the provided graph - -w, --watchdog-timeout N warn when reads take more than the given number of seconds to surject - -``` - - -## view - - - -format conversions for graphs and alignments - - - - - -``` -usage: vg view [options] [ | | | [] ] -options: - -g, --gfa output GFA format (default) - -F, --gfa-in input GFA format, reducing overlaps if they occur - -v, --vg output VG format [DEPRECATED, use vg convert instead] - -V, --vg-in input VG format only - -j, --json output JSON format - -J, --json-in input JSON format - -c, --json-stream streaming conversion of a VG format graph in line delimited JSON format - (this cannot be loaded directly via -J) - -G, --gam output GAM format (vg alignment format: Graph Alignment/Map) - -Z, --translation-in input is a graph translation description - -t, --turtle output RDF/turtle format (can not be loaded by VG) - -T, --turtle-in input turtle format. 
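# Illustrative sketch using only flags shown above: surject interleaved paired GAM
# alignments to BAM. The positional GAM argument is an assumption (the usage line
# above elides it), and file names are hypothetical placeholders.
vg surject -x graph.xg -i -b mapped.gam > mapped.bam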
- -r, --rdf_base_uri set base uri for the RDF output - -a, --align-in input GAM format - -A, --aln-graph GAM add alignments from GAM to the graph - -q, --locus-in input stream is Locus format - -z, --locus-out output stream Locus format - -Q, --loci FILE input is Locus format for use by dot output - -d, --dot output dot format - -S, --simple-dot simplify the dot output; remove node labels, simplify alignments - -u, --noseq-dot shows size information instead of sequence in the dot output - -e, --ascii-labels use labels for paths or superbubbles with char/colors rather than emoji - -Y, --ultra-label label nodes with emoji/colors that correspond to ultrabubbles - -m, --skip-missing skip mappings to nodes not in the graph when drawing alignments - -C, --color color nodes that are not in the reference path (DOT OUTPUT ONLY) - -p, --show-paths show paths in dot output - -w, --walk-paths add labeled edges to represent paths in dot output - -n, --annotate-paths add labels to normal edges to represent paths in dot output - -M, --show-mappings with -p print the mappings in each path in JSON - -I, --invert-ports invert the edge ports in dot so that ne->nw is reversed - -s, --random-seed N use this seed when assigning path symbols in dot output - -b, --bam input BAM or other htslib-parseable alignments - -f, --fastq-in input fastq (output defaults to GAM). Takes two - positional file arguments if paired - -X, --fastq-out output fastq (input defaults to GAM) - -i, --interleaved fastq is interleaved paired-ended - -L, --pileup output VG Pileup format - -l, --pileup-in input VG Pileup format - -B, --distance-in input distance index - -R, --snarl-in input VG Snarl format - -E, --snarl-traversal-in input VG SnarlTraversal format - -K, --multipath-in input VG MultipathAlignment format (GAMP) - -k, --multipath output VG MultipathAlignment format (GAMP) - -D, --expect-duplicates don't warn if encountering the same node or edge multiple times - -x, --extract-tag TAG extract and concatenate messages with the given tag - --verbose explain the file being read with --extract-tag - --threads N for parallel operations use this many threads [1] - -``` - - From 725c0aff61a0c32f263b2361db7c611e7dede232 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 29 Nov 2024 19:17:20 +0100 Subject: [PATCH 07/14] Update manpage writing script to not write the title since it's in the wiki page --- doc/vgmanmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py index 833aa1b6a04..b4a99931448 100644 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -27,7 +27,7 @@ desc_inf.close() # start page -print('# vg manpage') +#print('# vg manpage') # get vg version ret = subprocess.run(['vg', 'version'], capture_output=True) From 138980ab4b92518c43247c507f36fc57ef20bc9c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 2 Dec 2024 12:47:16 -0500 Subject: [PATCH 08/14] Replace old manpage system and tie new one into make man --- Makefile | 6 +- doc/README.md | 4 +- doc/asciidoc/man/vg-giraffe.adoc | 203 ------------------------------- doc/asciidoc/man/vg-test.adoc | 154 ----------------------- doc/asciidoc/man/vg-version.adoc | 67 ---------- doc/vgmanmd.py | 1 + doc/wiki | 2 +- 7 files changed, 7 insertions(+), 430 deletions(-) delete mode 100644 doc/asciidoc/man/vg-giraffe.adoc delete mode 100644 doc/asciidoc/man/vg-test.adoc delete mode 100644 doc/asciidoc/man/vg-version.adoc mode change 100644 => 100755 doc/vgmanmd.py diff --git a/Makefile b/Makefile index b2f25faaffc..2c81ff30944 100644 --- 
a/Makefile +++ b/Makefile @@ -544,10 +544,10 @@ docs: $(SRC_DIR)/*.cpp $(SRC_DIR)/*.hpp $(ALGORITHMS_SRC_DIR)/*.cpp $(ALGORITHMS doxygen echo "View documentation at: file://$(PWD)/doc/doxygen/index.html" -man: $(patsubst doc/asciidoc/man/%.adoc,doc/man/%.1,$(wildcard doc/asciidoc/man/*.adoc)) +man: doc/wiki/vg-manpage.md -doc/man/%.1: doc/asciidoc/man/%.adoc - asciidoctor -b manpage -d manpage -o $@ $< +doc/wiki/vg-manpage.md: $(BIN_DIR)/$(EXE) doc/vgmanmd.desc.md doc/vgmanmd.py + cd doc && ./vgmanmd.py > wiki/vg-manpage.md.tmp && mv wiki/vg-manpage.md.tmp wiki/vg-manpage.md # Hack to use gshuf or shuf as appropriate to the platform when testing $(BIN_DIR)/shuf: diff --git a/doc/README.md b/doc/README.md index 0a2626842bd..c69e1c6e5de 100644 --- a/doc/README.md +++ b/doc/README.md @@ -4,10 +4,10 @@ Make a markdown document with the usage messages of (selected) `vg` subcommands. Calls the `vg` command, so it will match the version available in the command line. ```sh -python3 vgmanmd.py > man.md +python3 vgmanmd.py > wiki/vg-manpage.md ``` -Then copy the markdown content to a wiki page, for example, or move the markdown page to somewhere in the vg repo? +Then commit and push the changes to the wiki submodule, or copy the markdown content to the [wiki page](https://github.com/vgteam/vg/wiki/vg-manpage). ## Edit descriptions diff --git a/doc/asciidoc/man/vg-giraffe.adoc b/doc/asciidoc/man/vg-giraffe.adoc deleted file mode 100644 index 05d538c3192..00000000000 --- a/doc/asciidoc/man/vg-giraffe.adoc +++ /dev/null @@ -1,203 +0,0 @@ -= vg-giraffe(1) -vgteam contributors -v1.20.0 -:doctype: manpage -:manmanual: vg -:mansource: vg -:man-linkstyle: pass:[blue R < >] - -== Name - -vg-giraffe - map unpaired short reads using minimizers and gapless extension - -== Synopsis - -*vg giraffe* [_OPTION_]... [_FASTA_ [_VCF_]] > output.gam - -== Arguments - -_FASTA_:: - Specify a FASTA file to build the graph from. Must have an extension *.fa*, *.fna*, or *.fasta*, with optional *.gz*. The name without extension is used as the _basename_ under which to look for index files with their own extensions, if *-x*/*--xg-name* is not specified. If omitted, *-x*/*--xg-name* is required. - -_VCF_:: - Variant Call Format file containing phased haplotypes, used to build the graph and haplotype database (GBWT) if those are not themselves provided. Must have a *.vcf.gz* extension, and an associated *.vcf.gz.tbi* index file. If omitted, a graph and GBWT must already exist and must be provided, either explicitly with *-x*/*--xg-name* and *-H*/*--gbwt-name*, or via the _FASTA_ or *-x*/*--xg-name* derived _basename_. - -_TAG_:: - Specify a collection of tests to run, via []-enclosed tag. Tag may need to be quoted to avoid being interpreted as a shell wildcard character class. - -== Options - -*-x*:: -*--xg-name*=_FILE_:: - Use this xg index or graph. The file name without extension is also used as the _basename_ for finding indexes, overriding any FASTA-derived _basename_. If omitted, _FASTA_ is required. If not specified, will load _basename.vg_ and create that file if not present. - -*-g*:: -*--graph-name*=_FILE_:: - Load this GBWTGraph. If not specified, will load _basename.gg_ and create that file if not present. - -*-H*:: -*--gbwt-name*=_FILE_:: - Use this GBWT index. If not specified, will load _basename.gbwt_ and create that file if not present. - -*-m*:: -*--minimizer-name*=_FILE_:: - Use this minimizer index. If not specified, will load _basename.min_ and create that file if not present. 
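For contributors updating the docs, the new `make man` dependency chain above can also be exercised by hand; this simply restates the updated Makefile rule and README instructions (a built `vg` is a prerequisite, since `vgmanmd.py` shells out to it):

```sh
cd doc
./vgmanmd.py > wiki/vg-manpage.md
# then commit and push the change inside the doc/wiki submodule
```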
- -*-d*:: -*--dist-name*=_FILE_:: - Cluster using this distance index. If not specified, will load _basename.dist_ and create that file if not present. - -*-p*:: -*--progress*:: - Show progress - -*-G*:: -*--gam-in*=_FILE_:: - Read and realign GAM-format reads from FILE (may repeat) - -*-f*:: -*--fastq-in*=_FILE_:: - Read and align FASTQ-format reads from FILE (may repeat) - -*-i*:: -*--interleaved*:: - GAM/FASTQ input is interleaved pairs, for paired-end alignment - -*-M*:: -*--max-multimap*=_INT_:: - Produce up to INT alignments for each read [1] - -*-N*:: -*--sample*=_NAME_:: - Add this sample name - -*-R*:: -*--read-group*=_NAME_:: - Add this read group - -*-n*:: -*--discard*:: - Discard all output alignments (for profiling) - -*--output-basename*=_NAME_:: - Write output to a GAM file beginning with the given prefix for each setting combination - -*--report-name*=_NAME_:: - Write a TSV of output file and mapping speed to the given file - -*-c*:: -*--hit-cap*=_INT_:: - Use all minimizers with at most INT hits [10] - -*-C*:: -*--hard-hit-cap*=_INT_:: - Use all minimizers with at most INT hits [10] - -*-F*:: -*--score-fraction*=_FLOAT_:: - Select minimizers between hit caps until score is FLOAT of total [0.6] - -*-D*:: -*--distance-limit*=_INT_:: - Cluster using this distance limit [200] - -*-e*:: -*--max-extensions*=_INT_:: - Extend up to INT clusters [48] - -*-a*:: -*--max-alignments*=_INT_:: - Align up to INT clusters [8] - -*-s*:: -*--cluster-score*=_INT_:: - Only extend clusters if they are within INT of the best score [50] - -*-u*:: -*--cluster-coverage*=_FLOAT_:: - Only extend clusters if they are within INT of the best read coverage [0.4] - -*-v*:: -*--extension-score*=_INT_:: - Only align extensions if their score is within INT of the best score [1] - -*-w*:: -*--extension-set*=_INT_:: - Only align extension sets if their score is within extension-set of the best score [20] - -*-O*:: -*--no-dp*:: - Disable all gapped alignment - -*--track-provenance*:: - Track how internal intermediate alignment candidates were arrived at - -*--track-correctness*:: - Track if internal intermediate alignment candidates are correct (implies --track-provenance) - -*-t*:: -*--threads*=_INT_:: - Number of compute threads to use - - -== Description - -*vg gaffe* is a fast (experimental) algorithm to map reads to a graph. -It is specialized for low-error-rate short reads. -Giraffe uses minimizers of the graph's haplotypes and gapless extension to map the reads. -Because the graph is expected to contain a relatively complete inventory of a certain type of variation, gapless alignment is sufficient to align most reads and a more expensive gapped alignment step is required for only a minority of cases. - -*vg gaffe* requires four input files to define the reference: A graph or GBWTGraph, a GBWT index, a minimizer index, and a distance index. -Each can also be automatically produced by *vg gaffe*, given the requisite input files. -The graph and indexes can be produced automatically if _FASTA_ and _VCF_ are specified. -The _basename_ is a file path derived from the graph file (specified by *-x*/*--xg-name*), or from the _FASTA_ argument if no graph file is specified. It is combined with an extension for each index type to produce the filename from which that index will be loaded, or to which it will be saved if it is constructed. - -Because indexing is resource-intensive, the graph and indexes can be manually constructed in advance. -The graph can be built wiht *vg construct*. 
-Indexes can be manually built with *vg index* and *vg minimizer*, as well as *vg snarls* to provide the snarls file needed for the distance index. -If desired, the GBWTgraph can also be pre-generated with *vg gbwt*. - -When building the graph with *vg construct* for use with *vg gaffe*, it is important to provide the *-a* option in order to embed the variant information necessary to later build the GBWT. - -When building snarls with *vg snarls*, it is important to provide the *-T*/*--include-trivial* option to include trivial snarls, which are required when building the distance index. - -== Examples - -To map reads to an indexed graph and write the alignment to a gam file: - ----- -$ vg gaffe -x reference.xg -H reference.gbwt -m reference.min -d reference.dist -G reads.gam > mapped.gam ----- - -Same as above, but implicitly finding other indexes using the graph's filename: - ----- -$ vg gaffe -x reference.xg -G reads.gam > mapped.gam ----- - -To map reads building all indexes dynamically, if not found, from a FASTA and indexed VCF: - ----- -$ vg gaffe reference.fa phased_haplotypes.vcf.gz -G reads.gam > mapped.gam ----- - -Same as above, but manually pre-building the graph and all indexes, and providing the graph to define _basename_: - ----- -$ vg construct -a -r reference.fa -v phased_haplotypes.vcf.gz >reference.vg -$ vg index -G reference.gbwt -v phased_haplotypes.vcf.gz reference.vg -$ vg snarls --include-trivial reference.vg > reference.snarls -$ vg index -s reference.snarls -j reference.dist reference.vg -$ vg minimizer -k 29 -w 11 -g reference.gbwt -i reference.min reference.vg -$ vg gbwt -g reference.gg -x reference.vg reference.gbwt -$ vg gaffe -x reference.vg -G reads.gam > mapped.gam ----- - -== See Also -*vg*(1) - -== Copyright - -Copyright (C) 2020 {author}. - -Free use of this documentation is granted under the terms of the MIT License. diff --git a/doc/asciidoc/man/vg-test.adoc b/doc/asciidoc/man/vg-test.adoc deleted file mode 100644 index 1c6a6d6a05b..00000000000 --- a/doc/asciidoc/man/vg-test.adoc +++ /dev/null @@ -1,154 +0,0 @@ -= vg-test(1) -vgteam contributors -v1.20.0 -:doctype: manpage -:manmanual: vg -:mansource: vg -:man-linkstyle: pass:[blue R < >] - -== Name - -vg-test - run internal vg unit tests - -== Synopsis - -*vg test* [_TESTNAME_ | _PATTERN_ | _TAG_]... [_OPTION_]... - -== Arguments - -_TESTNAME_:: - Specify a test to run, by full name. - -_PATTERN_:: - Specify a collection of tests to run, via regular expression match. - -_TAG_:: - Specify a collection of tests to run, via []-enclosed tag. Tag may need to be quoted to avoid being interpreted as a shell wildcard character class. 
- -== Options - -*-?*:: -*-h*:: -*--help*:: - display usage information - -*-l*:: -*--list-tests*:: - list all/matching test cases - -*-t*:: -*--list-tags*:: - list all/matching tags - -*-s*:: -*--success*:: - include successful tests in output - -*-b*:: -*--break*:: - break into debugger on failure - -*-e*:: -*--nothrow*:: - skip exception tests - -*-i*:: -*--invisibles*:: - show invisibles (tabs, newlines) - -*-o*:: -*--out*=_FILENAME_:: - output filename - -*-r*:: -*--reporter*=_NAME_:: - reporter to use (defaults to console) - -*-n*:: -*--name*=_NAME_:: - suite name - -*-a*:: -*--abort*:: - abort at first failure - -*-x*:: -*--abortx*=_NUM_:: - abort after _NUM_ failures - -*-w*:: -*--warn*=_NAME_:: - enable warnings - -*-d*:: -*--durations*=[_yes_|_no_]:: - show test durations - -*-f*:: -*--input-file*=_FILE_:: - load test names to run from a file - -*-#*:: -*--filenames-as-tags*:: - adds a tag for the filename - -*-c*:: -*--section*=_NAME_:: - specify section to run - -*--list-test-names-only*:: - list all/matching test cases names only - -*--list-reporters*:: - list all reporters - -*--order*=[_decl_|_lex_|_rand_]:: - test case order (defaults to _decl_) - -*--rng-seed*=[_time_|_NUM_]:: - set a specific seed for random numbers - -*--force-colour*:: - force colourised output (deprecated) - -*--use-colour*=[_yes_|_no_]:: - should output be colourised - -== Description - -When run without options or arguments, *vg test* runs all unit tests compiled into the *vg* binary. - -Particular tests can be selected by name, by pattern match on name, or by tag (between _[_ and _]_), by specifying the selectors as arguments. If multiple selectors are specified, only tests matching all of the selectors will be run. - -The tool supports all options provided by the Catch 1.x testing framework. - -See https://github.com/catchorg/Catch2/blob/Catch1.x/docs/command-line.md for more information on Catch's available options. - -== Examples - -To run all tests: - ----- -vg test ----- - -To see all available test tags: - ----- -vg test --list-tags ----- - -To run only tests tagged with _[a-star]_: - ----- -vg test [a-star] ----- - -== See Also -*vg*(1) - -== Copyright - -Copyright (C) 2020 {author}. - -Free use of this documentation is granted under the terms of the MIT License. diff --git a/doc/asciidoc/man/vg-version.adoc b/doc/asciidoc/man/vg-version.adoc deleted file mode 100644 index 50032b2406a..00000000000 --- a/doc/asciidoc/man/vg-version.adoc +++ /dev/null @@ -1,67 +0,0 @@ -= vg-version(1) -vgteam contributors -v1.20.0 -:doctype: manpage -:manmanual: vg -:mansource: vg -:man-linkstyle: pass:[blue R < >] - -== Name - -vg-version - get version and build information about vg - -== Synopsis - -*vg version* [_OPTION_]... - -== Options - -*-s*:: -*--slug*:: - Print only the one-line, whitespace-free version string (e.g. _v1.20.0-70-g472e24c9c_), for use in scripts. - -*-h*:: -*--help*:: - Print help about the *vg version* command and its supported options. 
- -== Description - -When run without options, *vg version* outputs information about the version of *vg* that is running, including: - -* The most recent released version on which your *vg* is based -* The number of commits since that version (if not itself a released version) -* The Git commit hash (if not itself a released version) -* The compiler that was used to build *vg* -* The OS that was used to build *vg* -* The C++ standard library that *vg* was linked against -* The user name and host name that built *vg* - -When run with the *-s* option, *vg version* prints just the release and Git commit information. - -== Examples - -To print all version information (human-readable): - ----- -$ vg version -vg version v1.20.0-70-g472e24c9c "Ginestra" -Compiled with g++ (GCC) 8.1.0 on Linux -Linked against libstd++ 20180502 -Built by anovak@courtyard ----- - -To print just the short "`version slug`": - ----- -$ vg version -s -v1.20.0-70-g472e24c9c ----- - -== See Also -*vg*(1), *git*(1) - -== Copyright - -Copyright (C) 2019 {author}. - -Free use of this documentation is granted under the terms of the MIT License. diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py old mode 100644 new mode 100755 index b4a99931448..625c5e2a0b5 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import subprocess diff --git a/doc/wiki b/doc/wiki index f28a1e56005..9c76a661ee7 160000 --- a/doc/wiki +++ b/doc/wiki @@ -1 +1 @@ -Subproject commit f28a1e56005c729cf5c2dad6a251447bedba2949 +Subproject commit 9c76a661ee7615f672aec99e08b55d3d41456f68 From fe2a324b8ad6267b275d1820e843e7d6adba502e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 2 Dec 2024 12:59:01 -0500 Subject: [PATCH 09/14] At least try to deliver a man-format manpage --- Brewfile | 1 + Dockerfile | 2 +- Makefile | 7 +++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Brewfile b/Brewfile index f730aae8bee..33431d1aa07 100644 --- a/Brewfile +++ b/Brewfile @@ -20,3 +20,4 @@ brew "autoconf" brew "cmake" brew "boost" brew "pybind11" +brew "pandoc" diff --git a/Dockerfile b/Dockerfile index cf4279d6045..c430c745eab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -qq -y install \ samtools curl unzip redland-utils librdf-dev cmake pkg-config wget gtk-doc-tools \ raptor2-utils rasqal-utils bison flex gawk libgoogle-perftools-dev liblz4-dev liblzma-dev \ libcairo2-dev libpixman-1-dev libffi-dev libcairo-dev libprotobuf-dev libboost-all-dev \ - tabix bcftools libzstd-dev pybind11-dev python3-pybind11 + tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc ###DEPS_END### # Prepare to build submodule dependencies diff --git a/Makefile b/Makefile index 2c81ff30944..f4ca8ace626 100644 --- a/Makefile +++ b/Makefile @@ -544,10 +544,13 @@ docs: $(SRC_DIR)/*.cpp $(SRC_DIR)/*.hpp $(ALGORITHMS_SRC_DIR)/*.cpp $(ALGORITHMS doxygen echo "View documentation at: file://$(PWD)/doc/doxygen/index.html" -man: doc/wiki/vg-manpage.md +man: doc/wiki/vg-manpage.md doc/man/vg.1 doc/wiki/vg-manpage.md: $(BIN_DIR)/$(EXE) doc/vgmanmd.desc.md doc/vgmanmd.py - cd doc && ./vgmanmd.py > wiki/vg-manpage.md.tmp && mv wiki/vg-manpage.md.tmp wiki/vg-manpage.md + cd doc && ./vgmanmd.py > $@.tmp && mv $@.tmp $@ + +doc/man/vg.1: doc/wiki/vg-manpage.md + mkdir -p doc/man && pandoc --standalone --to man $< -o $@ # Hack to use gshuf or shuf as appropriate to the platform when testing $(BIN_DIR)/shuf: From 63e129f4aac2a87c671e69176c655425eb3a1219 Mon Sep 17 00:00:00 2001 From: 
Xian Date: Thu, 19 Dec 2024 16:59:39 +0100 Subject: [PATCH 10/14] Make manpage markdown build --- Makefile | 2 +- doc/vgmanmd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index f4ca8ace626..854a20de5bb 100644 --- a/Makefile +++ b/Makefile @@ -547,7 +547,7 @@ docs: $(SRC_DIR)/*.cpp $(SRC_DIR)/*.hpp $(ALGORITHMS_SRC_DIR)/*.cpp $(ALGORITHMS man: doc/wiki/vg-manpage.md doc/man/vg.1 doc/wiki/vg-manpage.md: $(BIN_DIR)/$(EXE) doc/vgmanmd.desc.md doc/vgmanmd.py - cd doc && ./vgmanmd.py > $@.tmp && mv $@.tmp $@ + ./doc/vgmanmd.py > $@.tmp && mv $@.tmp $@ doc/man/vg.1: doc/wiki/vg-manpage.md mkdir -p doc/man && pandoc --standalone --to man $< -o $@ diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py index 625c5e2a0b5..fb186719a47 100755 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -11,7 +11,7 @@ cmds.sort() # parse short descriptions -desc_inf = open('vgmanmd.desc.md', 'rt') +desc_inf = open('./doc/vgmanmd.desc.md', 'rt') desc = {} cur_desc = '' cur_header = '' From 1c667d843771179d8d25a973db0663b1744e7f62 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 19 Dec 2024 17:46:48 +0100 Subject: [PATCH 11/14] Make man page look more like a man page --- doc/vgmanmd.desc.md | 88 +++++++++++++++++++++++++-------------------- doc/vgmanmd.py | 36 +++++++++++++++---- 2 files changed, 78 insertions(+), 46 deletions(-) diff --git a/doc/vgmanmd.desc.md b/doc/vgmanmd.desc.md index 3b769e9cf85..7f0383844fb 100644 --- a/doc/vgmanmd.desc.md +++ b/doc/vgmanmd.desc.md @@ -1,59 +1,62 @@ # file-info This file contains extra text that will be added to the man pages generated with doc/vgmanmd.py -The `# intro` section is added to the top of the page, and each `# subcommand` section will be added to given subcommand -When adding a new subcommand, add it to the appropriate section(s) in the intro +The `# description` section is added to the top of the page, and each `# subcommand` section will be added to given subcommand +When adding a new subcommand, add it to the appropriate section(s) in the description -# intro +# description -This is a redundant and incomplete list of subcommands of vg, organized by common uses. For a complete list of subcommands, run `vg help`. +vg is a toolkit for variation graph data structures, interchange formats, alignment, genotyping, and variant calling methods. For more in-depth explanations of tools and workflows, see the [vg wiki page](https://github.com/vgteam/vg/wiki) +# synopsis +This is an incomplete list of vg subcommands. For a complete list, run `vg help`. + - **Graph construction and indexing** See the [wiki page](https://github.com/vgteam/vg/wiki/Index-Types) for an overview of vg indexes. - - [`vg autoindex`](#autoindex) automatically construct a graph and indexes for a specific workflow (e.g. giraffe, rpvg). [wiki page](https://github.com/vgteam/vg/wiki/Automatic-indexing-for-read-mapping-and-downstream-inference) - - [`vg construct`](#construct) manually construct a graph from a reference and variants. [wiki page](https://github.com/vgteam/vg/wiki/Construction) - - [`vg index`](#index) manually build individual indexes (xg, distance, GCSA, etc). [wiki page](https://github.com/vgteam/vg/wiki/Index-Construction) - - [`vg gbwt`](#gbwt) manually build and manipulate GBWTs and indexes (GBWTgraph, GBZ, r-index). [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) - - [`vg minimizer`](#minimizer) manually build a minimizer index for mapping. - - [`vg haplotypes`](#haplotypes) haplotype sample a graph. 
Recommended for mapping with giraffe. [wiki page](https://github.com/vgteam/vg/wiki/Haplotype-Sampling) + - [`vg autoindex`](#autoindex): automatically construct a graph and indexes for a specific workflow (e.g. giraffe, rpvg). [wiki page](https://github.com/vgteam/vg/wiki/Automatic-indexing-for-read-mapping-and-downstream-inference) + - [`vg construct`](#construct): manually construct a graph from a reference and variants. [wiki page](https://github.com/vgteam/vg/wiki/Construction) + - [`vg index`](#index): manually build individual indexes (xg, distance, GCSA, etc). [wiki page](https://github.com/vgteam/vg/wiki/Index-Construction) + - [`vg gbwt`](#gbwt): manually build and manipulate GBWTs and indexes (GBWTgraph, GBZ, r-index). [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) + - [`vg minimizer`](#minimizer): manually build a minimizer index for mapping. + - [`vg haplotypes`](#haplotypes): haplotype sample a graph. Recommended for mapping with giraffe. [wiki page](https://github.com/vgteam/vg/wiki/Haplotype-Sampling) - **Read mapping** - - [`vg giraffe`](#giraffe) fast haplotype-aware short read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Mapping-short-reads-with-Giraffe) - - [`vg mpmap`](#mpmap) splice-aware multipath alignment of short reads. [wiki page](https://github.com/vgteam/vg/wiki/Multipath-alignments-and-vg-mpmap) - - [`vg map`](#map) MEM-based read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Working-with-a-whole-genome-variation-graph) + - [`vg giraffe`](#giraffe): fast haplotype-aware short read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Mapping-short-reads-with-Giraffe) + - [`vg mpmap`](#mpmap): splice-aware multipath alignment of short reads. [wiki page](https://github.com/vgteam/vg/wiki/Multipath-alignments-and-vg-mpmap) + - [`vg map`](#map): MEM-based read alignment. [wiki page](https://github.com/vgteam/vg/wiki/Working-with-a-whole-genome-variation-graph) - **Downstream analyses** - - [`vg pack`](#pack) convert alignments to a compact coverage index. Used with [vg call](#call) - - [`vg call`](#call) call or genotype VCF variants. Uses [vg pack](#pack). [wiki page](https://github.com/vgteam/vg/wiki/SV-Genotyping-and-variant-calling) - - [`vg rna`](#rna) construct splicing graphs and pantranscriptomes. [wiki page](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). Also see [rpvg](https://github.com/jonassibbesen/rpvg) - - [`vg deconstruct`](#deconstruct) create a VCF from variation in the graph. [wiki page](https://github.com/vgteam/vg/wiki/VCF-export-with-vg-deconstruct) + - [`vg pack`](#pack): convert alignments to a compact coverage index. Used with [vg call](#call) + - [`vg call`](#call): call or genotype VCF variants. Uses [vg pack](#pack). [wiki page](https://github.com/vgteam/vg/wiki/SV-Genotyping-and-variant-calling) + - [`vg rna`](#rna): construct splicing graphs and pantranscriptomes. [wiki page](https://github.com/vgteam/vg/wiki/Transcriptomic-analyses). Also see [rpvg](https://github.com/jonassibbesen/rpvg) + - [`vg deconstruct`](#deconstruct): create a VCF from variation in the graph. [wiki page](https://github.com/vgteam/vg/wiki/VCF-export-with-vg-deconstruct) - **Working with read alignments** - - [`vg gamsort`](#gamsort) sort a GAM/GAF file or index a sorted GAM file. - - [`vg filter`](#filter) filter alignments by properties. - - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). 
- - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). - - [`vg sim`](#sim) simulate reads from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Simulating-reads-with-vg-sim) + - [`vg gamsort`](#gamsort): sort a GAM/GAF file or index a sorted GAM file. + - [`vg filter`](#filter): filter alignments by properties. + - [`vg surject`](#surject): project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). + - [`vg inject`](#inject): project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). + - [`vg sim`](#sim): simulate reads from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Simulating-reads-with-vg-sim) - **Graph and read statistics** - - [`vg stats`](#stats) get stats about the graph. - - [`vg paths`](#paths) get stats about the paths. [wiki page](https://github.com/vgteam/vg/wiki/Path-Metadata-Model) - - [`vg gbwt`](#gbwt) get stats about a GBWT. - - [`vg filter`](#filter) get stats about alignments (use `--tsv-out`). + - [`vg stats`](#stats): get stats about the graph. + - [`vg paths`](#paths): get stats about the paths. [wiki page](https://github.com/vgteam/vg/wiki/Path-Metadata-Model) + - [`vg gbwt`](#gbwt): get stats about a GBWT. + - [`vg filter`](#filter): get stats about alignments (use `--tsv-out`). - **Manipulating a graph** - - [`vg mod`](#mod) filter, transform, and edit the graph. - - [`vg prune`](#prune) prune the graph for GCSA2 indexing. - - [`vg ids`](#ids) manipulate graph node ids. - - [`vg paths`](#paths) manipulate paths in a graph. - - [`vg gbwt`](#gbwt) manipulate GBWTs and associated indexes. [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) - - [`vg annotate`](#annotate) annotate a graph or alignments. + - [`vg mod`](#mod): filter, transform, and edit the graph. + - [`vg prune`](#prune): prune the graph for GCSA2 indexing. + - [`vg ids`](#ids): manipulate graph node ids. + - [`vg paths`](#paths): manipulate paths in a graph. + - [`vg gbwt`](#gbwt): manipulate GBWTs and associated indexes. [wiki page](https://github.com/vgteam/vg/wiki/VG-GBWT-Subcommand) + - [`vg annotate`](#annotate): annotate a graph or alignments. - **Conversion between formats** - - [`vg convert`](#convert) convert between handle graph formats and GFA, and between alignment formats. - - [`vg view`](#view) convert between non-handle graph formats and alignment formats (dot, json, turtle...). - - [`vg surject`](#surject) project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). - - [`vg inject`](#inject) project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). - - [`vg paths`](#paths) extract a fasta from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Extracting-a-FASTA-from-a-Graph) + - [`vg convert`](#convert): convert between handle graph formats and GFA, and between alignment formats. + - [`vg view`](#view): convert between non-handle graph formats and alignment formats (dot, json, turtle...). + - [`vg surject`](#surject): project alignments on a graph onto a linear reference (gam/gaf->bam/sam/cram). + - [`vg inject`](#inject): project alignments on a linear reference onto a graph (bam/sam/cram->gam/gaf). + - [`vg paths`](#paths): extract a fasta from a graph. [wiki page](https://github.com/vgteam/vg/wiki/Extracting-a-FASTA-from-a-Graph) - **Subgraph extraction** - - [`vg chunk`](#chunk) split a graph and/or alignment into smaller chunks. - - [`vg find`](#find) use an index to find nodes, edges, kmers, paths, or positions. 
+ - [`vg chunk`](#chunk): split a graph and/or alignment into smaller chunks. + - [`vg find`](#find): use an index to find nodes, edges, kmers, paths, or positions. # annotate @@ -90,3 +93,10 @@ format conversions for graphs and alignments # filter Filter alignments by properties. + +# bugs + +Bugs can be reported at: https://github.com/vgteam/vg/issues + +For technical support, please visit: https://www.biostars.org/tag/vg/ + diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py index fb186719a47..38c7ef6e002 100755 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -10,6 +10,7 @@ 'gamsort', 'inject', 'surject', 'mod', 'prune', 'ids', 'sim', 'annotate'] cmds.sort() + # parse short descriptions desc_inf = open('./doc/vgmanmd.desc.md', 'rt') desc = {} @@ -34,11 +35,23 @@ ret = subprocess.run(['vg', 'version'], capture_output=True) vg_v = ret.stdout.decode().split('\n')[0] -print('\n*Automatically made for ' + vg_v + '.*\n\n') +print("% vg() | Variation Graph Toolkit\n\n") + +#Start with the name +print("NAME") +print("====") +print('vg - variation graph tool, ' + vg_v + '.\n\n') + +if 'description' in desc: + print("DESCRIPTION") + print("====") + print(desc['description']) + print('\n\n') -# add intro text -if 'intro' in desc: - print(desc['intro']) +if 'synopsis' in desc: + print("SYNOPSIS") + print("====") + print(desc['synopsis']) print('\n\n') # table of contents @@ -47,14 +60,23 @@ # #print('\n\n') +print("COMMANDS") +print("====") + # help for each cmd for cmd in cmds: print('## {cmd}\n\n'.format(cmd=cmd)) - if cmd in desc: - print(desc[cmd]) - print('\n\n') # run subcommand with -h ret = subprocess.run(['vg', cmd, '-h'], capture_output=True) print('```') + if cmd in desc: + print(desc[cmd]) + print('\n\n') print(ret.stderr.decode()) print('```\n\n') + +if 'bugs' in desc: + print("BUGS") + print("====") + print(desc['bugs']) + print('\n\n') From 491d32a5bae9fc859dc8922fbe990546fc17f5f5 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 19 Dec 2024 17:51:05 +0100 Subject: [PATCH 12/14] Take out -h from subcommands since some don't work --- doc/vgmanmd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py index 38c7ef6e002..20b04a6bd1f 100755 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -66,8 +66,8 @@ # help for each cmd for cmd in cmds: print('## {cmd}\n\n'.format(cmd=cmd)) - # run subcommand with -h - ret = subprocess.run(['vg', cmd, '-h'], capture_output=True) + # run subcommand without -h because not everything has -h + ret = subprocess.run(['vg', cmd], capture_output=True) print('```') if cmd in desc: print(desc[cmd]) From ad1b14ef363f5f592f852d6e1dd70ee9deac28c7 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 19 Dec 2024 17:58:30 +0100 Subject: [PATCH 13/14] I think everything goes on page 1 of the manpage? 
--- doc/vgmanmd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/vgmanmd.py b/doc/vgmanmd.py index 20b04a6bd1f..1db0aa4ecd2 100755 --- a/doc/vgmanmd.py +++ b/doc/vgmanmd.py @@ -35,7 +35,8 @@ ret = subprocess.run(['vg', 'version'], capture_output=True) vg_v = ret.stdout.decode().split('\n')[0] -print("% vg() | Variation Graph Toolkit\n\n") +#Metadata +print("% vg(1) | Variation Graph Toolkit\n\n") #Start with the name print("NAME") From ccf4277f907c50cdcff11cd56930ea08f6e031d2 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 19 Dec 2024 18:23:17 +0100 Subject: [PATCH 14/14] Make two md man pages, one for the wiki and one for the actual man page --- Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 854a20de5bb..237c72a0c1d 100644 --- a/Makefile +++ b/Makefile @@ -546,11 +546,15 @@ docs: $(SRC_DIR)/*.cpp $(SRC_DIR)/*.hpp $(ALGORITHMS_SRC_DIR)/*.cpp $(ALGORITHMS man: doc/wiki/vg-manpage.md doc/man/vg.1 -doc/wiki/vg-manpage.md: $(BIN_DIR)/$(EXE) doc/vgmanmd.desc.md doc/vgmanmd.py - ./doc/vgmanmd.py > $@.tmp && mv $@.tmp $@ +#The manpage markdown has an extra line needed for the actual manpage format +doc/man/vg-manpage.md: $(BIN_DIR)/$(EXE) doc/vgmanmd.desc.md doc/vgmanmd.py + mkdir -p doc/man && ./doc/vgmanmd.py > $@.tmp && mv $@.tmp $@ -doc/man/vg.1: doc/wiki/vg-manpage.md - mkdir -p doc/man && pandoc --standalone --to man $< -o $@ +doc/wiki/vg-manpage.md: doc/man/vg-manpage.md + sed 1d doc/man/vg-manpage.md > $@ + +doc/man/vg.1: doc/man/vg-manpage.md + pandoc --standalone --to man $< -o $@ # Hack to use gshuf or shuf as appropriate to the platform when testing $(BIN_DIR)/shuf: