diff --git a/README.md b/README.md
index 44b0b64..29fed10 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,25 @@ With the MAP graph, we can use the "principal bundle decomposition" to study com
 
 ## Documentation, Usage and Examples
 
+Command Line Tools:
+
+PGR-TK provides the following tools to
+
+- create the PGR-TK sequence and index database
+  - `pgr-mdb`: create pgr minimizer database with AGC backend
+  - `pgr-make-frgdb`: create PGR-TK fragment minimizer database with frg format backend
+- query the database to fetch sequences
+  - `pgr-query`: query a PGR-TK pangenome sequence database, output the hit summary and generate fasta files from the target sequences
+- generate MAP-graph in GFA format and principal bundle decomposition bed file
+  - `pgr-pbundle-decomp`: generate the principal bundle decomposition through the MAP graph from a fasta file
+- generate SVG from the principal bundle decomposition bed file
+  - `pgr-pbundle-bed2svg`: generate SVG from a principal bundle bed file
+- auxiliary tools
+  - `pgr-pbundle-bed2sorted`: generate an annotation file with a sorting order from the principal bundle decomposition
+  - `pgr-pbundle-bed2dist`: generate alignment scores between sequences using bundle decomposition from a principal bundle bed file
+
+For each command, `command --help` provides detailed usage information.
+
 The API documentation is at https://sema4-research.github.io/pgr-tk/
 
 A collection of Jupyter Notebooks are at https://github.com/sema4-Research/pgr-tk-notebooks/
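The list added above is ordered as a typical pipeline: build the sequence and index database with `pgr-mdb` (AGC backend) or `pgr-make-frgdb` (frg backend), pull matching target sequences out of it with `pgr-query`, run `pgr-pbundle-decomp` on a fasta file to get the MAP-graph GFA and the principal bundle bed file, and render that bed file with `pgr-pbundle-bed2svg`, using `pgr-pbundle-bed2sorted` and `pgr-pbundle-bed2dist` to prepare track ordering and pairwise scores for the plot. The exact flags are not repeated here; `command --help` for each tool is the authoritative reference.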
diff --git a/pgr-bin/Cargo.toml b/pgr-bin/Cargo.toml
index afcb842..3f67653 100644
--- a/pgr-bin/Cargo.toml
+++ b/pgr-bin/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pgr-bin"
-version = "0.4.0-dev"
+version = "0.4.0"
 edition = "2021"
 authors = ["Jason Chin "]
 
diff --git a/pgr-bin/src/bin/pgr-fetch-seqs.rs b/pgr-bin/src/bin/pgr-fetch-seqs.rs
index 9e69201..28fc76d 100644
--- a/pgr-bin/src/bin/pgr-fetch-seqs.rs
+++ b/pgr-bin/src/bin/pgr-fetch-seqs.rs
@@ -6,22 +6,28 @@ use std::fs::File;
 use std::io::{self, BufRead, BufReader, BufWriter, Write};
 use std::path::Path;
 
+/// List or fetch sequences from a PGR-TK database
 #[derive(Parser, Debug)]
 #[clap(name = "pgr-fetch-seqs")]
 #[clap(author, version)]
-#[clap(about = "list or fetch sequences from a pgr database", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
+    /// the prefix to a PGR-TK sequence database
     pgr_db_prefix: String,
 
+    /// using the frg format for the sequence database (defaults to the AGC backend database if not specified)
     #[clap(long, default_value_t = false)]
     frg_file: bool,
 
+    /// the regions file path
     #[clap(short, long, default_value=None)]
     region_file: Option,
 
+    /// the output file name
     #[clap(short, long, default_value=None)]
     output_file: Option,
 
+    /// list all sequence sources and contig names in the database
     #[clap(long, default_value_t = false)]
     list: bool,
 }
diff --git a/pgr-bin/src/bin/pgr-make-fragdb.rs b/pgr-bin/src/bin/pgr-make-frgdb.rs
similarity index 90%
rename from pgr-bin/src/bin/pgr-make-fragdb.rs
rename to pgr-bin/src/bin/pgr-make-frgdb.rs
index a5a5628..028ad4c 100644
--- a/pgr-bin/src/bin/pgr-make-fragdb.rs
+++ b/pgr-bin/src/bin/pgr-make-frgdb.rs
@@ -8,12 +8,13 @@ use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::path::Path;
 
+/// Create PGR-TK fragment minimizer database with frg format backend
 #[derive(Parser, Debug)]
-#[clap(name = "pgr-mdb")]
+#[clap(name = "pgr-make-frgdb")]
 #[clap(author, version)]
-#[clap(about = "create pgr fragment minimizer db", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
-    // file contains the paths to the fastx files to load
+    /// the path to the file that contains the paths to the fastx files to load
     filepath: String,
     prefix: String,
     /// minimizer window size
diff --git a/pgr-bin/src/bin/pgr-mdb.rs b/pgr-bin/src/bin/pgr-mdb.rs
index 4999ecf..ae5ae3f 100644
--- a/pgr-bin/src/bin/pgr-mdb.rs
+++ b/pgr-bin/src/bin/pgr-mdb.rs
@@ -10,10 +10,11 @@ use std::io::{BufRead, BufReader};
 
 use pgr_db::seq_db;
 
+/// Create pgr minimizer database with AGC backend
 #[derive(Parser, Debug)]
 #[clap(name = "pgr-mdb")]
 #[clap(author, version)]
-#[clap(about = "create pgr minimizer db", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
     filepath: String,
     prefix: String,
diff --git a/pgr-bin/src/bin/pgr-pbundle-bed2dist.rs b/pgr-bin/src/bin/pgr-pbundle-bed2dist.rs
index ff3aa18..23aec5c 100644
--- a/pgr-bin/src/bin/pgr-pbundle-bed2dist.rs
+++ b/pgr-bin/src/bin/pgr-pbundle-bed2dist.rs
@@ -6,12 +6,15 @@ use std::io::{BufRead, BufReader, BufWriter, Write};
 use std::path::Path;
 use std::{fs::File, path};
 
+/// Generate alignment scores between sequences using bundle decomposition from a principal bundle bed file
 #[derive(Parser, Debug)]
 #[clap(name = "pgr-pbundle-bed2dist")]
 #[clap(author, version)]
-#[clap(about = "generate alignment scores between contigs using bundle decomposition from a principal bundle bed file", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
+    /// the path to the principal bundle bed file
     bed_file_path: String,
+    /// the prefix of the output file
     output_prefix: String,
 }
diff --git a/pgr-bin/src/bin/pgr-pbundle-bed2sorted.rs b/pgr-bin/src/bin/pgr-pbundle-bed2sorted.rs
index 68a8492..8bfd758 100644
--- a/pgr-bin/src/bin/pgr-pbundle-bed2sorted.rs
+++ b/pgr-bin/src/bin/pgr-pbundle-bed2sorted.rs
@@ -5,12 +5,15 @@ use std::io::{BufRead, BufReader, BufWriter, Write};
 use std::path::Path;
 use std::{fs::File, path};
 
+/// Generate an annotation file with a sorting order from the principal bundle decomposition
 #[derive(Parser, Debug)]
 #[clap(name = "pgr-pbundle-bed2sorted")]
 #[clap(author, version)]
-#[clap(about = "sort the contig by bunldes", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
+    /// the path to the principal bundle bed file
     bed_file_path: String,
+    /// the prefix of the output file
     output_prefix: String,
 }
diff --git a/pgr-bin/src/bin/pgr-pbundle-bed2svg.rs b/pgr-bin/src/bin/pgr-pbundle-bed2svg.rs
index 0754d5a..1bc9305 100644
--- a/pgr-bin/src/bin/pgr-pbundle-bed2svg.rs
+++ b/pgr-bin/src/bin/pgr-pbundle-bed2svg.rs
@@ -6,33 +6,49 @@ use std::{fs::File, path};
 use svg::node::{self, element, Node};
 use svg::Document;
 
+/// Generate SVG from a principal bundle bed file
 #[derive(Parser, Debug)]
 #[clap(name = "pgr-pbundle-bed2svg")]
 #[clap(author, version)]
-#[clap(about = "generate SVG from a principal bundle bed file", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
+    /// the path to the principal bundle bed file
     bed_file_path: String,
+    /// the prefix of the output file
     output_prefix: String,
+    /// the prefix of the dendrogram file generated by pgr-pbundle-bed2dist
     #[clap(long)]
     ddg_file: Option,
+    /// the path to the annotation file
     #[clap(long)]
     annotations: Option,
+    /// the path to the annotation track file
     #[clap(long)]
     annotation_region_bedfile: Option,
+    /// the track range in base pair count
     #[clap(long, default_value_t = 100000)]
     track_range: usize,
+    /// the track tick interval
     #[clap(long, default_value_t = 10000)]
     track_tick_interval: usize,
+    /// the track panel width in pixels
     #[clap(long, default_value_t = 1600)]
     track_panel_width: usize,
+    /// the left padding in pixels
     #[clap(long)]
     left_padding: Option,
+    /// the stroke boundary width
     #[clap(long, default_value_t = 0.5)]
     stroke_width: f32,
+    /// the stroke width for the annotation track
     #[clap(long, default_value_t = 2.5)]
     annotation_region_stroke_width: f32,
+    /// the annotation panel width
     #[clap(long, default_value_t = 500.0)]
     annotation_panel_width: f32,
+    /// the factor to increase the boundary width for highlighting repetitive bundles
+    #[clap(long, default_value_t = 1.0)]
+    highlight_repeats: f32,
 }
 
 static CMAP: [&str; 97] = [
@@ -237,6 +253,11 @@ fn main() -> Result<(), std::io::Error> {
     let ctg_with_svg_paths: Vec<(String, (Vec, Vec, element::Text))> = ctg_data_vec
         .into_iter()
         .map(|(ctg, annotation,bundle_segment, annotation_segments)| {
+            let mut bundle_segment_count = FxHashMap::::default();
+            bundle_segment.iter().for_each(|&(_bgn, _end, bundle_id, _direction)| {
+                let e = bundle_segment_count.entry(bundle_id).or_insert_with(|| 0);
+                *e += 1;
+            });
 
             let paths: Vec = bundle_segment
                 .into_iter()
@@ -250,24 +271,26 @@ fn main() -> Result<(), std::io::Error> {
                     let bundle_color = CMAP[((bundle_id * 17) % 97) as usize];
                     let stroke_color = CMAP[((bundle_id * 47) % 43) as usize];
                     let arror_end = end as f32;
+                    let halfwidth = 5.0;
                     let end = if direction == 0 {
-                        if end as f32 - 5.0 < bgn {
+                        if end as f32 - halfwidth < bgn {
                             bgn
                         } else {
-                            end as f32 - 5.0
+                            end as f32 - halfwidth
                         }
-                    } else if end as f32 + 5.0 > bgn {
+                    } else if end as f32 + halfwidth > bgn {
                         bgn
                     } else {
-                        end as f32 + 5.0
+                        end as f32 + halfwidth
                     };
-                    let bottom0 = -3_i32 + y_offset as i32;
-                    let top0 = 3_i32 + y_offset as i32;
-                    let bottom1 = -4_i32 + y_offset as i32;
-                    let top1 = 4_i32 + y_offset as i32;
-                    let center = y_offset as i32;
-
+                    let bottom0 = -halfwidth * 0.6 + y_offset as f32;
+                    let top0 = halfwidth * 0.6 + y_offset as f32;
+                    let bottom1 = -halfwidth * 0.8 + y_offset as f32;
+                    let top1 = halfwidth * 0.8 + y_offset as f32;
+                    let center = y_offset as f32;
+                    let stroke_width = stroke_width * (if args.highlight_repeats > 1.0001
+                        && *bundle_segment_count.get(&bundle_id).unwrap_or(&0) > 1 { args.highlight_repeats } else { 1.0 });
                     let path_str = format!(
                         "M {bgn} {bottom0} L {bgn} {top0} L {end} {top0} L {end} {top1} L {arror_end} {center} L {end} {bottom1} L {end} {bottom0} Z");
                     element::Path::new()
@@ -398,7 +421,9 @@ fn main() -> Result<(), std::io::Error> {
         // println!("{}", ctg);
         document.append(text);
         paths.into_iter().for_each(|path| document.append(path));
-        annotation_paths.into_iter().for_each(|path| document.append(path));
+        annotation_paths
+            .into_iter()
+            .for_each(|path| document.append(path));
     });
     let out_path = path::Path::new(&args.output_prefix).with_extension("svg");
     svg::save(out_path, &document).unwrap();
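The new `--highlight-repeats` flag above widens only the stroke of bundle segments whose bundle id occurs more than once on the same contig, and only when the factor is greater than 1. A minimal sketch of that rule, with `std::collections::HashMap` standing in for the tool's `FxHashMap` and a simplified segment tuple (both are assumptions for illustration, not the tool's own types):

```rust
use std::collections::HashMap;

/// A simplified bundle segment: (begin, end, bundle_id, direction).
type Segment = (usize, usize, usize, u8);

/// Return the stroke width for each segment, widening the boundary of
/// bundles that occur more than once on the contig.
fn stroke_widths(segments: &[Segment], base_width: f32, highlight_repeats: f32) -> Vec<f32> {
    // Count how many segments each bundle id contributes on this contig.
    let mut bundle_segment_count: HashMap<usize, usize> = HashMap::new();
    for &(_bgn, _end, bundle_id, _direction) in segments {
        *bundle_segment_count.entry(bundle_id).or_insert(0) += 1;
    }
    segments
        .iter()
        .map(|&(_, _, bundle_id, _)| {
            // Widen only when the factor is > 1 and the bundle repeats.
            if highlight_repeats > 1.0001 && bundle_segment_count[&bundle_id] > 1 {
                base_width * highlight_repeats
            } else {
                base_width
            }
        })
        .collect()
}

fn main() {
    // Hypothetical segments: bundle 7 appears twice, bundle 3 once.
    let segments = vec![(0, 100, 7, 0), (120, 180, 3, 0), (200, 320, 7, 1)];
    println!("{:?}", stroke_widths(&segments, 0.5, 2.0)); // [1.0, 0.5, 1.0]
}
```

In the tool itself the factor defaults to 1.0, so rendered plots are unchanged unless `--highlight-repeats` is raised above 1.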
diff --git a/pgr-bin/src/bin/pgr-pbundle-decomp.rs b/pgr-bin/src/bin/pgr-pbundle-decomp.rs
index 858c8d3..1732220 100644
--- a/pgr-bin/src/bin/pgr-pbundle-decomp.rs
+++ b/pgr-bin/src/bin/pgr-pbundle-decomp.rs
@@ -10,29 +10,41 @@ use std::{
     path::Path,
 };
 
+/// Generate the principal bundle decomposition through the MAP graph from a fasta file
 #[derive(Parser, Debug)]
 #[clap(name = "pgr-pbundle-decomp")]
 #[clap(author, version)]
-#[clap(about = "take a fasta file and output the principal bundle decomposition though MAP Graph", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
+    /// the path to the input fasta file
     fastx_path: String,
+    /// the prefix of the output files
    output_prefix: String,
+    /// the path to the file that contains a list of contig names to be analyzed
     #[clap(long, short, default_value = None)]
     include: Option,
+    /// the SHIMMER parameter w
     #[clap(short, default_value_t = 48)]
     w: u32,
+    /// the SHIMMER parameter k
     #[clap(short, default_value_t = 56)]
     k: u32,
+    /// the SHIMMER parameter r
     #[clap(short, default_value_t = 4)]
     r: u32,
+    /// the SHIMMER parameter minimum span length
     #[clap(long, default_value_t = 12)]
     min_span: u32,
+    /// minimum coverage to be included in principal bundles
     #[clap(long, default_value_t = 0)]
     min_cov: usize,
+    /// the minimum branch length to be included in the principal bundles
     #[clap(long, default_value_t = 8)]
     min_branch_size: usize,
+    /// the minimum local projected bundle size to include
     #[clap(long, default_value_t = 2500)]
     bundle_length_cutoff: usize,
+    /// merge two bundles with the same id within the specified distance
     #[clap(long, default_value_t = 10000)]
     bundle_merge_distance: usize,
 }
@@ -191,6 +203,10 @@ fn main() -> Result<(), std::io::Error> {
     let mut outpu_bed_file =
         BufWriter::new(File::create(output_prefix_path.with_extension("bed"))?);
 
+    let mut output_ctg_summary_file = BufWriter::new(File::create(
+        output_prefix_path.with_extension("ctg.summary.tsv"),
+    )?);
+
     writeln!(outpu_bed_file, "# cmd: {}", cmd_string).expect("bed file write error");
 
     let mut seq_info = seq_index_db
@@ -202,7 +218,10 @@ fn main() -> Result<(), std::io::Error> {
 
     seq_info.sort_by_key(|k| k.1 .0.clone());
 
-    seq_info.into_iter().for_each(|(sid, sdata)| {
+    let mut repeat_count = FxHashMap::>::default();
+    let mut non_repeat_count = FxHashMap::>::default();
+
+    seq_info.iter().for_each(|(sid, sdata)| {
         let (ctg, _src, _len) = sdata;
         let smps = sid_smps.get(&sid).unwrap();
         let smp_partitions = group_smps_by_principle_bundle_id(
@@ -210,14 +229,33 @@ fn main() -> Result<(), std::io::Error> {
             args.bundle_length_cutoff,
             args.bundle_merge_distance,
         );
+        let mut ctg_bundle_count = FxHashMap::::default();
+        smp_partitions.iter().for_each(|p| {
+            let bid = p[0].1;
+            *ctg_bundle_count.entry(bid).or_insert_with(|| 0) += 1;
+        });
         smp_partitions.into_iter().for_each(|p| {
             let b = p[0].0 .2;
             let e = p[p.len() - 1].0 .3 + args.k;
             let bid = p[0].1;
             let direction = p[0].2;
+            let is_repeat;
+            if *ctg_bundle_count.get(&bid).unwrap_or(&0) > 1 {
+                repeat_count
+                    .entry(*sid)
+                    .or_insert_with(|| vec![])
+                    .push(e - b - args.k);
+                is_repeat = "R";
+            } else {
+                non_repeat_count
+                    .entry(*sid)
+                    .or_insert_with(|| vec![])
+                    .push(e - b - args.k);
+                is_repeat = "U";
+            }
             let _ = writeln!(
                 outpu_bed_file,
-                "{}\t{}\t{}\t{}:{}:{}:{}:{}",
+                "{}\t{}\t{}\t{}:{}:{}:{}:{}:{}",
                 ctg,
                 b,
                 e,
@@ -225,9 +263,116 @@ fn main() -> Result<(), std::io::Error> {
                 bid_to_size[&bid],
                 direction,
                 p[0].3,
-                p[p.len() - 1].3
+                p[p.len() - 1].3,
+                is_repeat
             );
         });
     });
+    let _ = writeln!(
+        output_ctg_summary_file,
+        "#{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
+        "ctg",
+        "length",
+        "repeat_bundle_count",
+        "repeat_bundle_sum",
+        "repeat_bundle_percentage",
+        "repeat_bundle_mean",
+        "repeat_bundle_min",
+        "repeat_bundle_max",
+        "non_repeat_bundle_count",
+        "non_repeat_bundle_sum",
+        "non_repeat_bundle_percentage",
+        "non_repeat_bundle_mean",
+        "non_repeat_bundle_min",
+        "non_repeat_bundle_max",
+        "total_bundle_count",
+        "total_bundle_coverage_percentage"
+    );
+    seq_info.into_iter().for_each(|(sid, sdata)| {
+        let (ctg, _src, len) = sdata;
+        let repeat_bundle_count = repeat_count.get(&sid).unwrap_or(&vec![]).len();
+        let non_repeat_bundle_count = non_repeat_count.get(&sid).unwrap_or(&vec![]).len();
+        let repeat_sum = repeat_count
+            .get(&sid)
+            .unwrap_or(&vec![])
+            .iter()
+            .fold(0, |acc, x| acc + x);
+        let repeat_bundle_max = repeat_count
+            .get(&sid)
+            .unwrap_or(&vec![])
+            .into_iter()
+            .fold(0, |x, &y| if x > y { x } else { y });
+        let repeat_bundle_min = repeat_count
+            .get(&sid)
+            .unwrap_or(&vec![])
+            .into_iter()
+            .fold(len, |x, &y| if x < y { x } else { y });
+        let non_repeat_sum = non_repeat_count
+            .get(&sid)
+            .unwrap_or(&vec![])
+            .into_iter()
+            .fold(0, |acc, x| acc + x);
+        let non_repeat_bundle_max = non_repeat_count
+            .get(&sid)
+            .unwrap_or(&vec![])
+            .into_iter()
+            .fold(0, |x, &y| if x > y { x } else { y });
+        let non_repeat_bundle_min = non_repeat_count
+            .get(&sid)
+            .unwrap_or(&vec![])
+            .into_iter()
+            .fold(len, |x, &y| if x < y { x } else { y });
+        let repeat_bundle_mean = if repeat_bundle_count > 0 {
+            format!("{}", repeat_sum as f32 / repeat_bundle_count as f32)
+        } else {
+            "NA".to_string()
+        };
+        let non_repeat_bundle_mean = if non_repeat_bundle_count > 0 {
+            format!("{}", non_repeat_sum as f32 / non_repeat_bundle_count as f32)
+        } else {
+            "NA".to_string()
+        };
+        let repeat_bundle_min = if repeat_bundle_count > 0 {
+            format!("{}", repeat_bundle_min)
+        } else {
+            "NA".to_string()
+        };
+        let repeat_bundle_max = if repeat_bundle_count > 0 {
+            format!("{}", repeat_bundle_max)
+        } else {
+            "NA".to_string()
+        };
+        let non_repeat_bundle_min = if non_repeat_bundle_count > 0 {
+            format!("{}", non_repeat_bundle_min)
+        } else {
+            "NA".to_string()
+        };
+        let non_repeat_bundle_max = if non_repeat_bundle_count > 0 {
+            format!("{}", non_repeat_bundle_max)
+        } else {
+            "NA".to_string()
+        };
+
+        let _ = writeln!(
+            output_ctg_summary_file,
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
+            ctg,
+            len,
+            repeat_bundle_count,
+            repeat_sum,
+            100.0 * repeat_sum as f32 / len as f32,
+            repeat_bundle_mean,
+            repeat_bundle_min,
+            repeat_bundle_max,
+            non_repeat_bundle_count,
+            non_repeat_sum,
+            100.0 * non_repeat_sum as f32 / len as f32,
+            non_repeat_bundle_mean,
+            non_repeat_bundle_min,
+            non_repeat_bundle_max,
+            repeat_bundle_count + non_repeat_bundle_count,
+            100.0 * (repeat_sum + non_repeat_sum) as f32 / len as f32,
+        );
+    });
     Ok(())
 }
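The `ctg.summary.tsv` report added above reduces, for each contig, the lengths of its repeat-classified (`R`) and unique-classified (`U`) bundle segments to a count, sum, coverage percentage, and mean/min/max, printing `NA` for the mean/min/max of an empty class. A minimal sketch of that per-class reduction (the function name and tuple layout here are illustrative, not taken from the tool):

```rust
/// Summarize one class of bundle segment lengths on a contig, using the
/// same NA convention as the ctg.summary.tsv report for empty classes.
fn summarize(lengths: &[usize], ctg_len: usize) -> (usize, usize, f32, String, String, String) {
    let count = lengths.len();
    let sum: usize = lengths.iter().sum();
    let percent = 100.0 * sum as f32 / ctg_len as f32;
    let (mean, min, max) = if count > 0 {
        (
            format!("{}", sum as f32 / count as f32),
            format!("{}", lengths.iter().min().unwrap()),
            format!("{}", lengths.iter().max().unwrap()),
        )
    } else {
        // No segment of this class on the contig: report NA, as the summary file does.
        ("NA".to_string(), "NA".to_string(), "NA".to_string())
    };
    (count, sum, percent, mean, min, max)
}

fn main() {
    // Hypothetical repeat-class segment lengths on a 100,000 bp contig.
    let (count, sum, percent, mean, min, max) = summarize(&[2_500, 4_000, 3_500], 100_000);
    println!("{count}\t{sum}\t{percent}\t{mean}\t{min}\t{max}");
    // An empty class reports NA for mean, min, and max.
    let (count, sum, percent, mean, min, max) = summarize(&[], 100_000);
    println!("{count}\t{sum}\t{percent}\t{mean}\t{min}\t{max}");
}
```

The bed records written above also gain a trailing `R`/`U` field, so the same repeat/unique classification is visible per segment without consulting the summary file.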
diff --git a/pgr-bin/src/bin/pgr-query.rs b/pgr-bin/src/bin/pgr-query.rs
index 5269e2d..fe6674f 100644
--- a/pgr-bin/src/bin/pgr-query.rs
+++ b/pgr-bin/src/bin/pgr-query.rs
@@ -7,33 +7,45 @@ use std::fs::File;
 use std::io::{self, BufWriter, Write};
 use std::path::Path;
 
+/// Query a PGR-TK pangenome sequence database,
+/// output the hit summary and generate fasta files from the target sequences
 #[derive(Parser, Debug)]
 #[clap(name = "pgr-query")]
 #[clap(author, version)]
-#[clap(about = "query a pgr pangenome database and ouput the hits", long_about = None)]
+#[clap(about, long_about = None)]
 struct CmdOptions {
+    /// the prefix to a PGR-TK sequence database
     pgr_db_prefix: String,
+    /// the path to the query fasta file
     query_fastx_path: String,
+    /// the prefix of the output file
     output_prfix: String,
 
+    /// using the frg format for the sequence database (defaults to the AGC backend database if not specified)
     #[clap(long, default_value_t = false)]
     frg_file: bool,
 
+    /// the gap penalty factor for sparse alignments in the SHIMMER space
     #[clap(long, short, default_value_t = 0.025)]
     gap_penality_factor: f32,
 
+    /// merge hits within the specified distance
     #[clap(long, short, default_value_t = 100000)]
     merge_range_tol: usize,
 
+    /// the max count of SHIMMER used for the sparse alignment
     #[clap(long, default_value_t = 128)]
     max_count: u32,
 
+    /// the max count of SHIMMER in the query sequences used for the sparse alignment
     #[clap(long, default_value_t = 128)]
     max_query_count: u32,
 
+    /// the max count of SHIMMER in the target sequences used for the sparse alignment
     #[clap(long, default_value_t = 128)]
     max_target_count: u32,
 
+    /// the span of the chain for building the sparse alignment directed acyclic graph
     #[clap(long, default_value_t = 8)]
     max_aln_chain_span: u32,
 }
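`--merge-range-tol` (default 100000) controls how far apart two hits on the same target can be and still be reported as one merged region. The sketch below is a generic illustration of gap-tolerant interval merging, not pgr-query's own implementation; the function and the `(start, end)` representation are assumptions:

```rust
/// Merge (start, end) hit ranges on one target sequence when the gap between
/// neighboring ranges is at most `merge_range_tol`.
fn merge_ranges(mut ranges: Vec<(usize, usize)>, merge_range_tol: usize) -> Vec<(usize, usize)> {
    ranges.sort_unstable();
    let mut merged: Vec<(usize, usize)> = Vec::new();
    for (start, end) in ranges {
        match merged.last_mut() {
            // Extend the previous range when this one starts within the tolerance.
            Some(last) if start <= last.1 + merge_range_tol => last.1 = last.1.max(end),
            _ => merged.push((start, end)),
        }
    }
    merged
}

fn main() {
    // Hypothetical hit ranges on a target, merged with the default tolerance.
    let hits = vec![(10_000, 60_000), (120_000, 150_000), (500_000, 520_000)];
    println!("{:?}", merge_ranges(hits, 100_000));
    // -> [(10000, 150000), (500000, 520000)]
}
```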
diff --git a/pgr-db/Cargo.toml b/pgr-db/Cargo.toml
index 44428a3..7eeccb7 100644
--- a/pgr-db/Cargo.toml
+++ b/pgr-db/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pgr-db"
-version = "0.4.0-dev"
+version = "0.4.0"
 edition = "2021"
 authors = ["Jason Chin "]
 build = "build.rs"
diff --git a/pgr-tk/Cargo.toml b/pgr-tk/Cargo.toml
index 5a2487f..a5bb21a 100644
--- a/pgr-tk/Cargo.toml
+++ b/pgr-tk/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pgrtk"
-version = "0.4.0-dev"
+version = "0.4.0"
 authors = ["Jason Chin "]
 edition = "2018"