From f8eaa8fe1b7c69998dd49893c4f67036ad5b3825 Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Fri, 21 Dec 2018 14:38:56 +0100 Subject: [PATCH] phase doc --- docs/api/phase.md | 64 +++++++++++++++++++++++++++++------------- docs/commands/phase.md | 47 ++++++++++++++++++++----------- 2 files changed, 75 insertions(+), 36 deletions(-) diff --git a/docs/api/phase.md b/docs/api/phase.md index 7c99dc0..4cdeb7b 100644 --- a/docs/api/phase.md +++ b/docs/api/phase.md @@ -4,7 +4,7 @@ ### phase -Printing ATG "phased" version of an input file +Printing amino acid sequence "phased" version of an input file ```go package main @@ -20,38 +20,62 @@ import ( ) func main() { - var fi io.Closer - var r *bufio.Reader + var fi, fi2 io.Closer + var r, r2 *bufio.Reader var err error - var seqs, phased, aaphased align.SeqBag - var removed []string + var seqs, refs align.SeqBag + var phaser align.Phaser + var phased chan align.PhasedSequence - /* Get reader (plain text or gzip) */ - fi, r, err = utils.GetReader("align.fa") - if err != nil { + /* Get ref sequences reader (plain text or gzip) */ + if fi, r, err = utils.GetReader("refs.fa"); err != nil { panic(err) } + defer fi.Close() - /* Parse Fasta */ - seqs, err = fasta.NewParser(r).ParseUnalign() - if err != nil { + /* Parse Reference orfs (nt or aa) */ + if refs, err = fasta.NewParser(r).ParseUnalign(); err != nil { + panic(err) + } + + /* Get sequences reader (plain text or gzip) */ + if fi2, r2, err = utils.GetReader("align.fa"); err != nil { panic(err) } - fi.Close() + defer fi2.Close() - if phased, aaphased, _, removed, err = seqs.Phase(nil); err != nil { + /* Parse Fasta sequences */ + seqs, err = fasta.NewParser(r2).ParseUnalign() + if err != nil { panic(err) } - /* Printing nt unaligned phased sequences */ - fmt.Println(fasta.WriteAlignment(phased)) + /* Initialize phaser */ + phaser = align.NewPhaser() + phaser.SetLenCutoff(0.8) + phaser.SetMatchCutoff(0.8) + phaser.SetReverse(true) + phaser.SetCutEnd(true) + phaser.SetCpus(1) + phaser.SetTranslate(true) - /* Printing aa unaligned phased sequences */ - fmt.Println(fasta.WriteAlignment(aaphased)) + /* Phase */ + if phased, err = phaser.Phase(refs, seqs); err != nil { + panic(err) + } - /* Printing removed sequences */ - for _, s := range removed { - fmt.Println(s) + for p := range phased { + if p.Err != nil { + panic(err) + } + if p.Removed { + fmt.Printf("%s: Removed\n", p.NtSeq.Name()) + } else { + fmt.Printf("Nt Sequence %s : %s\n", p.NtSeq.Name(), p.NtSeq.Sequence()) + fmt.Printf("Aa Sequence %s : %s\n", p.AaSeq.Name(), p.AaSeq.Sequence()) + fmt.Printf("Position: %d\n", p.Position) + fmt.Printf("Length: %d\n", p.AaSeq.Length()) + } } } ``` diff --git a/docs/commands/phase.md b/docs/commands/phase.md index 2c4c624..6ac4fb6 100644 --- a/docs/commands/phase.md +++ b/docs/commands/phase.md @@ -3,18 +3,25 @@ ## Commands ### phase -This command "phases" sequences on the ATG giving the longest ORF on the forward strand. To do so, phase will: - -1. Search for the longest ORF in the dataset if no reference orf is given; -1. Translate the given ORF in aminoacids; -2. For each sequence of the dataset: translate it in the 3 phases (forward strand only), - align it with the translated orf, and take the phase giving the best alignment; If no phase - gives a good alignment (>80% orf length, and starting at first position of the ORF), then - the sequence is discarded; +This command "phases" input sequences on the basis on either a set of input sequences, or the longest detected orf. +To do so, phase will: + +1. Search for the longest ORF in the dataset if no reference orf(s) is(are) given; +1. Translate the given ORF(s) in aminoacids; +2. For each sequence of the dataset: translate it in the 3 phases (or 6 if `--reverse` is given), + align it with all the translated orfs, and take the phase and the reference orf giving the best alignment; + If no phase gives a good alignment in any reference orf (cutoffs given by `--len-cutoff` and `--match-cutoff`), + then the sequence flagged as removed; 3. For each sequence, take the Start corresponding to the Start of the ORF, and remove - nucleotides before; + nucleotides before (and nucleotides after is `--cut-end` is given); 4. Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa) - and the positions of starts in the nucleotidic sequences. + and the start position on the original nt sequence; +5. The log file contains information on: + 1. Sequence name + 2. Its best matching reference orf + 3. Start position on original nt sequence + 4. Extracted sequence length + 5. Position of the first stop in phase if --unaligned is set, format options are ignored (phylip, nexus, etc.), and @@ -30,12 +37,18 @@ Usage: goalign phase [flags] Flags: - --aa-output string Output Met "phased" aa FASTA file (default "none") - -h, --help help for phase - -l, --log string Output log: positions of the considered ATG for each sequence (default "none") - -o, --output string Output ATG "phased" FASTA file (default "stdout") - --ref-orf string Reference ORF to phase against (if none is given, then will try to get the longest orf in the input data) (default "none") - --unaligned Considers sequences as unaligned and only fasta format is accepted (phylip, nexus,... options are ignored) + --aa-output string Output Met "phased" aa FASTA file (default "none") + --cut-end Iftrue, then also remove the end of sequences that do not align with orf + -h, --help help for phase + --len-cutoff float Length cutoff, over orf length, to consider sequence hits (-1==No cutoff) (default -1) + -l, --log string Output log: positions of the considered ATG for each sequence (default "none") + --match float Score for a match for pairwise alignment (if omitted, then take substitution matrix) (default 1) + --match-cutoff float Nb Matches cutoff, over alignment length, to consider sequence hits (-1==No cutoff) (default 0.5) + --mismatch float Score for a mismatch for pairwise alignment (if omitted, then take substitution matrix) (default -1) + -o, --output string Output ATG "phased" FASTA file (default "stdout") + --ref-orf string Reference ORF to phase against (if none is given, then will try to get the longest orf in the input data) (default "none") + --reverse Search ALSO in the reverse strand (in addition to the forward strand) + --unaligned Considers sequences as unaligned and only format fasta is accepted (phylip, nexus,... options are ignored) Global Flags: -i, --align string Alignment input file (default "stdin") @@ -47,6 +60,8 @@ Global Flags: --one-line Write Phylip sequences on 1 line (only used with -p) --output-strict Strict phylip output format (only used with -p) -p, --phylip Alignment is in phylip? default fasta + --seed int Random Seed: -1 = nano seconds since 1970/01/01 00:00:00 (default -1) + -t, --threads int Number of threads (default 1) ``` #### Examples