Doc phase & phasent

evolbioinfo · Dec 21, 2018 · c55002c · c55002c
1 parent 598cfd0
commit c55002c
Show file tree

Hide file tree

Showing 6 changed files with 166 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -137,7 +137,8 @@ You may go to the [doc](docs/index.md) for a more detailed documentation of the
   * gaps: Add gaps uniformly in an input alignment
   * snvs: Add substitutions uniformly in an input alignment
 * orf:   Find the longest orf in all given sequences in forward strand
-* phase: Try to find ATG in each sequence and remove nucleotides before
+* phase: Try to find reference orf(s) (aa) in input sequences, and align it on the same phase
+* phasent: Try to find reference sequence (nt) in input sequences, and align it on the same phase
 * random:      Generate random sequences
 * reformat:    Reformats input alignment into several formats
   * fasta

diff --git a/cmd/phase.go b/cmd/phase.go
@@ -22,30 +22,36 @@ var phasecutend bool
 // translateCmd represents the addid command
 var phaseCmd = &cobra.Command{
 	Use:   "phase",
-	Short: "Find best ATGs and set them as new start positions",
-	Long: `Find best ATGs and set them as new start positions.
+	Short: "Find best Starts and set them as new start positions",
+	Long: `Find best Starts and set them as new start positions.
+
+This command "phases" input sequences on the basis on either a set of input sequences, or the longest detected orf.
+To do so, it will:
+
+1. Search for the longest ORF in the dataset if no reference orf(s) is(are) given;
+2. Translate the given ORF(s) in aminoacids;
+3. For each sequence of the dataset: translate it in the 3 phases (or 6 if --reverse is given),
+   align it with all the translated orfs, and take the phase and the reference orf giving the best alignment;
+   If no phase gives a good alignment in any reference orf (cutoffs given by --len-cutoff and --match-cutoff),
+   then the sequence flagged as removed;
+4. For each sequence, take the Start corresponding to the Start of the ORF, and remove
+   nucleotides before (and nucleotides after if --cut-end is given);
+5. Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa)
+   and the start position on the original nt sequence;
+6. The log file contains information on:
+    1. Sequence name
+    2. Its best matching reference orf
+    3. Start position on original nt sequence
+    4. Extracted sequence length
+    5. Position of the first stop in phase
+
 
 if --unaligned is set, format options are ignored (phylip, nexus, etc.), and
-only Fasta is accepted.
+only Fasta is accepted. Otherwise, alignment is first "unaligned".
 
 If input sequences are not nucleotidic, then returns an error.
 
-Output files:
-
-1) --output : unaligned set of phased sequences in fasta
-2) --aa-output: unaligned set of phased aa sequencs in fasta
-
-Phase command will:
-1. Search for the longest ORF in the dataset if no reference orf is given;
-1. Translate the given ORF in aminoacids;
-2. For each sequence of the dataset: translate it in the 3 phases (forward strand only),
-   align it with the translated orf, and take the phase giving the best alignment; If no phase
-   gives a good alignment (>80% orf length, and starting at first position of the ORF), then
-   the sequence is discarded;
-3. For each sequence, take the Start corresponding to the Start of the ORF, and remove
-   nucleotides before;
-4. Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa)
-   and the positions of starts in the nucleotidic sequences.
+Output file is an unaligned set of sequences in fasta.
 `,
 	RunE: func(cmd *cobra.Command, args []string) (err error) {
 		var f, aaf, logf *os.File
@@ -153,9 +159,9 @@ Phase command will:
 
 func init() {
 	RootCmd.AddCommand(phaseCmd)
-	phaseCmd.PersistentFlags().StringVarP(&phaseOutput, "output", "o", "stdout", "Output ATG \"phased\" FASTA file")
+	phaseCmd.PersistentFlags().StringVarP(&phaseOutput, "output", "o", "stdout", "Output \"phased\" FASTA file")
 	phaseCmd.PersistentFlags().StringVar(&phaseAAOutput, "aa-output", "none", "Output Met \"phased\" aa FASTA file")
-	phaseCmd.PersistentFlags().StringVarP(&phaseLogOutput, "log", "l", "none", "Output log: positions of the considered ATG for each sequence")
+	phaseCmd.PersistentFlags().StringVarP(&phaseLogOutput, "log", "l", "none", "Output log: positions of the considered Start for each sequence")
 	phaseCmd.PersistentFlags().Float64Var(&lencutoff, "len-cutoff", -1.0, "Length cutoff, over orf length, to consider sequence hits (-1==No cutoff)")
 	phaseCmd.PersistentFlags().Float64Var(&matchcutoff, "match-cutoff", .5, "Nb Matches cutoff, over alignment length, to consider sequence hits (-1==No cutoff)")
 	phaseCmd.PersistentFlags().Float64Var(&match, "match", 1.0, "Score for a match for pairwise alignment (if omitted, then take substitution matrix)")

diff --git a/cmd/phasent.go b/cmd/phasent.go
@@ -14,30 +14,37 @@ var phaseCodonOutput string
 // translateCmd represents the addid command
 var phasentCmd = &cobra.Command{
 	Use:   "phasent",
-	Short: "Find best ATGs and set them as new start positions",
-	Long: `Find best ATGs and set them as new start positions.
+	Short: "Find best Starts and set them as new start positions",
+	Long: `Find best Starts and set them as new start positions.
+
+Unlike goalign phase, it does not take into account translation of input sequences.
+
+This command "phases" input sequences on the basis on either a set of input sequences, or the longest detected orf.
+To do so, it will:
+
+1. Search for the longest ORF in the dataset if no reference orf(s) is(are) given;
+2. For each sequence of the dataset: will take the sequence in forward and revcomp (if --reverse is given),
+   align it with all ref orfs, and take the phase (fwd or revcomp) and the reference orf giving the best alignment;
+   If no phase gives a good alignment in any reference orf (cutoffs given by --len-cutoff and --match-cutoff),
+   then the sequence flagged as removed;
+3. For each sequence, take the Start corresponding to the Start of the ORF, and remove
+   nucleotides before (and nucleotides after if --cut-end is given);
+4. Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa)
+   and the start position on the original nt sequence;
+5. The log file contains information on:
+    1. Sequence name
+    2. Its best matching reference orf
+    3. Start position on original nt sequence
+    4. Extracted sequence length
+    5. Positions of nt not in phase with reference orf
+    6. Position of the first stop in phase
 
 if --unaligned is set, format options are ignored (phylip, nexus, etc.), and
-only Fasta is accepted.
-
-Unlike goalign phase, it does not take into account protein information (3 or 6 phases).
+only Fasta is accepted. Otherwise, alignment is first "unaligned".
 
 If input sequences are not nucleotidic, then returns an error.
 
-Output files:
-
---output : unaligned set of phased sequences in fasta
-
-
- 1. If alignment is bad (>lencutoff * orf length, >matchcutoff matches over the 
-    align length and starting at first position of the ORF), then the sequence 
-    is discarded;
- 3. For each sequence, take the Start corresponding to the Start of the ORF, and
-    remove nucleotides before;
- 4. Return the trimmed nucleotidic sequences (phased), the positions of starts in
-    the nucleotidic sequences, and the removed sequence names.
- If cutend is true, then also remove the end of sequences that do not align with orf
- It does not modify the input object
+Output file is an unaligned set of sequences in fasta.
 `,
 	RunE: func(cmd *cobra.Command, args []string) (err error) {
 		var f, aaf, codonf, logf *os.File

diff --git a/docs/commands/phase.md b/docs/commands/phase.md
@@ -13,7 +13,7 @@ To do so, phase will:
    If no phase gives a good alignment in any reference orf (cutoffs given by `--len-cutoff` and `--match-cutoff`),
    then the sequence flagged as removed;
 3. For each sequence, take the Start corresponding to the Start of the ORF, and remove
-   nucleotides before (and nucleotides after is `--cut-end` is given);
+   nucleotides before (and nucleotides after if `--cut-end` is given);
 4. Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa)
    and the start position on the original nt sequence;
 5. The log file contains information on:

diff --git a/docs/commands/phasent.md b/docs/commands/phasent.md
@@ -0,0 +1,108 @@
+# Goalign: toolkit and api for alignment manipulation
+
+## Commands
+
+### phase
+Although similar to goalign phase, goalign phasent does not take into align translation of input sequences.
+
+This command "phases" input sequences on the basis on either a set of input sequences, or the longest detected orf.
+To do so, it will:
+
+1. Search for the longest ORF in the dataset if no reference orf(s) is(are) given;
+2. For each sequence of the dataset: will take the sequence in forward and revcomp (if `--reverse` is given),
+   align it with all ref orfs, and take the phase (fwd or revcomp) and the reference orf giving the best alignment;
+   If no phase gives a good alignment in any reference orf (cutoffs given by `--len-cutoff` and `--match-cutoff`),
+   then the sequence flagged as removed;
+3. For each sequence, take the Start corresponding to the Start of the ORF, and remove
+   nucleotides before (and nucleotides after if `--cut-end` is given);
+4. Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa)
+   and the start position on the original nt sequence;
+5. The log file contains information on:
+    1. Sequence name
+    2. Its best matching reference orf
+    3. Start position on original nt sequence
+    4. Extracted sequence length
+    5. Positions of nt not in phase with reference orf
+    6. Position of the first stop in phase
+
+if `--unaligned` is set, format options are ignored (phylip, nexus, etc.), and
+only Fasta is accepted. Otherwise, alignment is first "unaligned".
+
+If input sequences are not nucleotidic, then returns an error.
+
+Output file is an unaligned set of sequences in fasta.
+
+#### Usage
+```
+Usage:
+  goalign phasent [flags]
+
+Flags:
+      --aa-output string     Output translated sequences FASTA file (default "none")
+      --cut-end              Iftrue, then also remove the end of sequences that do not align with orf
+  -h, --help                 help for phasent
+      --len-cutoff float     Length cutoff, over orf length, to consider sequence hits (-1==No cutoff) (default -1)
+  -l, --log string           Output log: positions of the considered ATG for each sequence (default "none")
+      --match float          Score for a match for pairwise alignment (if omitted, then take substitution matrix) (default 1)
+      --match-cutoff float   Nb Matches cutoff, over alignment length, to consider sequence hits (-1==No cutoff) (default 0.5)
+      --mismatch float       Score for a mismatch for pairwise alignment (if omitted, then take substitution matrix) (default -1)
+      --nt-output string     Output ATG "phased" FASTA file + first nts not in ref phase removed (nt corresponding to aa-output sequence) (default "none")
+  -o, --output string        Output ATG "phased" FASTA file (default "stdout")
+      --ref-orf string       Reference ORF to phase against (if none is given, then will try to get the longest orf in the input data) (default "none")
+      --reverse              Search ALSO in the reverse strand (in addition to the forward strand)
+      --unaligned            Considers sequences as unaligned and only format fasta is accepted (phylip, nexus,... options are ignored)
+
+Global Flags:
+  -i, --align string    Alignment input file (default "stdin")
+      --auto-detect     Auto detects input format (overrides -p, -x and -u)
+  -u, --clustal         Alignment is in clustal? default fasta
+      --input-strict    Strict phylip input format (only used with -p)
+  -x, --nexus           Alignment is in nexus? default fasta
+      --no-block        Write Phylip sequences without space separated blocks (only used with -p)
+      --one-line        Write Phylip sequences on 1 line (only used with -p)
+      --output-strict   Strict phylip output format (only used with -p)
+  -p, --phylip          Alignment is in phylip? default fasta
+      --seed int        Random Seed: -1 = nano seconds since 1970/01/01 00:00:00 (default -1)
+  -t, --threads int     Number of threads (default 1)
+```
+
+#### Examples
+
+input.fa
+
+```
+>allcodons
+GCTGCCGCAGCGTTATTGCTTCTCCTACTGCGTCGCCGACGGAGAAGGAAAAAGAATAACATG
+GATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACT
+ACCACAACGGGTGGCGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTAGTGTAATGA
+TAGC
+>allcodons2
+GCTGCAGCGTTATTGCTTCTCCTACTGCGTCGCCGACGGAGAAGGAAAAAGAATAACATG
+GATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACT
+ACCACAACGGGTGGCGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTATAATGA
+```
+
+```
+goalign phasent -i input.fa --unaligned -o stdout --aa-output aa.fa
+
+```
+
+should give
+
+stdout
+```
+>allcodons
+ATGGATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACTACCACAACGGGTGG
+CGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTAGTGTAATGATAGC
+>allcodons2
+ATGGATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACTACCACAACGGGTGG
+CGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTATAATGA
+```
+
+aa.fa
+```
+>allcodons
+MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIIVVVV***
+>allcodons2
+MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIIVVV**
+```
diff --git a/docs/index.md b/docs/index.md
@@ -77,7 +77,8 @@ Command                                                     | Subcommand |
 --                                                          | gaps       | Adds gaps uniformly in an input alignment
 --                                                          | snvs       | Adds substitutions uniformly in an input alignment
 [orf](commands/orf.md) ([api](api/orf.md))                  |            | Find the longest orf in all given sequences in forward strand
-[phase](commands/phase.md) ([api](api/phase.md))            |            | Find best ATGs and set them as new start positions
+[phase](commands/phase.md) ([api](api/phase.md))            |            | Find best Starts by aligning to translated ref sequences and set them as new start positions
+[phasent](commands/phasent.md) ([api](api/phase.md))        |            | Find best Starts by aligning to ref sequences and set them as new start positions
 [random](commands/random.md) ([api](api/random.md))         |            | Generate random sequences
 [reformat](commands/reformat.md) ([api](api/reformat.md))   |            | Reformats input alignment into phylip of fasta format
 --                                                          | clustal    | Reformats an input alignment into Clustal