From e4f80c6600616f0fe144a710ccb5ba65ba572335 Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Mon, 1 Apr 2019 15:29:53 +0200 Subject: [PATCH] Added invertebrate mitochondrial code --- align/align_test.go | 43 +++++++++++++++++++++++++++++++++++++++++++ align/const.go | 32 ++++++++++++++++++++++++++++++-- cmd/phase.go | 4 +++- cmd/phasent.go | 4 +++- cmd/translate.go | 5 ++++- test.sh | 27 ++++++++++++++++++++++++++- 6 files changed, 109 insertions(+), 6 deletions(-) diff --git a/align/align_test.go b/align/align_test.go index f5b1c62..ab52c98 100644 --- a/align/align_test.go +++ b/align/align_test.go @@ -1045,6 +1045,49 @@ func TestTranslateMito(t *testing.T) { } } +func TestTranslateMitoI(t *testing.T) { + + in := NewSeqBag(UNKNOWN) + in.AddSequence("Seq0000", "ATGGATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACTACCACAACGGGTGGCGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTAGTGTAATGATAGC", "") + in.AddSequence("Seq0001", "ATGGATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACTACCACAACGGGTGGCGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTATAATGA", "") + in.AutoAlphabet() + + in2, err2 := in.CloneSeqBag() + if err2 != nil { + t.Error(err2) + } + + expaa := NewSeqBag(UNKNOWN) + expaa.AddSequence("Seq0000", "MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIMVVVV*W*", "") + expaa.AddSequence("Seq0001", "MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIMVVV*W", "") + expaa.AutoAlphabet() + + exp3phases := NewSeqBag(UNKNOWN) + exp3phases.AddSequence("Seq0000_0", "MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIMVVVV*W*", "") + exp3phases.AddSequence("Seq0000_1", "WMTFSVALPHRNSLPHRVAKSLPQRVAEGGITITLS*LS*CNDS", "") + exp3phases.AddSequence("Seq0000_2", "GWLFLLPSPTATVFLIE*RSDYHNGWRSVASLLHYHSCRSVMM", "") + exp3phases.AddSequence("Seq0001_0", "MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIMVVV*W", "") + exp3phases.AddSequence("Seq0001_1", "WMTFSVALPHRNSLPHRVAKSLPQRVAEGGITITLS*LSYN", "") + exp3phases.AddSequence("Seq0001_2", "GWLFLLPSPTATVFLIE*RSDYHNGWRSVASLLHYHSCRMM", "") + exp3phases.AutoAlphabet() + + if err := in.Translate(0, 2); err != nil { + t.Error(err) + } else { + if !expaa.Identical(in) { + t.Error(fmt.Errorf("Expected sequences are different from phased sequences")) + } + } + + if err := in2.Translate(-1, 2); err != nil { + t.Error(err) + } else { + if !exp3phases.Identical(in2) { + t.Error(fmt.Errorf("Expected sequences are different from 3 phase translated sequences")) + } + } +} + func TestMaskNt(t *testing.T) { in := NewAlign(UNKNOWN) in.AddSequence("Seq0000", "GATTAATTTGCCGTAGGCCA", "") diff --git a/align/const.go b/align/const.go index aca2e10..f530a2a 100644 --- a/align/const.go +++ b/align/const.go @@ -31,8 +31,9 @@ const ( POSITION_SEMI_CONSERVED = 2 // Same weak group POSITION_NOT_CONSERVED = 3 // None of the above values - GENETIC_CODE_STANDARD = 0 // Standard genetic code - GENETIC_CODE_VETEBRATE_MITO = 1 // Vertebrate mitochondrial genetic code + GENETIC_CODE_STANDARD = 0 // Standard genetic code + GENETIC_CODE_VETEBRATE_MITO = 1 // Vertebrate mitochondrial genetic code + GENETIC_CODE_INVETEBRATE_MITO = 2 // Invertebrate mitochondrial genetic code ) var stdaminoacid = []rune{'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'} @@ -88,12 +89,39 @@ var vertebratemitocode = map[string]rune{ "AGA": '*', "AGG": '*', "TAA": '*', "TAG": '*', } +var invertebratemitocode = map[string]rune{ + "---": '-', + "GCT": 'A', "GCC": 'A', "GCA": 'A', "GCG": 'A', + "TTA": 'L', "TTG": 'L', "CTT": 'L', "CTC": 'L', "CTA": 'L', "CTG": 'L', + "CGT": 'R', "CGC": 'R', "CGA": 'R', "CGG": 'R', + "AAA": 'K', "AAG": 'K', + "AAT": 'N', "AAC": 'N', + "ATG": 'M', "ATA": 'M', + "GAT": 'D', "GAC": 'D', + "TTT": 'F', "TTC": 'F', + "TGT": 'C', "TGC": 'C', + "CCT": 'P', "CCC": 'P', "CCA": 'P', "CCG": 'P', + "CAA": 'Q', "CAG": 'Q', + "AGA": 'S', "AGG": 'S', "TCT": 'S', "TCC": 'S', "TCA": 'S', "TCG": 'S', "AGT": 'S', "AGC": 'S', + "GAA": 'E', "GAG": 'E', + "ACT": 'T', "ACC": 'T', "ACA": 'T', "ACG": 'T', + "GGT": 'G', "GGC": 'G', "GGA": 'G', "GGG": 'G', + "TGG": 'W', "TGA": 'W', + "CAT": 'H', "CAC": 'H', + "TAT": 'Y', "TAC": 'Y', + "ATT": 'I', "ATC": 'I', + "GTT": 'V', "GTC": 'V', "GTA": 'V', "GTG": 'V', + "TAA": '*', "TAG": '*', +} + func geneticCode(code int) (gencode map[string]rune, err error) { switch code { case GENETIC_CODE_STANDARD: gencode = standardcode case GENETIC_CODE_VETEBRATE_MITO: gencode = vertebratemitocode + case GENETIC_CODE_INVETEBRATE_MITO: + gencode = invertebratemitocode default: err = fmt.Errorf("This genetic code does not exis") } diff --git a/cmd/phase.go b/cmd/phase.go index 1498250..22710ce 100644 --- a/cmd/phase.go +++ b/cmd/phase.go @@ -116,6 +116,8 @@ Output file is an unaligned set of sequences in fasta. geneticcode = align.GENETIC_CODE_STANDARD case "mitov": geneticcode = align.GENETIC_CODE_VETEBRATE_MITO + case "mitoi": + geneticcode = align.GENETIC_CODE_INVETEBRATE_MITO default: err = fmt.Errorf("Unknown genetic code : %s", phaseGeneticCode) return @@ -172,7 +174,7 @@ Output file is an unaligned set of sequences in fasta. func init() { RootCmd.AddCommand(phaseCmd) phaseCmd.PersistentFlags().StringVarP(&phaseOutput, "output", "o", "stdout", "Output \"phased\" FASTA file") - phaseCmd.PersistentFlags().StringVar(&phaseGeneticCode, "genetic-code", "standard", "Genetic Code: standard, or mitov (vertebrate mitochondrial)") + phaseCmd.PersistentFlags().StringVar(&phaseGeneticCode, "genetic-code", "standard", "Genetic Code: standard, mitoi (invertebrate mitochondrial) or mitov (vertebrate mitochondrial)") phaseCmd.PersistentFlags().StringVar(&phaseAAOutput, "aa-output", "none", "Output Met \"phased\" aa FASTA file") phaseCmd.PersistentFlags().StringVarP(&phaseLogOutput, "log", "l", "none", "Output log: positions of the considered Start for each sequence") phaseCmd.PersistentFlags().Float64Var(&lencutoff, "len-cutoff", -1.0, "Length cutoff, over orf length, to consider sequence hits (-1==No cutoff)") diff --git a/cmd/phasent.go b/cmd/phasent.go index ff2b98a..aece640 100644 --- a/cmd/phasent.go +++ b/cmd/phasent.go @@ -120,6 +120,8 @@ Output file is an unaligned set of sequences in fasta. geneticcode = align.GENETIC_CODE_STANDARD case "mitov": geneticcode = align.GENETIC_CODE_VETEBRATE_MITO + case "mitoi": + geneticcode = align.GENETIC_CODE_INVETEBRATE_MITO default: err = fmt.Errorf("Unknown genetic code : %s", phaseGeneticCode) return @@ -196,7 +198,7 @@ func init() { RootCmd.AddCommand(phasentCmd) phasentCmd.PersistentFlags().StringVarP(&phaseOutput, "output", "o", "stdout", "Output ATG \"phased\" FASTA file") phasentCmd.PersistentFlags().StringVar(&phaseCodonOutput, "nt-output", "none", "Output ATG \"phased\" FASTA file + first nts not in ref phase removed (nt corresponding to aa-output sequence)") - phasentCmd.PersistentFlags().StringVar(&phaseGeneticCode, "genetic-code", "standard", "Genetic Code: standard, or mitov (vertebrate mitochondrial)") + phasentCmd.PersistentFlags().StringVar(&phaseGeneticCode, "genetic-code", "standard", "Genetic Code: standard, mitoi (invertebrate mitochondrial) or mitov (vertebrate mitochondrial)") phasentCmd.PersistentFlags().StringVar(&phaseAAOutput, "aa-output", "none", "Output translated sequences FASTA file") phasentCmd.PersistentFlags().StringVarP(&phaseLogOutput, "log", "l", "none", "Output log: positions of the considered ATG for each sequence") phasentCmd.PersistentFlags().Float64Var(&lencutoff, "len-cutoff", -1.0, "Length cutoff, over orf length, to consider sequence hits (-1==No cutoff)") diff --git a/cmd/translate.go b/cmd/translate.go index d12d45f..a03e2e6 100644 --- a/cmd/translate.go +++ b/cmd/translate.go @@ -47,6 +47,9 @@ It only translates using the standard genetic code so far. geneticcode = align.GENETIC_CODE_STANDARD case "mitov": geneticcode = align.GENETIC_CODE_VETEBRATE_MITO + case "mitoi": + geneticcode = align.GENETIC_CODE_INVETEBRATE_MITO + default: err = fmt.Errorf("Unknown genetic code : %s", translateGeneticCode) return @@ -92,7 +95,7 @@ It only translates using the standard genetic code so far. func init() { RootCmd.AddCommand(translateCmd) - translateCmd.PersistentFlags().StringVar(&translateGeneticCode, "genetic-code", "standard", "Genetic Code: standard, or mitov (vertebrate mitochondrial)") + translateCmd.PersistentFlags().StringVar(&translateGeneticCode, "genetic-code", "standard", "Genetic Code: standard, mitoi (invertebrate mitochondrial) or mitov (vertebrate mitochondrial)") translateCmd.PersistentFlags().StringVarP(&translateOutput, "output", "o", "stdout", "Output translated alignment file") translateCmd.PersistentFlags().IntVar(&translatePhase, "phase", 0, "Number of characters to drop from the start of the alignment (if -1: Translate in the 3 phases, from positions 0, 1, and 2)") translateCmd.PersistentFlags().BoolVar(&unaligned, "unaligned", false, "Considers sequences as unaligned and format fasta (phylip, nexus,... options are ignored)") diff --git a/test.sh b/test.sh index fd05b58..978dfd6 100755 --- a/test.sh +++ b/test.sh @@ -1926,7 +1926,7 @@ ${GOALIGN} translate -i input --phase 0 --unaligned -o result diff -q -b expected result rm -f input expected result -echo "->goalign translate 3 phases" +echo "->goalign translate 3 phases (standard)" cat > input <test1 CATGAGTCTCTCTGATAAGGACAAGGCTGCTGTGAAAGCCCTATGG @@ -1968,6 +1968,31 @@ ${GOALIGN} translate -i input --phase -1 --unaligned --genetic-code mitov -o res diff -q -b expected result rm -f input expected result +echo "->goalign translate 3 phases (mito invertebrate)" +cat > input <s1 +ATGGATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACTACCACAACGGGTGGCGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTAGTGTAATGATAGC +>s2 +ATGGATGACTTTTTCTGTTGCCCTCCCCCACCGCAACAGTCTTCCTCATCGAGTAGCGAAGAGACTACCACAACGGGTGGCGGAGGGTGGCATCACTATTACATTATCATAGTTGTCGTATAATGA +EOF +cat > expected <s1_0 +MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIMVVVV*W* +>s1_1 +WMTFSVALPHRNSLPHRVAK*LPQRVAEGGITITLS*LS*CNDS +>s1_2 +GWLFLLPSPTATVFLIE*R*DYHNGWR*VASLLHYHSCRSVMM +>s2_0 +MDDFFCCPPPPQQSSSSSSEETTTTGGGGWHHYYIIMVVV*W +>s2_1 +WMTFSVALPHRNSLPHRVAK*LPQRVAEGGITITLS*LSYN +>s2_2 +GWLFLLPSPTATVFLIE*R*DYHNGWR*VASLLHYHSCRMM +EOF +${GOALIGN} translate -i input --phase -1 --unaligned --genetic-code mitov -o result +diff -q -b expected result +rm -f input expected result + echo "->goalign dedup" cat > input <