From ab9176e44e25bee36bcc503f1d47e1fe09247cd3 Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Tue, 28 Apr 2020 15:16:57 +0200 Subject: [PATCH] Added comparison to profile for goalign stats --- align/align.go | 154 ++++++++++++++-------------- align/align_test.go | 82 +++++++++++---- align/profile.go | 172 ++++++++++++++++++++++++++++++++ align/seqbag.go | 46 ++++++++- align/seqbag_test.go | 40 ++++++++ align/sequence.go | 54 ++++++++++ align/sequence_test.go | 3 +- cmd/rarefy.go | 5 +- cmd/root.go | 21 +--- cmd/stats.go | 140 +++++++++++++++++--------- cmd/stats_gaps.go | 33 +++++- cmd/stats_mutations.go | 42 ++++++-- cmd/subset.go | 5 +- docs/commands/stats.md | 14 ++- io/countprofile/countprofile.go | 82 +++++++++++++++ io/utils/readfiles.go | 17 ++++ test.sh | 42 ++++++++ 17 files changed, 766 insertions(+), 186 deletions(-) create mode 100644 align/profile.go create mode 100644 align/seqbag_test.go create mode 100644 io/countprofile/countprofile.go diff --git a/align/align.go b/align/align.go index 742d30e..c5f4b01 100644 --- a/align/align.go +++ b/align/align.go @@ -55,17 +55,13 @@ type Alignment interface { Length() int // Length of the alignment Mask(start, length int) error // Masks given positions MaxCharStats(excludeGaps bool) ([]rune, []int) - Mutate(rate float64) // Adds uniform substitutions in the alignment (~sequencing errors) - NbVariableSites() int // Nb of variable sites - NumGapsUniquePerSequence() []int // Number of Gaps in each sequence that are unique in their alignment site + Mutate(rate float64) // Adds uniform substitutions in the alignment (~sequencing errors) + NbVariableSites() int // Nb of variable sites + // Number of Gaps in each sequence that are unique in their alignment site + NumGapsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error) // returns the number of characters in each sequence that are unique in their alignment site (gaps or others) // It does not take into account 'N' and '-' as unique mutations - NumMutationsUniquePerSequence() (numuniques []int) - // returns the number of differences between the reference sequence and each sequence of the alignment - // If lengths are different, returns an error - // It does not take into account 'N' and '-' in sequences as mutations compared to ref - /// sequence (ref sequence can have a '-' or a 'N') - NumMutationsComparedToReferenceSequence(seq Sequence) (nummutations []int, err error) + NumMutationsUniquePerSequence(profile *CountProfile) (numuniques []int, numnew []int, nummuts []int, err error) Pssm(log bool, pseudocount float64, normalization int) (pssm map[rune][]float64, err error) // Normalization: PSSM_NORM_NONE, PSSM_NORM_UNIF, PSSM_NORM_DATA Rarefy(nb int, counts map[string]int) (Alignment, error) // Take a new rarefied sample taking into accounts weights RandSubAlign(length int) (Alignment, error) // Extract a random subalignment with given length from this alignment @@ -1075,7 +1071,7 @@ func (a *align) Pssm(log bool, pseudocount float64, normalization int) (pssm map /* We add pseudo counts */ if pseudocount > 0 { for _, v := range pssm { - for i, _ := range v { + for i := range v { v[i] += pseudocount } } @@ -1364,27 +1360,55 @@ func (a *align) NbVariableSites() int { } // NumGapsUniquePerSequence returns the number of Gaps in the sequence that are unique in their alignment site -func (a *align) NumGapsUniquePerSequence() (numgaps []int) { - numgaps = make([]int, a.NbSequences()) +// This function counts, for each sequence of the given alignment, the number of : +// - gaps that are unique to the sequence compared to the others of the alignment +// - gaps that are new compared to the profile (not seen in the profile) : numnew +// - gaps that are new compared to the profile and found only once in the given alignment: numboth +// If the profile is nil, then does not compute numnewmuts neither nummutsboth (0 filled slices) +func (a *align) NumGapsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error) { + numuniques = make([]int, a.NbSequences()) + numnew = make([]int, a.NbSequences()) + numboth = make([]int, a.NbSequences()) + uniqueIndex := -1 nbGapsColumn := 0 + // Check that profile has the right length + if countProfile != nil { + if !countProfile.CheckLength(a.Length()) { + err = fmt.Errorf("Profile does not have same length than alignment") + return + } + } + + var c int for i := 0; i < a.Length(); i++ { uniqueIndex = -1 nbGapsColumn = 0 for j, s := range a.seqs { - if s.sequence[i] == GAP { + r := s.sequence[i] + if r == GAP { nbGapsColumn++ uniqueIndex = j - } - if nbGapsColumn > 1 { - break + if countProfile != nil { + c, _ = countProfile.Count(r, i) + if c == 0 { + numnew[j]++ + } + } else if nbGapsColumn > 1 { + break + } } } if nbGapsColumn == 1 { - numgaps[uniqueIndex]++ + numuniques[uniqueIndex]++ + if countProfile != nil { + if c, _ = countProfile.Count(GAP, i); c == 0 { + numboth[uniqueIndex]++ + } + } } } return @@ -1392,78 +1416,56 @@ func (a *align) NumGapsUniquePerSequence() (numgaps []int) { // NumMutationsUniquePerSequence returns the number of characters in each sequence that are unique in their alignment site. // It does not take into account 'N' and '-' as unique mutations -func (a *align) NumMutationsUniquePerSequence() (numuniques []int) { +// This function counts, for each sequence of the given alignment, the number of : +// - mutations that are unique to the sequence compared to the others of the alignment +// - mutations that are new compared to the profile (not seen in the profile) : numnew +// - mutations that are new compared to the profile and found only once in the given alignment: numboth +// If the profile is nil, then does not compute numnewmuts neither nummutsboth (0 filled slices) +func (a *align) NumMutationsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error) { numuniques = make([]int, a.NbSequences()) - for i := 0; i < a.Length(); i++ { - occurences := make(map[rune]int) - indices := make(map[rune]int) - - for j, s := range a.seqs { - occurences[s.sequence[i]]++ - indices[s.sequence[i]] = j - } - - all := '.' - if a.Alphabet() == AMINOACIDS { - all = ALL_AMINO - } else if a.Alphabet() == NUCLEOTIDS { - all = ALL_NUCLE - } - - for c, num := range occurences { - if num == 1 && c != all && c != GAP { - ind := indices[c] - numuniques[ind]++ - } - } - } - return -} - -// returns the number of differences between the reference sequence and each sequence of the alignment -// Counts only non GAPS and non N sites in each sequences (may be a gap or a N in the reference sequence though) -// If a character is ambigous (IUPAC notation), then it is counted as a mutation only if it is incompatible with -// the reference character. -// -// If lengths are different, returns an error -func (a *align) NumMutationsComparedToReferenceSequence(refseq Sequence) (nummutations []int, err error) { - var refseqCode []uint8 - var nt uint8 - - nummutations = make([]int, a.NbSequences()) - if refseq.Length() != a.Length() { - err = fmt.Errorf("Reference sequence and alignment do not have same length (%d,%d), cannot compute a number of mutation", refseq.Length(), a.Length()) - return - } + numnew = make([]int, a.NbSequences()) + numboth = make([]int, a.NbSequences()) all := '.' if a.Alphabet() == AMINOACIDS { all = ALL_AMINO } else if a.Alphabet() == NUCLEOTIDS { - refseqCode = make([]uint8, a.Length()) - for i := 0; i < a.Length(); i++ { - if refseqCode[i], err = Nt2IndexIUPAC(refseq.SequenceChar()[i]); err != nil { - return - } - } all = ALL_NUCLE } + // Check that profile has the right length + if countProfile != nil { + if !countProfile.CheckLength(a.Length()) { + err = fmt.Errorf("Profile does not have same length than alignment") + return + } + } + + var c int for i := 0; i < a.Length(); i++ { + occurences := make([]int, 130) + indices := make([]int, 130) + for j, s := range a.seqs { - eq := true - if a.Alphabet() == NUCLEOTIDS { - if nt, err = Nt2IndexIUPAC(s.sequence[i]); err != nil { - return + r := s.sequence[i] + occurences[int(r)]++ + indices[int(r)] = j + if countProfile != nil && r != all && r != GAP { + if c, _ = countProfile.Count(r, i); c == 0 { + numnew[j]++ } - if eq, err = EqualOrCompatible(nt, refseqCode[i]); err != nil { - return - } - } else { - eq = (s.sequence[i] == refseq.SequenceChar()[i]) } - if s.SequenceChar()[i] != GAP && s.SequenceChar()[i] != all && !eq { - nummutations[j]++ + } + + for c, num := range occurences { + if num == 1 && rune(c) != all && rune(c) != GAP { + ind := indices[c] + numuniques[ind]++ + if countProfile != nil { + if c, _ = countProfile.Count(rune(c), i); c == 0 { + numboth[ind]++ + } + } } } } diff --git a/align/align_test.go b/align/align_test.go index 7b90091..8620316 100644 --- a/align/align_test.go +++ b/align/align_test.go @@ -1786,46 +1786,83 @@ func TestConsensusGaps(t *testing.T) { } func Test_align_NumGapsUniquePerSequence(t *testing.T) { - var a Alignment - var ng []int - var exp []int + var a, ref Alignment + var ng, nn, nb []int + var expng, expnn, expnb []int + var err error + var profile *CountProfile a = NewAlign(NUCLEOTIDS) a.AddSequence("A", "ACGACGA-GACC", "") a.AddSequence("B", "AT-TT-T-TTTC", "") a.AddSequence("C", "ATCTT-TTT--T", "") - exp = []int{0, 1, 2} + ref = NewAlign(NUCLEOTIDS) + ref.AddSequence("A", "AAAAAAAAAAAA", "") + ref.AddSequence("B", "AAAAAAAAAAAA", "") + ref.AddSequence("C", "AAAAAAAAAAAA", "") - ng = a.NumGapsUniquePerSequence() + expng = []int{0, 1, 2} + expnn = []int{1, 3, 3} + expnb = []int{0, 1, 2} + + profile = NewCountProfileFromAlignment(ref) + if ng, nn, nb, err = a.NumGapsUniquePerSequence(profile); err != nil { + t.Error(err) + } - if !reflect.DeepEqual(exp, ng) { - t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, exp)) + if !reflect.DeepEqual(expng, ng) { + t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, expng)) + } + if !reflect.DeepEqual(expnn, nn) { + t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nn, expnn)) + } + if !reflect.DeepEqual(expnb, nb) { + t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nb, expnb)) } } func Test_align_NumMutationsUniquePerSequence(t *testing.T) { - var a Alignment - var ng []int - var exp []int + var a, ref Alignment + var ng, nn, nb []int + var expng, expnn, expnb []int + var err error + var profile *CountProfile a = NewAlign(NUCLEOTIDS) a.AddSequence("A", "ACGACGA-GACC", "") a.AddSequence("B", "AT-TT-T-TTTC", "") a.AddSequence("C", "ATCTT-TTT--T", "") - exp = []int{9, 2, 3} + ref = NewAlign(NUCLEOTIDS) + ref.AddSequence("A", "AAAAAAAAAAAA", "") + ref.AddSequence("B", "AAAAAAAAAAAA", "") + ref.AddSequence("C", "AAAAAAAAAAAA", "") - ng = a.NumMutationsUniquePerSequence() + expng = []int{9, 2, 3} + expnn = []int{7, 8, 8} + expnb = []int{6, 2, 3} - if !reflect.DeepEqual(exp, ng) { - t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, exp)) + profile = NewCountProfileFromAlignment(ref) + if ng, nn, nb, err = a.NumMutationsUniquePerSequence(profile); err != nil { + t.Error(err) + } + + if !reflect.DeepEqual(expng, ng) { + t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, expng)) + } + if !reflect.DeepEqual(expnn, nn) { + t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nn, expnn)) + } + + if !reflect.DeepEqual(expnb, nb) { + t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nb, expnb)) } } func Test_align_NumMutationsComparedToReferenceSequence(t *testing.T) { var a Alignment - var ng []int + var ng int var exp []int var err error @@ -1835,15 +1872,16 @@ func Test_align_NumMutationsComparedToReferenceSequence(t *testing.T) { a.AddSequence("C", "ATCTT-TTT--T", "") a.AddSequence("D", "CCCCCCCCCCCC", "") - s := NewSequence("ref", []rune("CCCCCCCCCCCC"), "") + ref := NewSequence("ref", []rune("CCCCCCCCCCCC"), "") exp = []int{7, 8, 8, 0} - if ng, err = a.NumMutationsComparedToReferenceSequence(s); err != nil { - t.Error(err) - } - - if !reflect.DeepEqual(exp, ng) { - t.Error(fmt.Errorf("Nummutations is not what is expected, have %v, want %v", ng, exp)) + for i, s := range a.Sequences() { + if ng, err = s.NumMutationsComparedToReferenceSequence(a.Alphabet(), ref); err != nil { + t.Error(err) + } + if !reflect.DeepEqual(exp[i], ng) { + t.Error(fmt.Errorf("Nummutations is not what is expected, have %v, want %v", ng, exp)) + } } } diff --git a/align/profile.go b/align/profile.go new file mode 100644 index 0000000..bffda89 --- /dev/null +++ b/align/profile.go @@ -0,0 +1,172 @@ +package align + +import ( + "fmt" +) + +// CountProfile represents a simple view of an alignment +// and stores the number of occurences of each characters at +// each position of an alignment +type CountProfile struct { + names []int // mapping between character and index. names uses utf8 int value to store indexes names[int(r)]=index + header []rune // characters in the count table, similar to prev map + counts [][]int // first dimension: len(head), second dimension; nb sites in the underlying alignment +} + +// NewCountProfileFromAlignment initializes a new CountProfile using an input alignment +func NewCountProfileFromAlignment(al Alignment) (p *CountProfile) { + p = NewCountProfile() + + p.header = make([]rune, 0, 100) + p.counts = make([][]int, 0, 100) + + al.IterateChar(func(name string, seq []rune) bool { + for i, r := range seq { + idx := p.names[int(r)] + if idx < 0 { + idx = len(p.header) + p.names[int(r)] = idx + p.header = append(p.header, r) + p.counts = append(p.counts, make([]int, al.Length())) + } + p.counts[idx][i]++ + } + return false + }) + return +} + +// NewCountProfile initializes a new Profile with nil attributes +func NewCountProfile() (p *CountProfile) { + p = &CountProfile{ + names: make([]int, 130), + header: nil, + counts: nil, + } + + // Initialize to "not exit" all characters + for i := 0; i < 130; i++ { + p.names[i] = -1 + } + + return +} + +// CountsAt returns the counts for all sites, for the ith character +// (arbitrary order of character) +func (p *CountProfile) CountsAt(i int) (counts []int, err error) { + if i > len(p.counts) || i < 0 { + err = fmt.Errorf("No counts for character at index %d", i) + return + } + counts = p.counts[i] + return +} + +// NameAt returns the name of ith character in the header +func (p *CountProfile) NameAt(i int) (name rune, err error) { + if i >= len(p.header) || i < 0 { + err = fmt.Errorf("No character at index %d", i) + return + } + name = p.header[i] + return +} + +// NameIndex returns the index of the given character in the header +// If the character does not exist, returns false +func (p *CountProfile) NameIndex(r rune) (index int, ok bool) { + index = p.names[int(r)] + ok = (index >= 0) + return +} + +// Count returns the number of occurences of the character r at the position site +func (p *CountProfile) Count(r rune, site int) (count int, err error) { + var index int + count = 0 + if index = p.names[int(r)]; index < 0 { + err = fmt.Errorf("Character does not exist in the Profile %c", r) + return + } + if site < 0 || site >= len(p.counts[index]) { + err = fmt.Errorf("Site %d is outside the profile", site) + return + } + count = p.counts[index][site] + return +} + +// CountAt returns the number of occurences of the ith character at the position site +func (p *CountProfile) CountAt(i, site int) (count int, err error) { + if i >= len(p.header) || i < 0 { + err = fmt.Errorf("No character at index %d", i) + return + } + if site < 0 || site >= len(p.counts[i]) { + err = fmt.Errorf("Site %d is outside the profile", site) + return + } + count = p.counts[i][site] + return +} + +// NbCharacters returns the number of different characters in the profile +func (p *CountProfile) NbCharacters() (nb int) { + return len(p.header) +} + +// CheckLength returns true if the number of sites of the profile corresponds to the given length +// false otherwise. +func (p *CountProfile) CheckLength(length int) bool { + for _, v := range p.counts { + if len(v) != length { + return false + } + } + return true +} + +// SetHeader sets the Header and initializes the count structure +func (p *CountProfile) SetHeader(header []rune) { + p.header = header + p.counts = make([][]int, len(p.header)) + for i, r := range header { + p.names[int(r)] = i + p.counts[i] = make([]int, 0, 100) + } + return +} + +// AppendCount appends a new site to the profile for the ith character, and +// associates count to it +func (p *CountProfile) AppendCount(i, count int) (err error) { + if i >= len(p.header) || i < 0 { + err = fmt.Errorf("No character at index %d", i) + return + } + p.counts[i] = append(p.counts[i], count) + return +} + +func (p *CountProfile) Print() { + fmt.Print("site") + for _, n := range p.header { + fmt.Printf("\t%c", n) + } + fmt.Println() + i := 0 + length := 0 + for { + fmt.Printf("%d", i) + for _, c := range p.counts { + length = len(c) + fmt.Printf("\t%d", c[i]) + } + fmt.Println() + i++ + if i >= length { + break + } + } +} diff --git a/align/seqbag.go b/align/seqbag.go index dc4a55e..69288b2 100644 --- a/align/seqbag.go +++ b/align/seqbag.go @@ -26,6 +26,7 @@ type SeqBag interface { AlphabetCharToIndex(c rune) int // Returns index of the character (nt or aa) in the AlphabetCharacters() array AutoAlphabet() // detects and sets alphabet automatically for all the sequences CharStats() map[rune]int64 + UniqueCharacters() []rune CharStatsSeq(idx int) (map[rune]int, error) // Computes frequency of characters for the given sequence CleanNames(namemap map[string]string) // Clean sequence names (newick special char) Clear() // Removes all sequences @@ -37,6 +38,7 @@ type SeqBag interface { GetSequenceChar(name string) ([]rune, bool) GetSequenceCharById(ith int) ([]rune, bool) GetSequenceNameById(ith int) (string, bool) + GetSequenceByName(name string) (Sequence, bool) SetSequenceChar(ithAlign, ithSite int, char rune) error IgnoreIdentical(bool) // if true, then it won't add the sequence if a sequence with the same name AND same sequence exists SampleSeqBag(nb int) (SeqBag, error) // generate a sub sample of the sequences @@ -303,6 +305,15 @@ func (sb *seqbag) GetSequence(name string) (string, bool) { return "", ok } +// If sequence exists in alignment, return true,sequence +// Otherwise, return false,nil +func (sb *seqbag) GetSequenceByName(name string) (Sequence, bool) { + if seq, ok := sb.seqmap[name]; ok { + return seq, true + } + return nil, false +} + // If sequence exists in alignment, return sequence,true // Otherwise, return "",false func (sb *seqbag) GetSequenceById(ith int) (string, bool) { @@ -492,16 +503,43 @@ func (sb *seqbag) AutoAlphabet() { } // Returns the distribution of all characters -func (sb *seqbag) CharStats() map[rune]int64 { - outmap := make(map[rune]int64) +func (sb *seqbag) CharStats() (chars map[rune]int64) { + chars = make(map[rune]int64) + present := make([]int, 130) for _, seq := range sb.seqs { for _, r := range seq.sequence { - outmap[unicode.ToUpper(r)]++ + present[unicode.ToUpper(r)]++ + } + } + + for i, r := range present { + if r > 0 { + chars[rune(i)] = int64(r) + } + } + + return +} + +// Returns the distribution of all characters +func (sb *seqbag) UniqueCharacters() (chars []rune) { + chars = make([]rune, 0, 40) + present := make([]bool, 130) + + for _, seq := range sb.seqs { + for _, r := range seq.sequence { + present[unicode.ToUpper(r)] = true + } + } + + for i, r := range present { + if r { + chars = append(chars, rune(i)) } } - return outmap + return } // CharStatsSeq Returns the frequency of all characters in the sequence diff --git a/align/seqbag_test.go b/align/seqbag_test.go new file mode 100644 index 0000000..cb5432b --- /dev/null +++ b/align/seqbag_test.go @@ -0,0 +1,40 @@ +package align + +import ( + "reflect" + "testing" +) + +func Test_seqbag_UniqueCharacters(t *testing.T) { + type fields struct { + seqmap map[string]*seq + seqs []*seq + ignoreidentical bool + alphabet int + } + tests := []struct { + name string + fields fields + wantChars []rune + }{ + {name: "t1", + fields: fields{seqmap: nil, + seqs: []*seq{ + &seq{sequence: []rune("ACGTACGTACGT")}, + &seq{sequence: []rune("ACGTAC*TACGT")}}}, + wantChars: []rune("*ACGT")}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sb := &seqbag{ + seqmap: tt.fields.seqmap, + seqs: tt.fields.seqs, + ignoreidentical: tt.fields.ignoreidentical, + alphabet: tt.fields.alphabet, + } + if gotChars := sb.UniqueCharacters(); !reflect.DeepEqual(gotChars, tt.wantChars) { + t.Errorf("seqbag.UniqueCharacters() = %v, want %v", gotChars, tt.wantChars) + } + }) + } +} diff --git a/align/sequence.go b/align/sequence.go index 71031a7..941deac 100644 --- a/align/sequence.go +++ b/align/sequence.go @@ -29,6 +29,12 @@ type Sequence interface { NumGapsOpenning() int // Number of Gaps opennin, it counts streches of gap only once NumGapsFromStart() int // Number of Gaps from Start (until a non gap is encountered) NumGapsFromEnd() int // Number of Gaps from End (until a non gap is encountered) + // returns the number of differences between the reference sequence and each sequence of the alignment + // If lengths are different, returns an error + // It does not take into account 'N' and '-' in sequences as mutations compared to ref + /// sequence (ref sequence can have a '-' or a 'N') + NumMutationsComparedToReferenceSequence(alphabet int, seq Sequence) (nummutations int, err error) + Clone() Sequence } @@ -250,6 +256,53 @@ func (s *seq) NumGapsFromEnd() (numgaps int) { return } +// returns the number of differences between the reference sequence and each sequence of the alignment +// Counts only non GAPS and non N sites in each sequences (may be a gap or a N in the reference sequence though) +// If a character is ambigous (IUPAC notation), then it is counted as a mutation only if it is incompatible with +// the reference character. +// +// If lengths are different, returns an error +func (s *seq) NumMutationsComparedToReferenceSequence(alphabet int, refseq Sequence) (nummutations int, err error) { + var refseqCode []uint8 + var nt uint8 + + if refseq.Length() != s.Length() { + err = fmt.Errorf("Reference sequence and sequence do not have same length (%d,%d), cannot compute a number of mutation", refseq.Length(), s.Length()) + return + } + + all := '.' + if alphabet == NUCLEOTIDS { + refseqCode = make([]uint8, s.Length()) + for i := 0; i < s.Length(); i++ { + if refseqCode[i], err = Nt2IndexIUPAC(refseq.SequenceChar()[i]); err != nil { + return + } + } + all = ALL_NUCLE + } else { + all = ALL_AMINO + } + + for i := 0; i < s.Length(); i++ { + eq := true + if alphabet == NUCLEOTIDS { + if nt, err = Nt2IndexIUPAC(s.sequence[i]); err != nil { + return + } + if eq, err = EqualOrCompatible(nt, refseqCode[i]); err != nil { + return + } + } else { + eq = (s.sequence[i] == refseq.SequenceChar()[i]) + } + if s.SequenceChar()[i] != GAP && s.SequenceChar()[i] != all && !eq { + nummutations++ + } + } + return +} + // Translates the given sequence start at nucleotide index "phase" // // If the sequence is not nucleotidic, then throws an error @@ -321,6 +374,7 @@ func (s *seq) Clone() Sequence { // The 3 nucleotites in arguments are converted to upper case and U converted to T. // If one character does not correspond to a known nucleotide in IUPAC code, then // Returns an empty slice. +// If one of the nucleotides is a GAP, then returns an empty slice. // // For example GenAllPossibleCodons('A','G','N') should return // {"AGA","AGC","AGG","AGT"}. diff --git a/align/sequence_test.go b/align/sequence_test.go index 4f3c3b6..ad65587 100644 --- a/align/sequence_test.go +++ b/align/sequence_test.go @@ -53,7 +53,8 @@ func Test_seq_Translate(t *testing.T) { wantErr bool }{ {name: "Seq1", fields: fields{name: "seq1", sequence: []rune{'G', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'R', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'D', 'K', 'Y', 'H', 'X', '*'}}, wantErr: false}, - {name: "Seq1", fields: fields{name: "seq1", sequence: []rune{'G', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'A', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'D', 'K', 'Y', 'H', 'N', '*'}}, wantErr: false}, + {name: "Seq2", fields: fields{name: "seq1", sequence: []rune{'G', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'A', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'D', 'K', 'Y', 'H', 'N', '*'}}, wantErr: false}, + {name: "Seq3", fields: fields{name: "seq1", sequence: []rune{'-', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'A', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'X', 'K', 'Y', 'H', 'N', '*'}}, wantErr: false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/cmd/rarefy.go b/cmd/rarefy.go index 803f11a..d2adea5 100644 --- a/cmd/rarefy.go +++ b/cmd/rarefy.go @@ -10,6 +10,7 @@ import ( "github.com/evolbioinfo/goalign/align" "github.com/evolbioinfo/goalign/io" + "github.com/evolbioinfo/goalign/io/utils" "github.com/spf13/cobra" ) @@ -138,7 +139,7 @@ func parseCountFile(file string) (counts map[string]int, err error) { } else { r = bufio.NewReader(f) } - l, e := Readln(r) + l, e := utils.Readln(r) for e == nil { cols := strings.Split(l, "\t") if cols == nil || len(cols) != 2 { @@ -149,7 +150,7 @@ func parseCountFile(file string) (counts map[string]int, err error) { return } counts[cols[0]] = c - l, e = Readln(r) + l, e = utils.Readln(r) } return diff --git a/cmd/root.go b/cmd/root.go index 245a642..0e07023 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -288,23 +288,6 @@ func openWriteFile(file string) (f *os.File, err error) { return } -// Readln returns a single line (without the ending \n) -// from the input buffered reader. -// An error is returned iff there is an error with the -// buffered reader. -func Readln(r *bufio.Reader) (string, error) { - var ( - isPrefix bool = true - err error = nil - line, ln []byte - ) - for isPrefix && err == nil { - line, isPrefix, err = r.ReadLine() - ln = append(ln, line...) - } - return string(ln), err -} - func readMapFile(file string, revert bool) (map[string]string, error) { outmap := make(map[string]string, 0) var mapfile *os.File @@ -324,7 +307,7 @@ func readMapFile(file string, revert bool) (map[string]string, error) { } else { reader = bufio.NewReader(mapfile) } - line, e := Readln(reader) + line, e := utils.Readln(reader) nl := 1 for e == nil { cols := strings.Split(line, "\t") @@ -336,7 +319,7 @@ func readMapFile(file string, revert bool) (map[string]string, error) { } else { outmap[cols[0]] = cols[1] } - line, e = Readln(reader) + line, e = utils.Readln(reader) nl++ } diff --git a/cmd/stats.go b/cmd/stats.go index 7ba479d..0ac6617 100644 --- a/cmd/stats.go +++ b/cmd/stats.go @@ -7,11 +7,13 @@ import ( "github.com/evolbioinfo/goalign/align" "github.com/evolbioinfo/goalign/io" + "github.com/evolbioinfo/goalign/io/countprofile" "github.com/spf13/cobra" ) var statpersequences bool var statrefsequence string +var statcountprofile string // statsCmd represents the stats command var statsCmd = &cobra.Command{ @@ -33,12 +35,19 @@ If --per-sequences is given, then it will print the following stats, for each se 2. Number of consecutive gaps at the beginning of the sequence; 3. Number of consecutive gaps at the end of the sequence; 4. Number of gaps unique to the sequence (present in no other sequence); + If --profile is given along, then the output will be : unique\tnew\tboth, with: + - 4a unique: # gaps that are unique in each sequence in the alignment + - 4b new: # gaps that are new in each sequence compared to the profile + - 4c both: # gaps that are unique in each sequence in the alignment and that are new compared the profile 5. Number of gap opennings (streches of gaps are counted once); 6. Number of Unique mutations; + If --profile is given along, then the output will be : unique\tnew\tboth, with: + - 6a unique: # mutations that are unique in each sequence in the alignment + - 6b new: # mutations that are new in each sequence compared to the profile + - 6c both: # mutations that are unique in each sequence in the alignment and that are new compared the profile 7. Number of mutations compared to a reference sequence (given with --ref-sequence, otherwise, no column); 8. Length of the sequence without gaps; 9..n Number of occurence of each character (A,C,G, etc.). - `, RunE: func(cmd *cobra.Command, args []string) (err error) { var aligns *align.AlignChannel @@ -57,9 +66,16 @@ If --per-sequences is given, then it will print the following stats, for each se fmt.Fprintf(os.Stdout, "alphabet\t%s\n", al.AlphabetStr()) } else { var refseq align.Sequence + var profile *align.CountProfile var s string var ok bool var sb align.SeqBag + if statcountprofile != "none" { + if profile, err = countprofile.FromFile(statcountprofile); err != nil { + io.LogError(err) + return + } + } if statrefsequence != "none" { // We try to get the sequence from its name in the alignment if s, ok = al.GetSequence(statrefsequence); !ok { @@ -77,7 +93,7 @@ If --per-sequences is given, then it will print the following stats, for each se } refseq = align.NewSequence("ref", []rune(s), "") } - err = printAllSequenceStats(al, refseq) + err = printAllSequenceStats(al, refseq, profile) } } @@ -116,39 +132,42 @@ func printCharStats(align align.Alignment, only string) { } } -func printSiteCharStats(align align.Alignment, only string) (err error) { - var sitemap map[rune]int +func printSiteCharStats(al align.Alignment, only string) (err error) { + var profile *align.CountProfile + var ok bool + var indexonly int - charmap := align.CharStats() - - // We add the only character we want to output - // To write 0 if there are no occurences of it - // in the alignment - if _, ok := charmap[rune(only[0])]; !ok && only != "*" { - charmap[rune(only[0])] = 0 + profile = align.NewCountProfileFromAlignment(al) + onlyr := []rune(only) + if len(onlyr) > 1 { + err = fmt.Errorf("Character should have length 1: %s", only) } - keys := make([]string, 0, len(charmap)) - for k := range charmap { - keys = append(keys, string(k)) - } - sort.Strings(keys) fmt.Fprintf(os.Stdout, "site") - for _, v := range keys { - if only == "*" || v == only { - fmt.Fprintf(os.Stdout, "\t%s", v) + if only == "*" { + indexonly = -1 + } else { + if indexonly, ok = profile.NameIndex(onlyr[0]); !ok { + for site := 0; site < al.Length(); site++ { + fmt.Fprintf(os.Stdout, "%d0%d\n", site, 0) + } + return } } - fmt.Fprintf(os.Stdout, "\n") - for site := 0; site < align.Length(); site++ { - if sitemap, err = align.CharStatsSite(site); err != nil { - return + + for index := 0; index < profile.NbCharacters(); index++ { + r, _ := profile.NameAt(index) + if only == "*" || r == onlyr[0] { + fmt.Fprintf(os.Stdout, "\t%c", r) } + } + fmt.Fprintf(os.Stdout, "\n") + for site := 0; site < al.Length(); site++ { fmt.Fprintf(os.Stdout, "%d", site) - for _, k := range keys { - if only == "*" || k == only { - nb := sitemap[rune(k[0])] - fmt.Fprintf(os.Stdout, "\t%d", nb) + for index := 0; index < profile.NbCharacters(); index++ { + if indexonly == -1 || index == indexonly { + count, _ := profile.CountAt(index, site) + fmt.Fprintf(os.Stdout, "\t%d", count) } } fmt.Fprintf(os.Stdout, "\n") @@ -197,39 +216,53 @@ func printSequenceCharStats(sb align.SeqBag, only string) (err error) { return } -func printAllSequenceStats(al align.Alignment, refSequence align.Sequence) (err error) { +func printAllSequenceStats(al align.Alignment, refSequence align.Sequence, countProfile *align.CountProfile) (err error) { var sequencemap map[rune]int - var numgapsunique, nummutuniques, nummutations []int + + var numnewgaps []int // new gaps that are not found in the profile + var numnewmuts []int // new mutations that are not found in the profile + + var nummutuniques []int // mutations that are unique in the given alignment + var numgapsuniques []int // gaps that are unique in the given alignment + + var nummutsboth []int // mutations that are unique in the given alignment and not found in the profile + var numgapsboth []int // gaps that are unique in the given alignment and not found in the profile + + var nummutations int var gaps int var name string + var uniquechars []rune - charmap := al.CharStats() - numgapsunique = al.NumGapsUniquePerSequence() - nummutuniques = al.NumMutationsUniquePerSequence() - if refSequence != nil { - if nummutations, err = al.NumMutationsComparedToReferenceSequence(refSequence); err != nil { - io.LogError(err) - return - } + uniquechars = al.UniqueCharacters() + + if numgapsuniques, numnewgaps, numgapsboth, err = al.NumGapsUniquePerSequence(countProfile); err != nil { + return } - keys := make([]string, 0, len(charmap)) - for k := range charmap { - keys = append(keys, string(k)) + if nummutuniques, numnewmuts, nummutsboth, err = al.NumMutationsUniquePerSequence(countProfile); err != nil { + return } - sort.Strings(keys) + fmt.Fprintf(os.Stdout, "sequence") fmt.Fprintf(os.Stdout, "\tgaps") fmt.Fprintf(os.Stdout, "\tgapsstart") fmt.Fprintf(os.Stdout, "\tgapsend") fmt.Fprintf(os.Stdout, "\tgapsuniques") + if countProfile != nil { + fmt.Fprintf(os.Stdout, "\tgapsnew") + fmt.Fprintf(os.Stdout, "\tgapsboth") + } fmt.Fprintf(os.Stdout, "\tgapsopenning") fmt.Fprintf(os.Stdout, "\tmutuniques") + if countProfile != nil { + fmt.Fprintf(os.Stdout, "\tmutsnew") + fmt.Fprintf(os.Stdout, "\tmutsboth") + } if refSequence != nil { fmt.Fprintf(os.Stdout, "\tmutref") } fmt.Fprintf(os.Stdout, "\tlength") - for _, v := range keys { - fmt.Fprintf(os.Stdout, "\t%s", v) + for _, v := range uniquechars { + fmt.Fprintf(os.Stdout, "\t%c", v) } fmt.Fprintf(os.Stdout, "\n") @@ -243,15 +276,27 @@ func printAllSequenceStats(al align.Alignment, refSequence align.Sequence) (err fmt.Printf("\t%d", gaps) fmt.Printf("\t%d", s.NumGapsFromStart()) fmt.Printf("\t%d", s.NumGapsFromEnd()) - fmt.Printf("\t%d", numgapsunique[i]) + fmt.Printf("\t%d", numgapsuniques[i]) + if countProfile != nil { + fmt.Printf("\t%d", numnewgaps[i]) + fmt.Printf("\t%d", numgapsboth[i]) + } fmt.Printf("\t%d", s.NumGapsOpenning()) fmt.Printf("\t%d", nummutuniques[i]) + if countProfile != nil { + fmt.Printf("\t%d", numnewmuts[i]) + fmt.Printf("\t%d", nummutsboth[i]) + } if refSequence != nil { - fmt.Printf("\t%d", nummutations[i]) + if nummutations, err = s.NumMutationsComparedToReferenceSequence(al.Alphabet(), refSequence); err != nil { + io.LogError(err) + return + } + fmt.Printf("\t%d", nummutations) } fmt.Printf("\t%d", s.Length()-gaps) - for _, k := range keys { - nb := sequencemap[rune(k[0])] + for _, k := range uniquechars { + nb := sequencemap[k] fmt.Printf("\t%d", nb) } fmt.Printf("\n") @@ -275,4 +320,5 @@ func init() { RootCmd.AddCommand(statsCmd) statsCmd.PersistentFlags().BoolVar(&statpersequences, "per-sequences", false, "Prints statistics per alignment sequences") statsCmd.PersistentFlags().StringVar(&statrefsequence, "ref-sequence", "none", "Reference sequence to compare each sequence with (only with --per-sequences") + statsCmd.PersistentFlags().StringVar(&statcountprofile, "count-profile", "none", "A profile to compare the alignment with, and to compute statistics faster (only with --per-sequences)") } diff --git a/cmd/stats_gaps.go b/cmd/stats_gaps.go index b6389ca..3f732a6 100644 --- a/cmd/stats_gaps.go +++ b/cmd/stats_gaps.go @@ -5,6 +5,7 @@ import ( "github.com/evolbioinfo/goalign/align" "github.com/evolbioinfo/goalign/io" + "github.com/evolbioinfo/goalign/io/countprofile" "github.com/spf13/cobra" ) @@ -12,6 +13,7 @@ var statGapsFromStart bool var statGapsFromEnd bool var statGapsUnique bool var statGapsOpenning bool +var statGapsProfile string // charCmd represents the char command var statGapsCmd = &cobra.Command{ @@ -24,7 +26,11 @@ var statGapsCmd = &cobra.Command{ Following options are exclusive, and given in order of priority: - If --from-start is specified, then counts only gaps at sequence starts; - If --from-end is specified, then counts only gaps at sequence ends; - - If --unique is specified, then counts only gaps that are unique in their column + - If --unique is specified, then counts only gaps that are unique in their column. + If --profile is given along --unique, then the output will be : unique\tnew\tboth, with: + - unique: # gaps that are unique in each sequence in the alignment + - new: # gaps that are new in each sequence compared to the profile + - both: # gaps that are unique in each sequence in the alignment and that are new compared the profile - If --openning is specified, then counts only gap openning (streches of gaps are counted once) - Otherwise, counts total number of gaps for the given sequence. @@ -32,6 +38,7 @@ var statGapsCmd = &cobra.Command{ `, RunE: func(cmd *cobra.Command, args []string) (err error) { var aligns *align.AlignChannel + var profile *align.CountProfile if aligns, err = readalign(infile); err != nil { io.LogError(err) @@ -45,9 +52,22 @@ var statGapsCmd = &cobra.Command{ return } - var numgapsunique []int + if statGapsProfile != "none" { + if profile, err = countprofile.FromFile(statGapsProfile); err != nil { + io.LogError(err) + return + } + } + + var numnewgaps []int // new gaps that are not found in the profile + var numgapsuniques []int // gaps that are unique in the given alignment + var numgapsboth []int // gaps that are unique in the given alignment and not found in the profile + if statGapsUnique { - numgapsunique = al.NumGapsUniquePerSequence() + if numgapsuniques, numnewgaps, numgapsboth, err = al.NumGapsUniquePerSequence(profile); err != nil { + io.LogError(err) + return + } } for i, s := range al.Sequences() { @@ -56,7 +76,11 @@ var statGapsCmd = &cobra.Command{ } else if statGapsFromEnd { fmt.Printf("%s\t%d\n", s.Name(), s.NumGapsFromEnd()) } else if statGapsUnique { - fmt.Printf("%s\t%d\n", s.Name(), numgapsunique[i]) + fmt.Printf("%s\t%d", s.Name(), numgapsuniques[i]) + if statGapsProfile != "none" { + fmt.Printf("\t%d\t%d", numnewgaps[i], numgapsboth[i]) + } + fmt.Printf("\n") } else if statGapsOpenning { fmt.Printf("%s\t%d\n", s.Name(), s.NumGapsOpenning()) } else { @@ -73,6 +97,7 @@ func init() { statGapsCmd.PersistentFlags().BoolVar(&statGapsFromEnd, "from-end", false, "Count gaps in each sequence from end of sequences (until a non gap character is encountered)") statGapsCmd.PersistentFlags().BoolVar(&statGapsUnique, "unique", false, "Count, in each sequence, the number of gaps that are unique in a site") statGapsCmd.PersistentFlags().BoolVar(&statGapsOpenning, "openning", false, "Count, in each sequence, the number of gaps openning (a strech of gaps is counted once)") + statGapsCmd.PersistentFlags().StringVar(&statGapsProfile, "count-profile", "none", "A profile to compare the alignment with, and to compute statistics faster (only with --unique)") statsCmd.AddCommand(statGapsCmd) } diff --git a/cmd/stats_mutations.go b/cmd/stats_mutations.go index a9af8d3..e85be54 100644 --- a/cmd/stats_mutations.go +++ b/cmd/stats_mutations.go @@ -5,11 +5,13 @@ import ( "github.com/evolbioinfo/goalign/align" "github.com/evolbioinfo/goalign/io" + "github.com/evolbioinfo/goalign/io/countprofile" "github.com/spf13/cobra" ) var statMutationsRef string var statMutationsUnique bool +var statMutationsProfile string // charCmd represents the char command var statMutationsCmd = &cobra.Command{ @@ -19,6 +21,10 @@ var statMutationsCmd = &cobra.Command{ - If --unique is specified, then counts only mutations (characters) that are unique in their column for the given sequence. + If --profile is given along --unique, then the output will be : unique\tnew\tboth, with: + - unique: # mutations that are unique in each sequence in the alignment + - new: # mutations that are new in each sequence compared to the profile + - both: # mutations that are unique in each sequence in the alignment and that are new compared the profile - If --ref-sequence is specified, it will try to extract a seqsuence having that name from the alignment. If none exist, it will try to open a fasta file with the given name to take the first sequence as a reference. If a character is ambigous (IUPAC notation) in an nucleotide sequence, then it is counted as a mutation only if it is incompatible with the reference character. @@ -29,6 +35,7 @@ var statMutationsCmd = &cobra.Command{ `, RunE: func(cmd *cobra.Command, args []string) (err error) { var aligns *align.AlignChannel + var profile *align.CountProfile if aligns, err = readalign(infile); err != nil { io.LogError(err) @@ -42,7 +49,18 @@ var statMutationsCmd = &cobra.Command{ return } + if statMutationsProfile != "none" { + if profile, err = countprofile.FromFile(statMutationsProfile); err != nil { + io.LogError(err) + return + } + } + var nummutations []int + var numnewmuts []int // new mutations that are not found in the profile + var nummutsboth []int // mutations that are unique in the given alignment and not found in the profile + var num int + if statMutationsRef != "none" { var s string var sb align.SeqBag @@ -61,20 +79,29 @@ var statMutationsCmd = &cobra.Command{ } s, _ = sb.GetSequenceById(0) } - if nummutations, err = al.NumMutationsComparedToReferenceSequence(align.NewSequence("ref", []rune(s), "")); err != nil { - io.LogError(err) - return + for _, s2 := range al.Sequences() { + if num, err = s2.NumMutationsComparedToReferenceSequence(al.Alphabet(), align.NewSequence("ref", []rune(s), "")); err != nil { + io.LogError(err) + return + } + fmt.Printf("%s\t%d\n", s2.Name(), num) } + } else if statMutationsUnique { - nummutations = al.NumMutationsUniquePerSequence() + nummutations, numnewmuts, nummutsboth, err = al.NumMutationsUniquePerSequence(profile) + for i, s := range al.Sequences() { + fmt.Printf("%s\t%d", s.Name(), nummutations[i]) + if profile != nil { + fmt.Printf("\t%d", numnewmuts[i]) + fmt.Printf("\t%d", nummutsboth[i]) + } + fmt.Printf("\n") + } } else { err = fmt.Errorf("Mutations should be counted by comparing to a reference sequnce with --ref-sequence") io.LogError(err) return } - for i, s := range al.Sequences() { - fmt.Printf("%s\t%d\n", s.Name(), nummutations[i]) - } return }, @@ -83,6 +110,7 @@ var statMutationsCmd = &cobra.Command{ func init() { statMutationsCmd.PersistentFlags().StringVar(&statMutationsRef, "ref-sequence", "none", "Reference sequence to compare each sequence with.") statMutationsCmd.PersistentFlags().BoolVar(&statMutationsUnique, "unique", false, "Count, in each sequence, the number of mutations/characters that are unique in a site") + statMutationsCmd.PersistentFlags().StringVar(&statMutationsProfile, "count-profile", "none", "A profile to compare the alignment with, and to compute statistics faster (only with --unique)") statsCmd.AddCommand(statMutationsCmd) } diff --git a/cmd/subset.go b/cmd/subset.go index 1a8fe50..f107faf 100644 --- a/cmd/subset.go +++ b/cmd/subset.go @@ -10,6 +10,7 @@ import ( "github.com/evolbioinfo/goalign/align" "github.com/evolbioinfo/goalign/io" + "github.com/evolbioinfo/goalign/io/utils" "github.com/spf13/cobra" ) @@ -176,12 +177,12 @@ func parseNameFile(file string) (subset map[string]int, err error) { r = bufio.NewReader(f) } - l, e := Readln(r) + l, e := utils.Readln(r) for e == nil { for _, name := range strings.Split(l, ",") { subset[name] = 1 } - l, e = Readln(r) + l, e = utils.Readln(r) } return } diff --git a/docs/commands/stats.md b/docs/commands/stats.md index e25d3c7..4f84297 100644 --- a/docs/commands/stats.md +++ b/docs/commands/stats.md @@ -29,10 +29,20 @@ Different sub-commands: * `goalign stats alleles`: Prints the average number of alleles per site of the alignment; * `goalign stats alphabet`: Prints the alphabet of the alignemnts (aminoacids, nucleotides, unknown); * `goalign stats char`: Prints the character number of occurences. If `--per-sequences` is given, then prints the number of occurences of each characters for each seqences. If `--per-sites` is given, then prints the number of occurences of each characters for each sites. Is is possible to give `--only` option, to count the number of occurences of a single character. -* `goalign stats gaps`: Prints the number of gaps in each sequences (and possibly the number of gaps from start, and from end); By default, it prints, for each alignment sequence the number of gaps. Following options are exclusive, and given in order of priority: If `--from-start` is specified, then counts only gaps at sequence starts; If `--from-end` is specified, then counts only gaps at sequence ends; If `--unique` is specified, then counts only gaps that are unique in their column; If` --openning` is specified, then counts only gap openning (streches of gaps are counted once); Otherwise, counts total number of gaps on each sequence. +* `goalign stats gaps`: Prints the number of gaps in each sequences (and possibly the number of gaps from start, and from end); By default, it prints, for each alignment sequence the number of gaps. Following options are exclusive, and given in order of priority: If `--from-start` is specified, then counts only gaps at sequence starts; If `--from-end` is specified, then counts only gaps at sequence ends; If `--unique` is specified, then counts only gaps that are unique in their alignmebnnt columnIf` --openning` is specified, then counts only gap openning (streches of gaps are counted once); Otherwise, counts total number of gaps on each sequence. If `--profile` is given in addition to `--unique`, then the output will be : `unique\tnew\tboth`, with: + + - unique: # gaps that are unique in their column, for each sequence of the alignment + - new: # gaps that are new in each sequence compared to the profile + - both: # gaps that are unique in each sequence in the alignment and that are new compared the profile. + * `goalign stats length`: Prints alignment length; * `goalign stats maxchar`: Prints max occurence char for each alignment site; -* `goalign stats mutations`: Prints, for each sequence, the number of mutations on each alignment sequence, compared to a reference sequence or unique compared to all other sequences. It does not take into account '-' and 'N' as unique mutations, and does not take into account '-' and 'N' as mutations compared to a reference sequence; If `--unique` is specified, then counts only mutations (characters) that are unique in their column for the given sequence. If `--ref-sequence` is specified, it will try to extract a sequence having that name from the alignment. If none exist, it will try to open a fasta file with the given name to take the first sequence as a reference. If a character is ambigous (IUPAC notation) in an nucleotide sequence, then it is counted as a mutation only if it is incompatible with the reference character. +* `goalign stats mutations`: Prints, for each sequence, the number of mutations on each alignment sequence, compared to a reference sequence or unique compared to all other sequences. It does not take into account '-' and 'N' as unique mutations, and does not take into account '-' and 'N' as mutations compared to a reference sequence; If `--unique` is specified, then counts only mutations (characters) that are unique in their column for the given sequence. If `--ref-sequence` is specified, it will try to extract a sequence having that name from the alignment. If none exist, it will try to open a fasta file with the given name to take the first sequence as a reference. If a character is ambigous (IUPAC notation) in an nucleotide sequence, then it is counted as a mutation only if it is incompatible with the reference character. If `--profile` is given in addition to `--unique`, then the output will be : `unique\tnew\tboth`, with: + + - unique: # mutations that are unique in their column, for each sequence of the alignment + - new: # mutations that are new in each sequence compared to the profile + - both: # mutations that are unique in each sequence in the alignment and that are new compared the profile. + * `goalign stats nalign`: Prints the number of alignments in the input file (Phylip); * `goalign stats nseq`: Prints the number of sequences in the input alignment; * `goalign stats taxa`: Lists taxa in the input alignment. diff --git a/io/countprofile/countprofile.go b/io/countprofile/countprofile.go new file mode 100644 index 0000000..aaf3b16 --- /dev/null +++ b/io/countprofile/countprofile.go @@ -0,0 +1,82 @@ +package countprofile + +import ( + "bufio" + "compress/gzip" + "fmt" + "os" + "strconv" + "strings" + + "github.com/evolbioinfo/goalign/align" + "github.com/evolbioinfo/goalign/io/utils" +) + +// FromFile Parses a "profile" file constisting of a number of occurences of each character +// per site, tab separated, in the form: +// site - A B C D G H K M N R... +// 0 1 2 3 4 0 ... +func FromFile(file string) (p *align.CountProfile, err error) { + var f *os.File + var r *bufio.Reader + var gr *gzip.Reader + var l string + var i int + var field string + + p = align.NewCountProfile() + + if file == "stdin" || file == "-" { + f = os.Stdin + } else { + if f, err = os.Open(file); err != nil { + return + } + } + + if strings.HasSuffix(file, ".gz") { + if gr, err = gzip.NewReader(f); err != nil { + return + } + r = bufio.NewReader(gr) + } else { + r = bufio.NewReader(f) + } + + // We parse the header + if l, err = utils.Readln(r); err != nil { + return + } + headslice := strings.Split(l, "\t") + header := make([]rune, 0, len(headslice)-1) + for i, field = range headslice { + if i > 0 { + r := []rune(field) + if len(r) != 1 { + err = fmt.Errorf("Character name Should be One character") + return + } + header = append(header, r[0]) + } + } + p.SetHeader(header) + + // Then the counts + var count int = 0 + l, err = utils.Readln(r) + for err == nil { + for i, field = range strings.Split(l, "\t") { + if i > 0 { + if count, err = strconv.Atoi(field); err != nil { + return + } + p.AppendCount(i-1, count) + } + } + l, err = utils.Readln(r) + } + if err.Error() == "EOF" { + err = nil + } + return +} diff --git a/io/utils/readfiles.go b/io/utils/readfiles.go index e58ea5f..1035cbc 100644 --- a/io/utils/readfiles.go +++ b/io/utils/readfiles.go @@ -77,3 +77,20 @@ func isHttpFile(file string) bool { return strings.HasPrefix(file, "http://") || strings.HasPrefix(file, "https://") } + +// Readln returns a single line (without the ending \n) +// from the input buffered reader. +// An error is returned iff there is an error with the +// buffered reader. +func Readln(r *bufio.Reader) (string, error) { + var ( + isPrefix bool = true + err error = nil + line, ln []byte + ) + for isPrefix && err == nil { + line, isPrefix, err = r.ReadLine() + ln = append(ln, line...) + } + return string(ln), err +} diff --git a/test.sh b/test.sh index e1fbdd7..9ae9c58 100755 --- a/test.sh +++ b/test.sh @@ -3990,3 +3990,45 @@ ${GOALIGN} stats char --per-sequences -i input > result diff -q -b expected result rm -f input expected result + + +echo "->goalign stats char --per-sequences --count-profile" +cat > input <s1 +ACGTACGT +>s2 +-CGTACGT +>s3 +ACGTACGT +>s4 +ACGTTCGA +>s5 +ACGTTCGA +EOF + +cat > prof < expected < result +diff -q -b expected result +rm -f input expected result + +