From ab9176e44e25bee36bcc503f1d47e1fe09247cd3 Mon Sep 17 00:00:00 2001
From: Frederic Lemoine <fredericlemoine@users.noreply.github.com>
Date: Tue, 28 Apr 2020 15:16:57 +0200
Subject: [PATCH] Added comparison to profile for goalign stats

---
 align/align.go                  | 154 ++++++++++++++--------------
 align/align_test.go             |  82 +++++++++++----
 align/profile.go                | 172 ++++++++++++++++++++++++++++++++
 align/seqbag.go                 |  46 ++++++++-
 align/seqbag_test.go            |  40 ++++++++
 align/sequence.go               |  54 ++++++++++
 align/sequence_test.go          |   3 +-
 cmd/rarefy.go                   |   5 +-
 cmd/root.go                     |  21 +---
 cmd/stats.go                    | 140 +++++++++++++++++---------
 cmd/stats_gaps.go               |  33 +++++-
 cmd/stats_mutations.go          |  42 ++++++--
 cmd/subset.go                   |   5 +-
 docs/commands/stats.md          |  14 ++-
 io/countprofile/countprofile.go |  82 +++++++++++++++
 io/utils/readfiles.go           |  17 ++++
 test.sh                         |  42 ++++++++
 17 files changed, 766 insertions(+), 186 deletions(-)
 create mode 100644 align/profile.go
 create mode 100644 align/seqbag_test.go
 create mode 100644 io/countprofile/countprofile.go

diff --git a/align/align.go b/align/align.go
index 742d30e..c5f4b01 100644
--- a/align/align.go
+++ b/align/align.go
@@ -55,17 +55,13 @@ type Alignment interface {
 	Length() int                  // Length of the alignment
 	Mask(start, length int) error // Masks given positions
 	MaxCharStats(excludeGaps bool) ([]rune, []int)
-	Mutate(rate float64)             // Adds uniform substitutions in the alignment (~sequencing errors)
-	NbVariableSites() int            // Nb of variable sites
-	NumGapsUniquePerSequence() []int // Number of Gaps in each sequence that are unique in their alignment site
+	Mutate(rate float64)  // Adds uniform substitutions in the alignment (~sequencing errors)
+	NbVariableSites() int // Nb of variable sites
+	// Number of Gaps in each sequence that are unique in their alignment site
+	NumGapsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error)
 	// returns the number of characters in each sequence that are unique in their alignment site (gaps or others)
 	// It does not take into account 'N' and '-' as unique mutations
-	NumMutationsUniquePerSequence() (numuniques []int)
-	// returns the number of differences between the reference sequence and each sequence of the alignment
-	// If lengths are different, returns an error
-	// It does not take into account 'N' and '-' in sequences as mutations compared to ref
-	/// sequence (ref sequence can have a '-' or a 'N')
-	NumMutationsComparedToReferenceSequence(seq Sequence) (nummutations []int, err error)
+	NumMutationsUniquePerSequence(profile *CountProfile) (numuniques []int, numnew []int, nummuts []int, err error)
 	Pssm(log bool, pseudocount float64, normalization int) (pssm map[rune][]float64, err error) // Normalization: PSSM_NORM_NONE, PSSM_NORM_UNIF, PSSM_NORM_DATA
 	Rarefy(nb int, counts map[string]int) (Alignment, error)                                    // Take a new rarefied sample taking into accounts weights
 	RandSubAlign(length int) (Alignment, error)                                                 // Extract a random subalignment with given length from this alignment
@@ -1075,7 +1071,7 @@ func (a *align) Pssm(log bool, pseudocount float64, normalization int) (pssm map
 	/* We add pseudo counts */
 	if pseudocount > 0 {
 		for _, v := range pssm {
-			for i, _ := range v {
+			for i := range v {
 				v[i] += pseudocount
 			}
 		}
@@ -1364,27 +1360,55 @@ func (a *align) NbVariableSites() int {
 }
 
 // NumGapsUniquePerSequence returns the number of Gaps in the sequence that are unique in their alignment site
-func (a *align) NumGapsUniquePerSequence() (numgaps []int) {
-	numgaps = make([]int, a.NbSequences())
+// This function counts, for each sequence of the given alignment, the number of :
+// - gaps that are unique to the sequence compared to the others of the alignment
+// - gaps that are new compared to the profile (not seen in the profile) : numnew
+// - gaps that are new compared to the profile and found only once in the given alignment: numboth
+// If the profile is nil, then does not compute numnewmuts neither nummutsboth (0 filled slices)
+func (a *align) NumGapsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error) {
+	numuniques = make([]int, a.NbSequences())
+	numnew = make([]int, a.NbSequences())
+	numboth = make([]int, a.NbSequences())
+
 	uniqueIndex := -1
 	nbGapsColumn := 0
 
+	// Check that profile has the right length
+	if countProfile != nil {
+		if !countProfile.CheckLength(a.Length()) {
+			err = fmt.Errorf("Profile does not have same length than alignment")
+			return
+		}
+	}
+
+	var c int
 	for i := 0; i < a.Length(); i++ {
 		uniqueIndex = -1
 		nbGapsColumn = 0
 
 		for j, s := range a.seqs {
-			if s.sequence[i] == GAP {
+			r := s.sequence[i]
+			if r == GAP {
 				nbGapsColumn++
 				uniqueIndex = j
-			}
-			if nbGapsColumn > 1 {
-				break
+				if countProfile != nil {
+					c, _ = countProfile.Count(r, i)
+					if c == 0 {
+						numnew[j]++
+					}
+				} else if nbGapsColumn > 1 {
+					break
+				}
 			}
 		}
 
 		if nbGapsColumn == 1 {
-			numgaps[uniqueIndex]++
+			numuniques[uniqueIndex]++
+			if countProfile != nil {
+				if c, _ = countProfile.Count(GAP, i); c == 0 {
+					numboth[uniqueIndex]++
+				}
+			}
 		}
 	}
 	return
@@ -1392,78 +1416,56 @@ func (a *align) NumGapsUniquePerSequence() (numgaps []int) {
 
 // NumMutationsUniquePerSequence returns the number of characters in each sequence that are unique in their alignment site.
 // It does not take into account 'N' and '-' as unique mutations
-func (a *align) NumMutationsUniquePerSequence() (numuniques []int) {
+// This function counts, for each sequence of the given alignment, the number of :
+// - mutations that are unique to the sequence compared to the others of the alignment
+// - mutations that are new compared to the profile (not seen in the profile) : numnew
+// - mutations that are new compared to the profile and found only once in the given alignment: numboth
+// If the profile is nil, then does not compute numnewmuts neither nummutsboth (0 filled slices)
+func (a *align) NumMutationsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error) {
 	numuniques = make([]int, a.NbSequences())
-	for i := 0; i < a.Length(); i++ {
-		occurences := make(map[rune]int)
-		indices := make(map[rune]int)
-
-		for j, s := range a.seqs {
-			occurences[s.sequence[i]]++
-			indices[s.sequence[i]] = j
-		}
-
-		all := '.'
-		if a.Alphabet() == AMINOACIDS {
-			all = ALL_AMINO
-		} else if a.Alphabet() == NUCLEOTIDS {
-			all = ALL_NUCLE
-		}
-
-		for c, num := range occurences {
-			if num == 1 && c != all && c != GAP {
-				ind := indices[c]
-				numuniques[ind]++
-			}
-		}
-	}
-	return
-}
-
-// returns the number of differences between the reference sequence and each sequence of the alignment
-// Counts only non GAPS and non N sites in each sequences (may be a gap or a N in the reference sequence though)
-// If a character is ambigous (IUPAC notation), then it is counted as a mutation only if it is incompatible with
-// the reference character.
-//
-// If lengths are different, returns an error
-func (a *align) NumMutationsComparedToReferenceSequence(refseq Sequence) (nummutations []int, err error) {
-	var refseqCode []uint8
-	var nt uint8
-
-	nummutations = make([]int, a.NbSequences())
-	if refseq.Length() != a.Length() {
-		err = fmt.Errorf("Reference sequence and alignment do not have same length (%d,%d), cannot compute a number of mutation", refseq.Length(), a.Length())
-		return
-	}
+	numnew = make([]int, a.NbSequences())
+	numboth = make([]int, a.NbSequences())
 
 	all := '.'
 	if a.Alphabet() == AMINOACIDS {
 		all = ALL_AMINO
 	} else if a.Alphabet() == NUCLEOTIDS {
-		refseqCode = make([]uint8, a.Length())
-		for i := 0; i < a.Length(); i++ {
-			if refseqCode[i], err = Nt2IndexIUPAC(refseq.SequenceChar()[i]); err != nil {
-				return
-			}
-		}
 		all = ALL_NUCLE
 	}
 
+	// Check that profile has the right length
+	if countProfile != nil {
+		if !countProfile.CheckLength(a.Length()) {
+			err = fmt.Errorf("Profile does not have same length than alignment")
+			return
+		}
+	}
+
+	var c int
 	for i := 0; i < a.Length(); i++ {
+		occurences := make([]int, 130)
+		indices := make([]int, 130)
+
 		for j, s := range a.seqs {
-			eq := true
-			if a.Alphabet() == NUCLEOTIDS {
-				if nt, err = Nt2IndexIUPAC(s.sequence[i]); err != nil {
-					return
+			r := s.sequence[i]
+			occurences[int(r)]++
+			indices[int(r)] = j
+			if countProfile != nil && r != all && r != GAP {
+				if c, _ = countProfile.Count(r, i); c == 0 {
+					numnew[j]++
 				}
-				if eq, err = EqualOrCompatible(nt, refseqCode[i]); err != nil {
-					return
-				}
-			} else {
-				eq = (s.sequence[i] == refseq.SequenceChar()[i])
 			}
-			if s.SequenceChar()[i] != GAP && s.SequenceChar()[i] != all && !eq {
-				nummutations[j]++
+		}
+
+		for c, num := range occurences {
+			if num == 1 && rune(c) != all && rune(c) != GAP {
+				ind := indices[c]
+				numuniques[ind]++
+				if countProfile != nil {
+					if c, _ = countProfile.Count(rune(c), i); c == 0 {
+						numboth[ind]++
+					}
+				}
 			}
 		}
 	}
diff --git a/align/align_test.go b/align/align_test.go
index 7b90091..8620316 100644
--- a/align/align_test.go
+++ b/align/align_test.go
@@ -1786,46 +1786,83 @@ func TestConsensusGaps(t *testing.T) {
 }
 
 func Test_align_NumGapsUniquePerSequence(t *testing.T) {
-	var a Alignment
-	var ng []int
-	var exp []int
+	var a, ref Alignment
+	var ng, nn, nb []int
+	var expng, expnn, expnb []int
+	var err error
+	var profile *CountProfile
 
 	a = NewAlign(NUCLEOTIDS)
 	a.AddSequence("A", "ACGACGA-GACC", "")
 	a.AddSequence("B", "AT-TT-T-TTTC", "")
 	a.AddSequence("C", "ATCTT-TTT--T", "")
 
-	exp = []int{0, 1, 2}
+	ref = NewAlign(NUCLEOTIDS)
+	ref.AddSequence("A", "AAAAAAAAAAAA", "")
+	ref.AddSequence("B", "AAAAAAAAAAAA", "")
+	ref.AddSequence("C", "AAAAAAAAAAAA", "")
 
-	ng = a.NumGapsUniquePerSequence()
+	expng = []int{0, 1, 2}
+	expnn = []int{1, 3, 3}
+	expnb = []int{0, 1, 2}
+
+	profile = NewCountProfileFromAlignment(ref)
+	if ng, nn, nb, err = a.NumGapsUniquePerSequence(profile); err != nil {
+		t.Error(err)
+	}
 
-	if !reflect.DeepEqual(exp, ng) {
-		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, exp))
+	if !reflect.DeepEqual(expng, ng) {
+		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, expng))
+	}
+	if !reflect.DeepEqual(expnn, nn) {
+		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nn, expnn))
+	}
+	if !reflect.DeepEqual(expnb, nb) {
+		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nb, expnb))
 	}
 }
 
 func Test_align_NumMutationsUniquePerSequence(t *testing.T) {
-	var a Alignment
-	var ng []int
-	var exp []int
+	var a, ref Alignment
+	var ng, nn, nb []int
+	var expng, expnn, expnb []int
+	var err error
+	var profile *CountProfile
 
 	a = NewAlign(NUCLEOTIDS)
 	a.AddSequence("A", "ACGACGA-GACC", "")
 	a.AddSequence("B", "AT-TT-T-TTTC", "")
 	a.AddSequence("C", "ATCTT-TTT--T", "")
 
-	exp = []int{9, 2, 3}
+	ref = NewAlign(NUCLEOTIDS)
+	ref.AddSequence("A", "AAAAAAAAAAAA", "")
+	ref.AddSequence("B", "AAAAAAAAAAAA", "")
+	ref.AddSequence("C", "AAAAAAAAAAAA", "")
 
-	ng = a.NumMutationsUniquePerSequence()
+	expng = []int{9, 2, 3}
+	expnn = []int{7, 8, 8}
+	expnb = []int{6, 2, 3}
 
-	if !reflect.DeepEqual(exp, ng) {
-		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, exp))
+	profile = NewCountProfileFromAlignment(ref)
+	if ng, nn, nb, err = a.NumMutationsUniquePerSequence(profile); err != nil {
+		t.Error(err)
+	}
+
+	if !reflect.DeepEqual(expng, ng) {
+		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", ng, expng))
+	}
+	if !reflect.DeepEqual(expnn, nn) {
+		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nn, expnn))
+	}
+
+	if !reflect.DeepEqual(expnb, nb) {
+		t.Error(fmt.Errorf("Numgaps is not what is expected, have %v, want %v", nb, expnb))
 	}
 }
 
 func Test_align_NumMutationsComparedToReferenceSequence(t *testing.T) {
 	var a Alignment
-	var ng []int
+	var ng int
 	var exp []int
 	var err error
 
@@ -1835,15 +1872,16 @@ func Test_align_NumMutationsComparedToReferenceSequence(t *testing.T) {
 	a.AddSequence("C", "ATCTT-TTT--T", "")
 	a.AddSequence("D", "CCCCCCCCCCCC", "")
 
-	s := NewSequence("ref", []rune("CCCCCCCCCCCC"), "")
+	ref := NewSequence("ref", []rune("CCCCCCCCCCCC"), "")
 
 	exp = []int{7, 8, 8, 0}
 
-	if ng, err = a.NumMutationsComparedToReferenceSequence(s); err != nil {
-		t.Error(err)
-	}
-
-	if !reflect.DeepEqual(exp, ng) {
-		t.Error(fmt.Errorf("Nummutations is not what is expected, have %v, want %v", ng, exp))
+	for i, s := range a.Sequences() {
+		if ng, err = s.NumMutationsComparedToReferenceSequence(a.Alphabet(), ref); err != nil {
+			t.Error(err)
+		}
+		if !reflect.DeepEqual(exp[i], ng) {
+			t.Error(fmt.Errorf("Nummutations is not what is expected, have %v, want %v", ng, exp))
+		}
 	}
 }
diff --git a/align/profile.go b/align/profile.go
new file mode 100644
index 0000000..bffda89
--- /dev/null
+++ b/align/profile.go
@@ -0,0 +1,172 @@
+package align
+
+import (
+	"fmt"
+)
+
+// CountProfile represents a simple view of an alignment
+// and stores the number of occurences of each characters at
+// each position of an alignment
+type CountProfile struct {
+	names  []int   // mapping between character and index. names uses utf8 int value to store indexes names[int(r)]=index
+	header []rune  // characters in the count table, similar to prev map
+	counts [][]int // first dimension: len(head), second dimension; nb sites in the underlying alignment
+}
+
+// NewCountProfileFromAlignment initializes a new CountProfile using an input alignment
+func NewCountProfileFromAlignment(al Alignment) (p *CountProfile) {
+	p = NewCountProfile()
+
+	p.header = make([]rune, 0, 100)
+	p.counts = make([][]int, 0, 100)
+
+	al.IterateChar(func(name string, seq []rune) bool {
+		for i, r := range seq {
+			idx := p.names[int(r)]
+			if idx < 0 {
+				idx = len(p.header)
+				p.names[int(r)] = idx
+				p.header = append(p.header, r)
+				p.counts = append(p.counts, make([]int, al.Length()))
+			}
+			p.counts[idx][i]++
+		}
+		return false
+	})
+	return
+}
+
+// NewCountProfile initializes a new Profile with nil attributes
+func NewCountProfile() (p *CountProfile) {
+	p = &CountProfile{
+		names:  make([]int, 130),
+		header: nil,
+		counts: nil,
+	}
+
+	// Initialize to "not exit" all characters
+	for i := 0; i < 130; i++ {
+		p.names[i] = -1
+	}
+
+	return
+}
+
+// CountsAt returns the counts for all sites, for the ith character
+// (arbitrary order of character)
+func (p *CountProfile) CountsAt(i int) (counts []int, err error) {
+	if i > len(p.counts) || i < 0 {
+		err = fmt.Errorf("No counts for character at index %d", i)
+		return
+	}
+	counts = p.counts[i]
+	return
+}
+
+// NameAt returns the name of ith character in the header
+func (p *CountProfile) NameAt(i int) (name rune, err error) {
+	if i >= len(p.header) || i < 0 {
+		err = fmt.Errorf("No character at index %d", i)
+		return
+	}
+	name = p.header[i]
+	return
+}
+
+// NameIndex returns the index of the given character in the header
+// If the character does not exist, returns false
+func (p *CountProfile) NameIndex(r rune) (index int, ok bool) {
+	index = p.names[int(r)]
+	ok = (index >= 0)
+	return
+}
+
+// Count returns the number of occurences of the character r at the position site
+func (p *CountProfile) Count(r rune, site int) (count int, err error) {
+	var index int
+	count = 0
+	if index = p.names[int(r)]; index < 0 {
+		err = fmt.Errorf("Character does not exist in the Profile %c", r)
+		return
+	}
+	if site < 0 || site >= len(p.counts[index]) {
+		err = fmt.Errorf("Site %d is outside the profile", site)
+		return
+	}
+	count = p.counts[index][site]
+	return
+}
+
+// CountAt returns the number of occurences of the ith character at the position site
+func (p *CountProfile) CountAt(i, site int) (count int, err error) {
+	if i >= len(p.header) || i < 0 {
+		err = fmt.Errorf("No character at index %d", i)
+		return
+	}
+	if site < 0 || site >= len(p.counts[i]) {
+		err = fmt.Errorf("Site %d is outside the profile", site)
+		return
+	}
+	count = p.counts[i][site]
+	return
+}
+
+// NbCharacters returns the number of different characters in the profile
+func (p *CountProfile) NbCharacters() (nb int) {
+	return len(p.header)
+}
+
+// CheckLength returns true if the number of sites of the profile corresponds to the given length
+// false otherwise.
+func (p *CountProfile) CheckLength(length int) bool {
+	for _, v := range p.counts {
+		if len(v) != length {
+			return false
+		}
+	}
+	return true
+}
+
+// SetHeader sets the Header and initializes the count structure
+func (p *CountProfile) SetHeader(header []rune) {
+	p.header = header
+	p.counts = make([][]int, len(p.header))
+	for i, r := range header {
+		p.names[int(r)] = i
+		p.counts[i] = make([]int, 0, 100)
+	}
+	return
+}
+
+// AppendCount appends a new site to the profile for the ith character, and
+// associates count to it
+func (p *CountProfile) AppendCount(i, count int) (err error) {
+	if i >= len(p.header) || i < 0 {
+		err = fmt.Errorf("No character at index %d", i)
+		return
+	}
+	p.counts[i] = append(p.counts[i], count)
+	return
+}
+
+func (p *CountProfile) Print() {
+	fmt.Print("site")
+	for _, n := range p.header {
+		fmt.Printf("\t%c", n)
+	}
+	fmt.Println()
+	i := 0
+	length := 0
+	for {
+		fmt.Printf("%d", i)
+		for _, c := range p.counts {
+			length = len(c)
+			fmt.Printf("\t%d", c[i])
+		}
+		fmt.Println()
+		i++
+		if i >= length {
+			break
+		}
+	}
+}
diff --git a/align/seqbag.go b/align/seqbag.go
index dc4a55e..69288b2 100644
--- a/align/seqbag.go
+++ b/align/seqbag.go
@@ -26,6 +26,7 @@ type SeqBag interface {
 	AlphabetCharToIndex(c rune) int // Returns index of the character (nt or aa) in the AlphabetCharacters() array
 	AutoAlphabet()                  // detects and sets alphabet automatically for all the sequences
 	CharStats() map[rune]int64
+	UniqueCharacters() []rune
 	CharStatsSeq(idx int) (map[rune]int, error)     // Computes frequency of characters for the given sequence
 	CleanNames(namemap map[string]string)           // Clean sequence names (newick special char)
 	Clear()                                         // Removes all sequences
@@ -37,6 +38,7 @@ type SeqBag interface {
 	GetSequenceChar(name string) ([]rune, bool)
 	GetSequenceCharById(ith int) ([]rune, bool)
 	GetSequenceNameById(ith int) (string, bool)
+	GetSequenceByName(name string) (Sequence, bool)
 	SetSequenceChar(ithAlign, ithSite int, char rune) error
 	IgnoreIdentical(bool)                // if true, then it won't add the sequence if a sequence with the same name AND same sequence exists
 	SampleSeqBag(nb int) (SeqBag, error) // generate a sub sample of the sequences
@@ -303,6 +305,15 @@ func (sb *seqbag) GetSequence(name string) (string, bool) {
 	return "", ok
 }
 
+// If sequence exists in alignment, return true,sequence
+// Otherwise, return false,nil
+func (sb *seqbag) GetSequenceByName(name string) (Sequence, bool) {
+	if seq, ok := sb.seqmap[name]; ok {
+		return seq, true
+	}
+	return nil, false
+}
+
 // If sequence exists in alignment, return sequence,true
 // Otherwise, return "",false
 func (sb *seqbag) GetSequenceById(ith int) (string, bool) {
@@ -492,16 +503,43 @@ func (sb *seqbag) AutoAlphabet() {
 }
 
 // Returns the distribution of all characters
-func (sb *seqbag) CharStats() map[rune]int64 {
-	outmap := make(map[rune]int64)
+func (sb *seqbag) CharStats() (chars map[rune]int64) {
+	chars = make(map[rune]int64)
+	present := make([]int, 130)
 
 	for _, seq := range sb.seqs {
 		for _, r := range seq.sequence {
-			outmap[unicode.ToUpper(r)]++
+			present[unicode.ToUpper(r)]++
+		}
+	}
+
+	for i, r := range present {
+		if r > 0 {
+			chars[rune(i)] = int64(r)
+		}
+	}
+
+	return
+}
+
+// Returns the distribution of all characters
+func (sb *seqbag) UniqueCharacters() (chars []rune) {
+	chars = make([]rune, 0, 40)
+	present := make([]bool, 130)
+
+	for _, seq := range sb.seqs {
+		for _, r := range seq.sequence {
+			present[unicode.ToUpper(r)] = true
+		}
+	}
+
+	for i, r := range present {
+		if r {
+			chars = append(chars, rune(i))
 		}
 	}
 
-	return outmap
+	return
 }
 
 // CharStatsSeq Returns the frequency of all characters in the sequence
diff --git a/align/seqbag_test.go b/align/seqbag_test.go
new file mode 100644
index 0000000..cb5432b
--- /dev/null
+++ b/align/seqbag_test.go
@@ -0,0 +1,40 @@
+package align
+
+import (
+	"reflect"
+	"testing"
+)
+
+func Test_seqbag_UniqueCharacters(t *testing.T) {
+	type fields struct {
+		seqmap          map[string]*seq
+		seqs            []*seq
+		ignoreidentical bool
+		alphabet        int
+	}
+	tests := []struct {
+		name      string
+		fields    fields
+		wantChars []rune
+	}{
+		{name: "t1",
+			fields: fields{seqmap: nil,
+				seqs: []*seq{
+					&seq{sequence: []rune("ACGTACGTACGT")},
+					&seq{sequence: []rune("ACGTAC*TACGT")}}},
+			wantChars: []rune("*ACGT")},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			sb := &seqbag{
+				seqmap:          tt.fields.seqmap,
+				seqs:            tt.fields.seqs,
+				ignoreidentical: tt.fields.ignoreidentical,
+				alphabet:        tt.fields.alphabet,
+			}
+			if gotChars := sb.UniqueCharacters(); !reflect.DeepEqual(gotChars, tt.wantChars) {
+				t.Errorf("seqbag.UniqueCharacters() = %v, want %v", gotChars, tt.wantChars)
+			}
+		})
+	}
+}
diff --git a/align/sequence.go b/align/sequence.go
index 71031a7..941deac 100644
--- a/align/sequence.go
+++ b/align/sequence.go
@@ -29,6 +29,12 @@ type Sequence interface {
 	NumGapsOpenning() int                                   // Number of Gaps opennin, it counts streches of gap only once
 	NumGapsFromStart() int                                  // Number of Gaps from Start (until a non gap is encountered)
 	NumGapsFromEnd() int                                    // Number of Gaps from End (until a non gap is encountered)
+	// returns the number of differences between the reference sequence and each sequence of the alignment
+	// If lengths are different, returns an error
+	// It does not take into account 'N' and '-' in sequences as mutations compared to ref
+	/// sequence (ref sequence can have a '-' or a 'N')
+	NumMutationsComparedToReferenceSequence(alphabet int, seq Sequence) (nummutations int, err error)
+
 	Clone() Sequence
 }
 
@@ -250,6 +256,53 @@ func (s *seq) NumGapsFromEnd() (numgaps int) {
 	return
 }
 
+// returns the number of differences between the reference sequence and each sequence of the alignment
+// Counts only non GAPS and non N sites in each sequences (may be a gap or a N in the reference sequence though)
+// If a character is ambigous (IUPAC notation), then it is counted as a mutation only if it is incompatible with
+// the reference character.
+//
+// If lengths are different, returns an error
+func (s *seq) NumMutationsComparedToReferenceSequence(alphabet int, refseq Sequence) (nummutations int, err error) {
+	var refseqCode []uint8
+	var nt uint8
+
+	if refseq.Length() != s.Length() {
+		err = fmt.Errorf("Reference sequence and sequence do not have same length (%d,%d), cannot compute a number of mutation", refseq.Length(), s.Length())
+		return
+	}
+
+	all := '.'
+	if alphabet == NUCLEOTIDS {
+		refseqCode = make([]uint8, s.Length())
+		for i := 0; i < s.Length(); i++ {
+			if refseqCode[i], err = Nt2IndexIUPAC(refseq.SequenceChar()[i]); err != nil {
+				return
+			}
+		}
+		all = ALL_NUCLE
+	} else {
+		all = ALL_AMINO
+	}
+
+	for i := 0; i < s.Length(); i++ {
+		eq := true
+		if alphabet == NUCLEOTIDS {
+			if nt, err = Nt2IndexIUPAC(s.sequence[i]); err != nil {
+				return
+			}
+			if eq, err = EqualOrCompatible(nt, refseqCode[i]); err != nil {
+				return
+			}
+		} else {
+			eq = (s.sequence[i] == refseq.SequenceChar()[i])
+		}
+		if s.SequenceChar()[i] != GAP && s.SequenceChar()[i] != all && !eq {
+			nummutations++
+		}
+	}
+	return
+}
+
 // Translates the given sequence start at nucleotide index "phase"
 //
 // If the sequence is not nucleotidic, then throws an error
@@ -321,6 +374,7 @@ func (s *seq) Clone() Sequence {
 // The 3 nucleotites in arguments are converted to upper case and U converted to T.
 // If one character does not correspond to a known nucleotide in IUPAC code, then
 // Returns an empty slice.
+// If one of the nucleotides is a GAP, then returns an empty slice.
 //
 // For example GenAllPossibleCodons('A','G','N') should return
 // {"AGA","AGC","AGG","AGT"}.
diff --git a/align/sequence_test.go b/align/sequence_test.go
index 4f3c3b6..ad65587 100644
--- a/align/sequence_test.go
+++ b/align/sequence_test.go
@@ -53,7 +53,8 @@ func Test_seq_Translate(t *testing.T) {
 		wantErr bool
 	}{
 		{name: "Seq1", fields: fields{name: "seq1", sequence: []rune{'G', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'R', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'D', 'K', 'Y', 'H', 'X', '*'}}, wantErr: false},
-		{name: "Seq1", fields: fields{name: "seq1", sequence: []rune{'G', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'A', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'D', 'K', 'Y', 'H', 'N', '*'}}, wantErr: false},
+		{name: "Seq2", fields: fields{name: "seq1", sequence: []rune{'G', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'A', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'D', 'K', 'Y', 'H', 'N', '*'}}, wantErr: false},
+		{name: "Seq3", fields: fields{name: "seq1", sequence: []rune{'-', 'A', 'Y', 'A', 'A', 'R', 'U', 'A', 'Y', 'C', 'A', 'Y', 'A', 'A', 'Y', 'U', 'A', 'G'}}, args: args{phase: 0, geneticcode: 0}, wantTr: &seq{name: "seq1", sequence: []rune{'X', 'K', 'Y', 'H', 'N', '*'}}, wantErr: false},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
diff --git a/cmd/rarefy.go b/cmd/rarefy.go
index 803f11a..d2adea5 100644
--- a/cmd/rarefy.go
+++ b/cmd/rarefy.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/evolbioinfo/goalign/align"
 	"github.com/evolbioinfo/goalign/io"
+	"github.com/evolbioinfo/goalign/io/utils"
 	"github.com/spf13/cobra"
 )
 
@@ -138,7 +139,7 @@ func parseCountFile(file string) (counts map[string]int, err error) {
 	} else {
 		r = bufio.NewReader(f)
 	}
-	l, e := Readln(r)
+	l, e := utils.Readln(r)
 	for e == nil {
 		cols := strings.Split(l, "\t")
 		if cols == nil || len(cols) != 2 {
@@ -149,7 +150,7 @@ func parseCountFile(file string) (counts map[string]int, err error) {
 			return
 		}
 		counts[cols[0]] = c
-		l, e = Readln(r)
+		l, e = utils.Readln(r)
 	}
 
 	return
diff --git a/cmd/root.go b/cmd/root.go
index 245a642..0e07023 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -288,23 +288,6 @@ func openWriteFile(file string) (f *os.File, err error) {
 	return
 }
 
-// Readln returns a single line (without the ending \n)
-// from the input buffered reader.
-// An error is returned iff there is an error with the
-// buffered reader.
-func Readln(r *bufio.Reader) (string, error) {
-	var (
-		isPrefix bool  = true
-		err      error = nil
-		line, ln []byte
-	)
-	for isPrefix && err == nil {
-		line, isPrefix, err = r.ReadLine()
-		ln = append(ln, line...)
-	}
-	return string(ln), err
-}
-
 func readMapFile(file string, revert bool) (map[string]string, error) {
 	outmap := make(map[string]string, 0)
 	var mapfile *os.File
@@ -324,7 +307,7 @@ func readMapFile(file string, revert bool) (map[string]string, error) {
 	} else {
 		reader = bufio.NewReader(mapfile)
 	}
-	line, e := Readln(reader)
+	line, e := utils.Readln(reader)
 	nl := 1
 	for e == nil {
 		cols := strings.Split(line, "\t")
@@ -336,7 +319,7 @@ func readMapFile(file string, revert bool) (map[string]string, error) {
 		} else {
 			outmap[cols[0]] = cols[1]
 		}
-		line, e = Readln(reader)
+		line, e = utils.Readln(reader)
 		nl++
 	}
 
diff --git a/cmd/stats.go b/cmd/stats.go
index 7ba479d..0ac6617 100644
--- a/cmd/stats.go
+++ b/cmd/stats.go
@@ -7,11 +7,13 @@ import (
 
 	"github.com/evolbioinfo/goalign/align"
 	"github.com/evolbioinfo/goalign/io"
+	"github.com/evolbioinfo/goalign/io/countprofile"
 	"github.com/spf13/cobra"
 )
 
 var statpersequences bool
 var statrefsequence string
+var statcountprofile string
 
 // statsCmd represents the stats command
 var statsCmd = &cobra.Command{
@@ -33,12 +35,19 @@ If --per-sequences is given, then it will print the following stats, for each se
 2. Number of consecutive gaps at the beginning of the sequence;
 3. Number of consecutive gaps at the end of the sequence;
 4. Number of gaps unique to the sequence (present in no other sequence);
+	If --profile is given along, then the output will be : unique\tnew\tboth, with:
+	- 4a unique: # gaps that are unique in each sequence in the alignment
+	- 4b new: # gaps that are new in each sequence compared to the profile
+	- 4c both: # gaps that are unique in each sequence in the alignment and that are new compared the profile
 5. Number of gap opennings (streches of gaps are counted once);
 6. Number of Unique mutations;
+	If --profile is given along, then the output will be : unique\tnew\tboth, with:
+	- 6a unique: # mutations that are unique in each sequence in the alignment
+	- 6b new: # mutations that are new in each sequence compared to the profile
+	- 6c both: # mutations that are unique in each sequence in the alignment and that are new compared the profile
 7. Number of mutations compared to a reference sequence (given with --ref-sequence, otherwise, no column);
 8. Length of the sequence without gaps;
 9..n Number of occurence of each character (A,C,G, etc.).
-
 `,
 	RunE: func(cmd *cobra.Command, args []string) (err error) {
 		var aligns *align.AlignChannel
@@ -57,9 +66,16 @@ If --per-sequences is given, then it will print the following stats, for each se
 				fmt.Fprintf(os.Stdout, "alphabet\t%s\n", al.AlphabetStr())
 			} else {
 				var refseq align.Sequence
+				var profile *align.CountProfile
 				var s string
 				var ok bool
 				var sb align.SeqBag
+				if statcountprofile != "none" {
+					if profile, err = countprofile.FromFile(statcountprofile); err != nil {
+						io.LogError(err)
+						return
+					}
+				}
 				if statrefsequence != "none" {
 					// We try to get the sequence from its name in the alignment
 					if s, ok = al.GetSequence(statrefsequence); !ok {
@@ -77,7 +93,7 @@ If --per-sequences is given, then it will print the following stats, for each se
 					}
 					refseq = align.NewSequence("ref", []rune(s), "")
 				}
-				err = printAllSequenceStats(al, refseq)
+				err = printAllSequenceStats(al, refseq, profile)
 			}
 		}
 
@@ -116,39 +132,42 @@ func printCharStats(align align.Alignment, only string) {
 	}
 }
 
-func printSiteCharStats(align align.Alignment, only string) (err error) {
-	var sitemap map[rune]int
+func printSiteCharStats(al align.Alignment, only string) (err error) {
+	var profile *align.CountProfile
+	var ok bool
+	var indexonly int
 
-	charmap := align.CharStats()
-
-	// We add the only character we want to output
-	// To write 0 if there are no occurences of it
-	// in the alignment
-	if _, ok := charmap[rune(only[0])]; !ok && only != "*" {
-		charmap[rune(only[0])] = 0
+	profile = align.NewCountProfileFromAlignment(al)
+	onlyr := []rune(only)
+	if len(onlyr) > 1 {
+		err = fmt.Errorf("Character should have length 1: %s", only)
 	}
 
-	keys := make([]string, 0, len(charmap))
-	for k := range charmap {
-		keys = append(keys, string(k))
-	}
-	sort.Strings(keys)
 	fmt.Fprintf(os.Stdout, "site")
-	for _, v := range keys {
-		if only == "*" || v == only {
-			fmt.Fprintf(os.Stdout, "\t%s", v)
+	if only == "*" {
+		indexonly = -1
+	} else {
+		if indexonly, ok = profile.NameIndex(onlyr[0]); !ok {
+			for site := 0; site < al.Length(); site++ {
+				fmt.Fprintf(os.Stdout, "%d0%d\n", site, 0)
+			}
+			return
 		}
 	}
-	fmt.Fprintf(os.Stdout, "\n")
-	for site := 0; site < align.Length(); site++ {
-		if sitemap, err = align.CharStatsSite(site); err != nil {
-			return
+
+	for index := 0; index < profile.NbCharacters(); index++ {
+		r, _ := profile.NameAt(index)
+		if only == "*" || r == onlyr[0] {
+			fmt.Fprintf(os.Stdout, "\t%c", r)
 		}
+	}
+	fmt.Fprintf(os.Stdout, "\n")
+	for site := 0; site < al.Length(); site++ {
 		fmt.Fprintf(os.Stdout, "%d", site)
-		for _, k := range keys {
-			if only == "*" || k == only {
-				nb := sitemap[rune(k[0])]
-				fmt.Fprintf(os.Stdout, "\t%d", nb)
+		for index := 0; index < profile.NbCharacters(); index++ {
+			if indexonly == -1 || index == indexonly {
+				count, _ := profile.CountAt(index, site)
+				fmt.Fprintf(os.Stdout, "\t%d", count)
 			}
 		}
 		fmt.Fprintf(os.Stdout, "\n")
@@ -197,39 +216,53 @@ func printSequenceCharStats(sb align.SeqBag, only string) (err error) {
 	return
 }
 
-func printAllSequenceStats(al align.Alignment, refSequence align.Sequence) (err error) {
+func printAllSequenceStats(al align.Alignment, refSequence align.Sequence, countProfile *align.CountProfile) (err error) {
 	var sequencemap map[rune]int
-	var numgapsunique, nummutuniques, nummutations []int
+
+	var numnewgaps []int // new gaps that are not found in the profile
+	var numnewmuts []int // new mutations that are not found in the profile
+
+	var nummutuniques []int  // mutations that are unique in the given alignment
+	var numgapsuniques []int // gaps that are unique in the given alignment
+
+	var nummutsboth []int // mutations that are unique in the given alignment and not found in the profile
+	var numgapsboth []int // gaps that are unique in the given alignment and not found in the profile
+
+	var nummutations int
 	var gaps int
 	var name string
+	var uniquechars []rune
 
-	charmap := al.CharStats()
-	numgapsunique = al.NumGapsUniquePerSequence()
-	nummutuniques = al.NumMutationsUniquePerSequence()
-	if refSequence != nil {
-		if nummutations, err = al.NumMutationsComparedToReferenceSequence(refSequence); err != nil {
-			io.LogError(err)
-			return
-		}
+	uniquechars = al.UniqueCharacters()
+
+	if numgapsuniques, numnewgaps, numgapsboth, err = al.NumGapsUniquePerSequence(countProfile); err != nil {
+		return
 	}
-	keys := make([]string, 0, len(charmap))
-	for k := range charmap {
-		keys = append(keys, string(k))
+	if nummutuniques, numnewmuts, nummutsboth, err = al.NumMutationsUniquePerSequence(countProfile); err != nil {
+		return
 	}
-	sort.Strings(keys)
+
 	fmt.Fprintf(os.Stdout, "sequence")
 	fmt.Fprintf(os.Stdout, "\tgaps")
 	fmt.Fprintf(os.Stdout, "\tgapsstart")
 	fmt.Fprintf(os.Stdout, "\tgapsend")
 	fmt.Fprintf(os.Stdout, "\tgapsuniques")
+	if countProfile != nil {
+		fmt.Fprintf(os.Stdout, "\tgapsnew")
+		fmt.Fprintf(os.Stdout, "\tgapsboth")
+	}
 	fmt.Fprintf(os.Stdout, "\tgapsopenning")
 	fmt.Fprintf(os.Stdout, "\tmutuniques")
+	if countProfile != nil {
+		fmt.Fprintf(os.Stdout, "\tmutsnew")
+		fmt.Fprintf(os.Stdout, "\tmutsboth")
+	}
 	if refSequence != nil {
 		fmt.Fprintf(os.Stdout, "\tmutref")
 	}
 	fmt.Fprintf(os.Stdout, "\tlength")
-	for _, v := range keys {
-		fmt.Fprintf(os.Stdout, "\t%s", v)
+	for _, v := range uniquechars {
+		fmt.Fprintf(os.Stdout, "\t%c", v)
 	}
 
 	fmt.Fprintf(os.Stdout, "\n")
@@ -243,15 +276,27 @@ func printAllSequenceStats(al align.Alignment, refSequence align.Sequence) (err
 		fmt.Printf("\t%d", gaps)
 		fmt.Printf("\t%d", s.NumGapsFromStart())
 		fmt.Printf("\t%d", s.NumGapsFromEnd())
-		fmt.Printf("\t%d", numgapsunique[i])
+		fmt.Printf("\t%d", numgapsuniques[i])
+		if countProfile != nil {
+			fmt.Printf("\t%d", numnewgaps[i])
+			fmt.Printf("\t%d", numgapsboth[i])
+		}
 		fmt.Printf("\t%d", s.NumGapsOpenning())
 		fmt.Printf("\t%d", nummutuniques[i])
+		if countProfile != nil {
+			fmt.Printf("\t%d", numnewmuts[i])
+			fmt.Printf("\t%d", nummutsboth[i])
+		}
 		if refSequence != nil {
-			fmt.Printf("\t%d", nummutations[i])
+			if nummutations, err = s.NumMutationsComparedToReferenceSequence(al.Alphabet(), refSequence); err != nil {
+				io.LogError(err)
+				return
+			}
+			fmt.Printf("\t%d", nummutations)
 		}
 		fmt.Printf("\t%d", s.Length()-gaps)
-		for _, k := range keys {
-			nb := sequencemap[rune(k[0])]
+		for _, k := range uniquechars {
+			nb := sequencemap[k]
 			fmt.Printf("\t%d", nb)
 		}
 		fmt.Printf("\n")
@@ -275,4 +320,5 @@ func init() {
 	RootCmd.AddCommand(statsCmd)
 	statsCmd.PersistentFlags().BoolVar(&statpersequences, "per-sequences", false, "Prints  statistics per alignment sequences")
 	statsCmd.PersistentFlags().StringVar(&statrefsequence, "ref-sequence", "none", "Reference sequence to compare each sequence with (only with --per-sequences")
+	statsCmd.PersistentFlags().StringVar(&statcountprofile, "count-profile", "none", "A profile to compare the alignment with, and to compute statistics faster (only with --per-sequences)")
 }
diff --git a/cmd/stats_gaps.go b/cmd/stats_gaps.go
index b6389ca..3f732a6 100644
--- a/cmd/stats_gaps.go
+++ b/cmd/stats_gaps.go
@@ -5,6 +5,7 @@ import (
 
 	"github.com/evolbioinfo/goalign/align"
 	"github.com/evolbioinfo/goalign/io"
+	"github.com/evolbioinfo/goalign/io/countprofile"
 	"github.com/spf13/cobra"
 )
 
@@ -12,6 +13,7 @@ var statGapsFromStart bool
 var statGapsFromEnd bool
 var statGapsUnique bool
 var statGapsOpenning bool
+var statGapsProfile string
 
 // charCmd represents the char command
 var statGapsCmd = &cobra.Command{
@@ -24,7 +26,11 @@ var statGapsCmd = &cobra.Command{
 	Following options are exclusive, and given in order of priority:
 	- If --from-start is specified, then counts only gaps at sequence starts;
 	- If --from-end is specified, then counts only gaps at sequence ends;
-	- If --unique is specified, then counts only gaps that are unique in their column
+	- If --unique is specified, then counts only gaps that are unique in their column. 
+	  If --profile is given along --unique, then the output will be : unique\tnew\tboth, with:
+		- unique: # gaps that are unique in each sequence in the alignment
+		- new: # gaps that are new in each sequence compared to the profile
+		- both: # gaps that are unique in each sequence in the alignment and that are new compared the profile
 	- If --openning is specified, then counts only gap openning (streches of gaps are counted once)
 	- Otherwise, counts total number of gaps
 	for the given sequence.
@@ -32,6 +38,7 @@ var statGapsCmd = &cobra.Command{
 `,
 	RunE: func(cmd *cobra.Command, args []string) (err error) {
 		var aligns *align.AlignChannel
+		var profile *align.CountProfile
 
 		if aligns, err = readalign(infile); err != nil {
 			io.LogError(err)
@@ -45,9 +52,22 @@ var statGapsCmd = &cobra.Command{
 			return
 		}
 
-		var numgapsunique []int
+		if statGapsProfile != "none" {
+			if profile, err = countprofile.FromFile(statGapsProfile); err != nil {
+				io.LogError(err)
+				return
+			}
+		}
+
+		var numnewgaps []int     // new gaps that are not found in the profile
+		var numgapsuniques []int // gaps that are unique in the given alignment
+		var numgapsboth []int    // gaps that are unique in the given alignment and not found in the profile
+
 		if statGapsUnique {
-			numgapsunique = al.NumGapsUniquePerSequence()
+			if numgapsuniques, numnewgaps, numgapsboth, err = al.NumGapsUniquePerSequence(profile); err != nil {
+				io.LogError(err)
+				return
+			}
 		}
 
 		for i, s := range al.Sequences() {
@@ -56,7 +76,11 @@ var statGapsCmd = &cobra.Command{
 			} else if statGapsFromEnd {
 				fmt.Printf("%s\t%d\n", s.Name(), s.NumGapsFromEnd())
 			} else if statGapsUnique {
-				fmt.Printf("%s\t%d\n", s.Name(), numgapsunique[i])
+				fmt.Printf("%s\t%d", s.Name(), numgapsuniques[i])
+				if statGapsProfile != "none" {
+					fmt.Printf("\t%d\t%d", numnewgaps[i], numgapsboth[i])
+				}
+				fmt.Printf("\n")
 			} else if statGapsOpenning {
 				fmt.Printf("%s\t%d\n", s.Name(), s.NumGapsOpenning())
 			} else {
@@ -73,6 +97,7 @@ func init() {
 	statGapsCmd.PersistentFlags().BoolVar(&statGapsFromEnd, "from-end", false, "Count gaps in each sequence from end of sequences (until a non gap character is encountered)")
 	statGapsCmd.PersistentFlags().BoolVar(&statGapsUnique, "unique", false, "Count, in each sequence, the number of gaps that are unique in a site")
 	statGapsCmd.PersistentFlags().BoolVar(&statGapsOpenning, "openning", false, "Count, in each sequence, the number of gaps openning (a strech of gaps is counted once)")
+	statGapsCmd.PersistentFlags().StringVar(&statGapsProfile, "count-profile", "none", "A profile to compare the alignment with, and to compute statistics faster (only with --unique)")
 
 	statsCmd.AddCommand(statGapsCmd)
 }
diff --git a/cmd/stats_mutations.go b/cmd/stats_mutations.go
index a9af8d3..e85be54 100644
--- a/cmd/stats_mutations.go
+++ b/cmd/stats_mutations.go
@@ -5,11 +5,13 @@ import (
 
 	"github.com/evolbioinfo/goalign/align"
 	"github.com/evolbioinfo/goalign/io"
+	"github.com/evolbioinfo/goalign/io/countprofile"
 	"github.com/spf13/cobra"
 )
 
 var statMutationsRef string
 var statMutationsUnique bool
+var statMutationsProfile string
 
 // charCmd represents the char command
 var statMutationsCmd = &cobra.Command{
@@ -19,6 +21,10 @@ var statMutationsCmd = &cobra.Command{
 
 	- If --unique is specified, then counts only mutations (characters) that are unique in their column
 	for the given sequence.
+	If --profile is given along --unique, then the output will be : unique\tnew\tboth, with:
+		- unique: # mutations that are unique in each sequence in the alignment
+		- new: # mutations that are new in each sequence compared to the profile
+		- both: # mutations that are unique in each sequence in the alignment and that are new compared the profile
 	- If --ref-sequence is specified, it will try to extract a seqsuence having that name from the alignment. If none exist, 
 	it will try to open a fasta file with the given name to take the first sequence as a reference. If a character is ambigous 
 	(IUPAC notation) in an nucleotide sequence, then it is counted as a mutation only if it is incompatible with the reference character.
@@ -29,6 +35,7 @@ var statMutationsCmd = &cobra.Command{
 `,
 	RunE: func(cmd *cobra.Command, args []string) (err error) {
 		var aligns *align.AlignChannel
+		var profile *align.CountProfile
 
 		if aligns, err = readalign(infile); err != nil {
 			io.LogError(err)
@@ -42,7 +49,18 @@ var statMutationsCmd = &cobra.Command{
 			return
 		}
 
+		if statMutationsProfile != "none" {
+			if profile, err = countprofile.FromFile(statMutationsProfile); err != nil {
+				io.LogError(err)
+				return
+			}
+		}
+
 		var nummutations []int
+		var numnewmuts []int  // new mutations that are not found in the profile
+		var nummutsboth []int // mutations that are unique in the given alignment and not found in the profile
+		var num int
+
 		if statMutationsRef != "none" {
 			var s string
 			var sb align.SeqBag
@@ -61,20 +79,29 @@ var statMutationsCmd = &cobra.Command{
 				}
 				s, _ = sb.GetSequenceById(0)
 			}
-			if nummutations, err = al.NumMutationsComparedToReferenceSequence(align.NewSequence("ref", []rune(s), "")); err != nil {
-				io.LogError(err)
-				return
+			for _, s2 := range al.Sequences() {
+				if num, err = s2.NumMutationsComparedToReferenceSequence(al.Alphabet(), align.NewSequence("ref", []rune(s), "")); err != nil {
+					io.LogError(err)
+					return
+				}
+				fmt.Printf("%s\t%d\n", s2.Name(), num)
 			}
+
 		} else if statMutationsUnique {
-			nummutations = al.NumMutationsUniquePerSequence()
+			nummutations, numnewmuts, nummutsboth, err = al.NumMutationsUniquePerSequence(profile)
+			for i, s := range al.Sequences() {
+				fmt.Printf("%s\t%d", s.Name(), nummutations[i])
+				if profile != nil {
+					fmt.Printf("\t%d", numnewmuts[i])
+					fmt.Printf("\t%d", nummutsboth[i])
+				}
+				fmt.Printf("\n")
+			}
 		} else {
 			err = fmt.Errorf("Mutations should be counted by comparing to a reference sequnce with --ref-sequence")
 			io.LogError(err)
 			return
 		}
-		for i, s := range al.Sequences() {
-			fmt.Printf("%s\t%d\n", s.Name(), nummutations[i])
-		}
 
 		return
 	},
@@ -83,6 +110,7 @@ var statMutationsCmd = &cobra.Command{
 func init() {
 	statMutationsCmd.PersistentFlags().StringVar(&statMutationsRef, "ref-sequence", "none", "Reference sequence to compare each sequence with.")
 	statMutationsCmd.PersistentFlags().BoolVar(&statMutationsUnique, "unique", false, "Count, in each sequence, the number of mutations/characters that are unique in a site")
+	statMutationsCmd.PersistentFlags().StringVar(&statMutationsProfile, "count-profile", "none", "A profile to compare the alignment with, and to compute statistics faster (only with --unique)")
 
 	statsCmd.AddCommand(statMutationsCmd)
 }
diff --git a/cmd/subset.go b/cmd/subset.go
index 1a8fe50..f107faf 100644
--- a/cmd/subset.go
+++ b/cmd/subset.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/evolbioinfo/goalign/align"
 	"github.com/evolbioinfo/goalign/io"
+	"github.com/evolbioinfo/goalign/io/utils"
 	"github.com/spf13/cobra"
 )
 
@@ -176,12 +177,12 @@ func parseNameFile(file string) (subset map[string]int, err error) {
 		r = bufio.NewReader(f)
 	}
 
-	l, e := Readln(r)
+	l, e := utils.Readln(r)
 	for e == nil {
 		for _, name := range strings.Split(l, ",") {
 			subset[name] = 1
 		}
-		l, e = Readln(r)
+		l, e = utils.Readln(r)
 	}
 	return
 }
diff --git a/docs/commands/stats.md b/docs/commands/stats.md
index e25d3c7..4f84297 100644
--- a/docs/commands/stats.md
+++ b/docs/commands/stats.md
@@ -29,10 +29,20 @@ Different sub-commands:
 * `goalign stats alleles`: Prints the average number of alleles per site of the alignment;
 * `goalign stats alphabet`: Prints the alphabet of the alignemnts (aminoacids, nucleotides, unknown);
 * `goalign stats char`: Prints the character number of occurences. If `--per-sequences` is given, then prints the number of occurences of each characters for each seqences. If `--per-sites` is given, then prints the number of occurences of each characters for each sites. Is is possible to give `--only` option, to count the number of occurences of a single character.
-* `goalign stats gaps`: Prints the number of gaps in each sequences (and possibly the number of gaps from start, and from end); By default, it prints, for each alignment sequence the number of gaps. Following options are exclusive, and given in order of priority: If `--from-start` is specified, then counts only gaps at sequence starts; If `--from-end` is specified, then counts only gaps at sequence ends; If `--unique` is specified, then counts only gaps that are unique in their column;	If` --openning` is specified, then counts only gap openning (streches of gaps are counted once); Otherwise, counts total number of gaps on each sequence.
+* `goalign stats gaps`: Prints the number of gaps in each sequences (and possibly the number of gaps from start, and from end); By default, it prints, for each alignment sequence the number of gaps. Following options are exclusive, and given in order of priority: If `--from-start` is specified, then counts only gaps at sequence starts; If `--from-end` is specified, then counts only gaps at sequence ends; If `--unique` is specified, then counts only gaps that are unique in their alignmebnnt columnIf` --openning` is specified, then counts only gap openning (streches of gaps are counted once); Otherwise, counts total number of gaps on each sequence. If `--profile` is given in addition to `--unique`, then the output will be : `unique\tnew\tboth`, with:
+
+  - unique: # gaps that are unique in their column, for each sequence of the alignment
+  - new: # gaps that are new in each sequence compared to the profile
+  - both: # gaps that are unique in each sequence in the alignment and that are new compared the profile.
+
 * `goalign stats length`: Prints alignment length;
 * `goalign stats maxchar`: Prints max occurence char for each alignment site;
-* `goalign stats mutations`: Prints, for each sequence, the number of mutations on each alignment sequence, compared to a reference sequence or unique compared to all other sequences. It does not take into account '-' and 'N' as unique mutations, and does not take into account '-' and 'N' as mutations compared to a reference sequence; 	If `--unique` is specified, then counts only mutations (characters) that are unique in their column for the given sequence.	If `--ref-sequence` is specified, it will try to extract a sequence having that name from the alignment. If none exist, it will try to open a fasta file with the given name to take the first sequence as a reference. If a character is ambigous (IUPAC notation) in an nucleotide sequence, then it is counted as a mutation only if it is incompatible with the reference character.
+* `goalign stats mutations`: Prints, for each sequence, the number of mutations on each alignment sequence, compared to a reference sequence or unique compared to all other sequences. It does not take into account '-' and 'N' as unique mutations, and does not take into account '-' and 'N' as mutations compared to a reference sequence; 	If `--unique` is specified, then counts only mutations (characters) that are unique in their column for the given sequence.	If `--ref-sequence` is specified, it will try to extract a sequence having that name from the alignment. If none exist, it will try to open a fasta file with the given name to take the first sequence as a reference. If a character is ambigous (IUPAC notation) in an nucleotide sequence, then it is counted as a mutation only if it is incompatible with the reference character. If `--profile` is given in addition to `--unique`, then the output will be : `unique\tnew\tboth`, with:
+
+  - unique: # mutations that are unique in their column, for each sequence of the alignment
+  - new: # mutations that are new in each sequence compared to the profile
+  - both: # mutations that are unique in each sequence in the alignment and that are new compared the profile.
+
 * `goalign stats nalign`: Prints the number of alignments in the input file (Phylip);
 * `goalign stats nseq`: Prints the number of sequences in the input alignment;
 * `goalign stats taxa`: Lists taxa in the input alignment.
diff --git a/io/countprofile/countprofile.go b/io/countprofile/countprofile.go
new file mode 100644
index 0000000..aaf3b16
--- /dev/null
+++ b/io/countprofile/countprofile.go
@@ -0,0 +1,82 @@
+package countprofile
+
+import (
+	"bufio"
+	"compress/gzip"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+
+	"github.com/evolbioinfo/goalign/align"
+	"github.com/evolbioinfo/goalign/io/utils"
+)
+
+// FromFile Parses a "profile" file constisting of a number of occurences of each character
+// per site, tab separated, in the form:
+// site - A B C D G H K M N R...
+// 0 1 2 3 4 0 ...
+func FromFile(file string) (p *align.CountProfile, err error) {
+	var f *os.File
+	var r *bufio.Reader
+	var gr *gzip.Reader
+	var l string
+	var i int
+	var field string
+
+	p = align.NewCountProfile()
+
+	if file == "stdin" || file == "-" {
+		f = os.Stdin
+	} else {
+		if f, err = os.Open(file); err != nil {
+			return
+		}
+	}
+
+	if strings.HasSuffix(file, ".gz") {
+		if gr, err = gzip.NewReader(f); err != nil {
+			return
+		}
+		r = bufio.NewReader(gr)
+	} else {
+		r = bufio.NewReader(f)
+	}
+
+	// We parse the header
+	if l, err = utils.Readln(r); err != nil {
+		return
+	}
+	headslice := strings.Split(l, "\t")
+	header := make([]rune, 0, len(headslice)-1)
+	for i, field = range headslice {
+		if i > 0 {
+			r := []rune(field)
+			if len(r) != 1 {
+				err = fmt.Errorf("Character name Should be One character")
+				return
+			}
+			header = append(header, r[0])
+		}
+	}
+	p.SetHeader(header)
+
+	// Then the counts
+	var count int = 0
+	l, err = utils.Readln(r)
+	for err == nil {
+		for i, field = range strings.Split(l, "\t") {
+			if i > 0 {
+				if count, err = strconv.Atoi(field); err != nil {
+					return
+				}
+				p.AppendCount(i-1, count)
+			}
+		}
+		l, err = utils.Readln(r)
+	}
+	if err.Error() == "EOF" {
+		err = nil
+	}
+	return
+}
diff --git a/io/utils/readfiles.go b/io/utils/readfiles.go
index e58ea5f..1035cbc 100644
--- a/io/utils/readfiles.go
+++ b/io/utils/readfiles.go
@@ -77,3 +77,20 @@ func isHttpFile(file string) bool {
 	return strings.HasPrefix(file, "http://") ||
 		strings.HasPrefix(file, "https://")
 }
+
+// Readln returns a single line (without the ending \n)
+// from the input buffered reader.
+// An error is returned iff there is an error with the
+// buffered reader.
+func Readln(r *bufio.Reader) (string, error) {
+	var (
+		isPrefix bool  = true
+		err      error = nil
+		line, ln []byte
+	)
+	for isPrefix && err == nil {
+		line, isPrefix, err = r.ReadLine()
+		ln = append(ln, line...)
+	}
+	return string(ln), err
+}
diff --git a/test.sh b/test.sh
index e1fbdd7..9ae9c58 100755
--- a/test.sh
+++ b/test.sh
@@ -3990,3 +3990,45 @@ ${GOALIGN} stats char --per-sequences -i input > result
 diff -q -b expected result
 rm -f input expected result 
 
+
+
+echo "->goalign stats char --per-sequences --count-profile"
+cat > input <<EOF
+>s1
+ACGTACGT
+>s2
+-CGTACGT
+>s3
+ACGTACGT
+>s4
+ACGTTCGA
+>s5
+ACGTTCGA
+EOF
+
+cat > prof <<EOF
+site	-	T	C	G	A
+0	1	10	9	0	0
+1	0	0	0	10	10
+2	0	10	10	0	0
+3	10	0	0	0	10
+4	0	10	0	10	0
+5	10	10	0	0	0
+6	0	0	0	10	10
+7	10	0	0	0	10
+EOF
+
+cat > expected <<EOF
+sequence	gaps	gapsstart	gapsend	gapsuniques	gapsnew	gapsboth	gapsopenning	mutuniques	mutsnew	mutsboth	length	-	A	C	G	T
+s1	0	0	0	0	0	0	0	0	7	0	8	0	2	2	2	2
+s2	1	1	0	1	0	0	1	0	6	0	7	1	1	2	2	2
+s3	0	0	0	0	0	0	0	0	7	0	8	0	2	2	2	2
+s4	0	0	0	0	0	0	0	0	5	0	8	0	2	2	2	2
+s5	0	0	0	0	0	0	0	0	5	0	8	0	2	2	2	2
+EOF
+
+${GOALIGN} stats --per-sequences -i input --count-profile prof  > result
+diff -q -b expected result
+rm -f input expected result 
+
+