Added support of Stockholm format #15

evolbioinfo · Sep 17, 2024 · 05d8e38 · 05d8e38
1 parent 27d6175
commit 05d8e38
Show file tree

Hide file tree

Showing 7 changed files with 527 additions and 4 deletions.
diff --git a/align/const.go b/align/const.go
@@ -24,10 +24,11 @@ const (
 	PSSM_NORM_UNIF = 3 // Normalization by uniform frequency
 	PSSM_NORM_LOGO = 4 // Normalization like LOGO : v(site)=freq*(log2(alphabet)-H(site)-pseudocount
 
-	FORMAT_FASTA   = 0
-	FORMAT_PHYLIP  = 1
-	FORMAT_NEXUS   = 2
-	FORMAT_CLUSTAL = 3
+	FORMAT_FASTA     = 0
+	FORMAT_PHYLIP    = 1
+	FORMAT_NEXUS     = 2
+	FORMAT_CLUSTAL   = 3
+	FORMAT_STOCKHOLM = 4
 
 	POSITION_IDENTICAL      = 0 // All characters in a position are the same
 	POSITION_CONSERVED      = 1 // Same strong group

diff --git a/cmd/root.go b/cmd/root.go
@@ -20,6 +20,7 @@ import (
 	"github.com/evolbioinfo/goalign/io/paml"
 	"github.com/evolbioinfo/goalign/io/partition"
 	"github.com/evolbioinfo/goalign/io/phylip"
+	"github.com/evolbioinfo/goalign/io/stockholm"
 	"github.com/evolbioinfo/goalign/io/utils"
 	"github.com/evolbioinfo/goalign/version"
 	"github.com/fredericlemoine/cobrashell"
@@ -30,6 +31,7 @@ var infile string
 var rootphylip bool
 var rootnexus bool
 var rootclustal bool
+var rootstockholm bool
 var rootcpus int
 var rootinputstrict bool = false
 var rootoutputstrict bool = false
@@ -151,6 +153,8 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
 			rootnexus = true
 		} else if format == align.FORMAT_CLUSTAL {
 			rootclustal = true
+		} else if format == align.FORMAT_STOCKHOLM {
+			rootstockholm = true
 		}
 	} else {
 		if rootphylip {
@@ -186,6 +190,18 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
 			alchan.Achan <- al
 			fi.Close()
 			close(alchan.Achan)
+		} else if rootstockholm {
+			var al align.Alignment
+			cp := stockholm.NewParser(r)
+			cp.Alphabet(alphabet)
+			cp.IgnoreIdentical(ignoreidentical)
+			if al, err = cp.Parse(); err != nil {
+				return
+			}
+			alchan.Achan = make(chan align.Alignment, 1)
+			alchan.Achan <- al
+			fi.Close()
+			close(alchan.Achan)
 		} else {
 			var al align.Alignment
 			fp := fasta.NewParser(r)
@@ -219,6 +235,7 @@ func init() {
 	RootCmd.PersistentFlags().BoolVarP(&rootphylip, "phylip", "p", false, "Alignment is in phylip? default fasta")
 	RootCmd.PersistentFlags().BoolVarP(&rootnexus, "nexus", "x", false, "Alignment is in nexus? default fasta")
 	RootCmd.PersistentFlags().BoolVarP(&rootclustal, "clustal", "u", false, "Alignment is in clustal? default fasta")
+	RootCmd.PersistentFlags().BoolVarP(&rootstockholm, "stockholm", "s", false, "Alignment is in stockholm? default fasta")
 	RootCmd.PersistentFlags().IntVarP(&rootcpus, "threads", "t", 1, "Number of threads")
 
 	// If ignore is IGNORE_NONE: Does not ignore anything
@@ -251,6 +268,8 @@ func writeAlign(al align.Alignment, f utils.StringWriterCloser) {
 		f.WriteString(nexus.WriteAlignment(al))
 	} else if rootclustal {
 		f.WriteString(clustal.WriteAlignment(al))
+	} else if rootstockholm {
+		f.WriteString(stockholm.WriteAlignment(al))
 	} else {
 		f.WriteString(fasta.WriteAlignment(al))
 	}
@@ -276,6 +295,8 @@ func alignExtension() (out string) {
 		out = ".nx"
 	} else if rootclustal {
 		out = ".clustal"
+	} else if rootstockholm {
+		out = ".sto"
 	} else {
 		out = ".fa"
 	}

diff --git a/io/stockholm/stockholm_lexer.go b/io/stockholm/stockholm_lexer.go
@@ -0,0 +1,128 @@
+package stockholm
+
+import (
+	"bufio"
+	"bytes"
+	"io"
+	"strconv"
+	"strings"
+
+	aio "github.com/evolbioinfo/goalign/io"
+)
+
+// Scanner is the lexical scanner of this package: it wraps the buffered rune reader that Scan turns into tokens.
+type Scanner struct {
+	r *bufio.Reader
+}
+
+// NewScanner builds a Scanner that reads from r through a bufio.Reader.
+func NewScanner(r io.Reader) *Scanner {
+	sc := new(Scanner)
+	sc.r = bufio.NewReader(r)
+	return sc
+}
+
+// read returns the next rune from the buffered reader.
+// Any read failure, io.EOF included, is reported as the eof rune.
+func (s *Scanner) read() rune {
+	if ch, _, err := s.r.ReadRune(); err == nil {
+		return ch
+	}
+	return eof
+}
+
+// unread pushes the most recently read rune back onto the reader so that the next read returns it again.
+func (s *Scanner) unread() {
+	_ = s.r.UnreadRune() // the error is explicitly discarded: when no rune can be pushed back the call is a no-op
+}
+
+// Scan returns the next token and its literal value.
+//
+// The rune classes are tested in this order: a run of whitespace yields WS
+// (the literal is the whole run); an end of line ("\n", "\r\n" or a lone
+// "\r") yields ENDOFLINE with an empty literal; end of input yields EOF;
+// '#' yields MARKUP; anything else is handed to scanIdent.
+func (s *Scanner) Scan() (tok Token, lit string) {
+	ch := s.read()
+
+	// Whitespace: push the rune back and let scanWhitespace consume the
+	// whole contiguous run.
+	if isWhitespace(ch) {
+		s.unread()
+		return s.scanWhitespace()
+	}
+
+	if isEndOfLine(ch) {
+		if isCR(ch) {
+			// Look one rune ahead to tell "\r\n" apart from a lone "\r".
+			if next := s.read(); !isNL(next) {
+				aio.PrintMessage("\\r without \\n detected...")
+				// The look-ahead rune belongs to the next token: push it
+				// back so that the next call classifies it properly
+				// (it may be '#', whitespace or another end of line)
+				// instead of forcing it into an identifier. When the
+				// read failed there is nothing to push back.
+				if next != eof {
+					s.unread()
+				}
+			}
+		}
+		return ENDOFLINE, ""
+	}
+
+	switch ch {
+	case eof:
+		return EOF, ""
+	case '#':
+		return MARKUP, string(ch)
+	}
+
+	// Any other rune starts an identifier, a number or a reserved word.
+	s.unread()
+	return s.scanIdent()
+}
+
+// scanWhitespace consumes the current rune and every whitespace rune that
+// directly follows it, and returns the whole run as a single WS token.
+func (s *Scanner) scanWhitespace() (tok Token, lit string) {
+	var run bytes.Buffer
+
+	// The caller has already established that the current rune is whitespace.
+	run.WriteRune(s.read())
+
+	// Keep reading until EOF or until a non-whitespace rune shows up; that
+	// rune is pushed back for the next token.
+	for next := s.read(); next != eof; next = s.read() {
+		if !isWhitespace(next) {
+			s.unread()
+			break
+		}
+		run.WriteRune(next)
+	}
+
+	return WS, run.String()
+}
+
+// scanIdent consumes the current rune plus every ident rune that directly
+// follows it, then classifies the resulting literal: text that parses as a
+// base-10 int64 is NUMERIC, "stockholm" in any letter case is STOCKHOLM,
+// "//" is END and everything else is IDENT.
+func (s *Scanner) scanIdent() (tok Token, lit string) {
+	var word bytes.Buffer
+
+	// The first rune is taken unconditionally.
+	word.WriteRune(s.read())
+
+	// Stop at EOF or at the first non-ident rune, which is pushed back.
+	for next := s.read(); next != eof; next = s.read() {
+		if !isIdent(next) {
+			s.unread()
+			break
+		}
+		word.WriteRune(next)
+	}
+
+	lit = word.String()
+	if _, err := strconv.ParseInt(lit, 10, 64); err == nil {
+		return NUMERIC, lit
+	}
+
+	switch strings.ToUpper(lit) {
+	case "STOCKHOLM":
+		return STOCKHOLM, lit
+	case "//":
+		return END, lit
+	}
+	return IDENT, lit
+}
diff --git a/io/stockholm/stockholm_parser.go b/io/stockholm/stockholm_parser.go
@@ -0,0 +1,158 @@
+package stockholm
+
+import (
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/evolbioinfo/goalign/align"
+)
+
+// Parser parses Stockholm formatted alignments from the tokens produced by a Scanner.
+// The ignoreidentical field is handed to the alignment by Parse (al.IgnoreIdentical). Documented modes:
+// align.IGNORE_NONE ignores nothing; align.IGNORE_NAME ignores sequences having the same name (the first one is kept,
+// whatever their sequence); align.IGNORE_SEQUENCE ignores sequences having both the same name and the same sequence.
+// NOTE(review): any other value is said to fall back to IGNORE_NONE; this file does not do it, confirm in package align.
+type Parser struct {
+	s               *Scanner
+	ignoreidentical int
+	alphabet        int // can be align.BOTH, align.AMINOACIDS or align.NUCLEOTIDS
+	buf             struct {
+		tok Token  // last token returned by the scanner
+		lit string // literal of that last token
+		n   int    // number of buffered tokens waiting to be re-read (max=1); scan resets it to 0
+	}
+}
+
+// NewParser builds a Parser reading from r, with the defaults
+// align.IGNORE_NONE (duplicate handling) and align.BOTH (alphabet).
+func NewParser(r io.Reader) *Parser {
+	p := &Parser{s: NewScanner(r)}
+	p.ignoreidentical = align.IGNORE_NONE
+	p.alphabet = align.BOTH
+	return p
+}
+
+// IgnoreIdentical sets how duplicate sequences are treated; ignore is expected to be one of align.IGNORE_NONE,
+// align.IGNORE_NAME or align.IGNORE_SEQUENCE. The value is stored without validation; the parser is returned for chaining.
+func (p *Parser) IgnoreIdentical(ignore int) *Parser {
+	p.ignoreidentical = ignore
+	return p
+}
+
+// Alphabet selects the alphabet of the parsed alignment and returns the
+// parser for chaining. Accepted values are align.BOTH (auto detect the
+// alphabet), align.NUCLEOTIDS (consider the alignment as nucleotides) and
+// align.AMINOACIDS (consider it as amino acids); any other value is replaced
+// by align.BOTH. With an explicit alphabet, parsing can return an error if
+// the alignment is not compatible with it.
+func (p *Parser) Alphabet(alphabet int) *Parser {
+	known := alphabet == align.BOTH ||
+		alphabet == align.NUCLEOTIDS ||
+		alphabet == align.AMINOACIDS
+	if !known {
+		alphabet = align.BOTH
+	}
+	p.alphabet = alphabet
+	return p
+}
+
+// scan returns the next token: the buffered one when a token is pending in
+// p.buf, otherwise a fresh one from the underlying scanner.
+func (p *Parser) scan() (tok Token, lit string) {
+	if p.buf.n == 0 {
+		// Nothing pending: read from the scanner and remember the token
+		// in case it has to be handed out again later.
+		tok, lit = p.s.Scan()
+		p.buf.tok, p.buf.lit = tok, lit
+		return tok, lit
+	}
+
+	// A token is pending: consume it from the buffer.
+	p.buf.n = 0
+	return p.buf.tok, p.buf.lit
+}
+
+// scanIgnoreWhitespace returns the next token, skipping one WS token if
+// that is what comes first.
+func (p *Parser) scanIgnoreWhitespace() (tok Token, lit string) {
+	if tok, lit = p.scan(); tok != WS {
+		return tok, lit
+	}
+	return p.scan()
+}
+
+// Parse reads one Stockholm alignment from the underlying scanner.
+//
+// The input must start with the "# STOCKHOLM 1.0" header. After it, empty
+// lines and markup lines (those starting with '#') are skipped, and every
+// "name sequence" pair is added to the alignment with '.' rewritten to
+// align.GAP. Parsing stops at the "//" terminator or at the end of input.
+//
+// An error is returned when the header is missing or malformed, when an
+// illegal token is found, when a name is not followed by a sequence, when
+// AddSequence or SetAlphabet fails, or when al.Length() is 0 after parsing.
+func (p *Parser) Parse() (al align.Alignment, err error) {
+	gap := '.'
+
+	// The first three tokens must spell "# STOCKHOLM 1.0".
+	tok, lit := p.scanIgnoreWhitespace()
+	if tok != MARKUP {
+		err = fmt.Errorf("found %q, expected # STOCKHOLM 1.0", lit)
+		return
+	}
+	tok, lit = p.scanIgnoreWhitespace()
+	if tok != STOCKHOLM {
+		err = fmt.Errorf("found %q, expected # STOCKHOLM 1.0", lit)
+		return
+	}
+	_, lit = p.scanIgnoreWhitespace()
+	if lit != "1.0" {
+		err = fmt.Errorf("found %q, expected # STOCKHOLM 1.0", lit)
+		return
+	}
+
+	al = align.NewAlign(align.UNKNOWN)
+	al.IgnoreIdentical(p.ignoreidentical)
+
+	// Now we can parse the remainder of the file.
+	for {
+		tok, lit := p.scanIgnoreWhitespace()
+		if tok == ILLEGAL {
+			err = fmt.Errorf("found illegal token %q", lit)
+			return
+		}
+		if tok == EOF {
+			break
+		}
+		if tok == ENDOFLINE {
+			continue
+		}
+
+		if tok == MARKUP {
+			// Skip the rest of the markup line. EOF must end the skip as
+			// well: the scanner keeps returning EOF once the input is
+			// exhausted, so a last markup line without a trailing newline
+			// would otherwise loop forever.
+			for tok != ENDOFLINE && tok != EOF {
+				tok, _ = p.scanIgnoreWhitespace()
+			}
+			continue
+		}
+
+		if tok == END {
+			break
+		}
+
+		if tok == IDENT || tok == NUMERIC {
+			// A sequence line: the name, then the aligned sequence.
+			name := lit
+			tok, lit = p.scanIgnoreWhitespace()
+			if tok != IDENT {
+				err = fmt.Errorf("found illegal sequence %q", lit)
+				return
+			}
+			sequence := lit
+			sequence = strings.Replace(sequence, string(gap), string(align.GAP), -1)
+			if err = al.AddSequence(name, sequence, ""); err != nil {
+				return
+			}
+		}
+	}
+
+	if al.Length() == 0 {
+		err = fmt.Errorf("no sequence in this Stockholm file")
+		return
+	}
+
+	// If the configured alphabet is BOTH, let the alignment detect its own
+	// alphabet; otherwise force the requested one, which may fail.
+	if p.alphabet == align.BOTH {
+		al.AutoAlphabet()
+		return
+	}
+	err = al.SetAlphabet(p.alphabet)
+	return
+}