Skip to content

Commit

Permalink
Added support of Stockholm format #15
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Sep 17, 2024
1 parent 27d6175 commit 05d8e38
Show file tree
Hide file tree
Showing 7 changed files with 527 additions and 4 deletions.
9 changes: 5 additions & 4 deletions align/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@ const (
PSSM_NORM_UNIF = 3 // Normalization by uniform frequency
PSSM_NORM_LOGO = 4 // Normalization like LOGO : v(site)=freq*(log2(alphabet)-H(site)-pseudocount

FORMAT_FASTA = 0
FORMAT_PHYLIP = 1
FORMAT_NEXUS = 2
FORMAT_CLUSTAL = 3
FORMAT_FASTA = 0
FORMAT_PHYLIP = 1
FORMAT_NEXUS = 2
FORMAT_CLUSTAL = 3
FORMAT_STOCKHOLM = 4

POSITION_IDENTICAL = 0 // All characters in a position are the same
POSITION_CONSERVED = 1 // Same strong group
Expand Down
21 changes: 21 additions & 0 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/evolbioinfo/goalign/io/paml"
"github.com/evolbioinfo/goalign/io/partition"
"github.com/evolbioinfo/goalign/io/phylip"
"github.com/evolbioinfo/goalign/io/stockholm"
"github.com/evolbioinfo/goalign/io/utils"
"github.com/evolbioinfo/goalign/version"
"github.com/fredericlemoine/cobrashell"
Expand All @@ -30,6 +31,7 @@ var infile string
var rootphylip bool
var rootnexus bool
var rootclustal bool
var rootstockholm bool
var rootcpus int
var rootinputstrict bool = false
var rootoutputstrict bool = false
Expand Down Expand Up @@ -151,6 +153,8 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
rootnexus = true
} else if format == align.FORMAT_CLUSTAL {
rootclustal = true
} else if format == align.FORMAT_STOCKHOLM {
rootstockholm = true
}
} else {
if rootphylip {
Expand Down Expand Up @@ -186,6 +190,18 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
alchan.Achan <- al
fi.Close()
close(alchan.Achan)
} else if rootstockholm {
var al align.Alignment
cp := stockholm.NewParser(r)
cp.Alphabet(alphabet)
cp.IgnoreIdentical(ignoreidentical)
if al, err = cp.Parse(); err != nil {
return
}
alchan.Achan = make(chan align.Alignment, 1)
alchan.Achan <- al
fi.Close()
close(alchan.Achan)
} else {
var al align.Alignment
fp := fasta.NewParser(r)
Expand Down Expand Up @@ -219,6 +235,7 @@ func init() {
RootCmd.PersistentFlags().BoolVarP(&rootphylip, "phylip", "p", false, "Alignment is in phylip? default fasta")
RootCmd.PersistentFlags().BoolVarP(&rootnexus, "nexus", "x", false, "Alignment is in nexus? default fasta")
RootCmd.PersistentFlags().BoolVarP(&rootclustal, "clustal", "u", false, "Alignment is in clustal? default fasta")
RootCmd.PersistentFlags().BoolVarP(&rootstockholm, "stockholm", "s", false, "Alignment is in stockholm? default fasta")
RootCmd.PersistentFlags().IntVarP(&rootcpus, "threads", "t", 1, "Number of threads")

// If ignore is IGNORE_NONE: Does not ignore anything
Expand Down Expand Up @@ -251,6 +268,8 @@ func writeAlign(al align.Alignment, f utils.StringWriterCloser) {
f.WriteString(nexus.WriteAlignment(al))
} else if rootclustal {
f.WriteString(clustal.WriteAlignment(al))
} else if rootstockholm {
f.WriteString(stockholm.WriteAlignment(al))
} else {
f.WriteString(fasta.WriteAlignment(al))
}
Expand All @@ -276,6 +295,8 @@ func alignExtension() (out string) {
out = ".nx"
} else if rootclustal {
out = ".clustal"
} else if rootstockholm {
out = ".sto"
} else {
out = ".fa"
}
Expand Down
128 changes: 128 additions & 0 deletions io/stockholm/stockholm_lexer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package stockholm

import (
"bufio"
"bytes"
"io"
"strconv"
"strings"

aio "github.com/evolbioinfo/goalign/io"
)

// Scanner is the lexical scanner of the Stockholm reader: it turns
// a stream of runes into tokens.
type Scanner struct {
	r *bufio.Reader // buffered source the runes are read from
}

// NewScanner builds a Scanner that reads its runes from r through
// a bufio.Reader.
func NewScanner(r io.Reader) *Scanner {
	buffered := bufio.NewReader(r)
	return &Scanner{r: buffered}
}

// read returns the next rune of the buffered reader.
// Any read error, io.EOF included, is reported as the eof rune.
func (s *Scanner) read() rune {
	if ch, _, err := s.r.ReadRune(); err == nil {
		return ch
	}
	return eof
}

// unread pushes the most recently read rune back onto the reader so that
// the next call to read returns it again.
func (s *Scanner) unread() {
	// The error returned by UnreadRune is deliberately discarded.
	_ = s.r.UnreadRune()
}

// Scan returns the next token and its literal value.
//
// Depending on the first rune read, it returns:
//   - the result of scanWhitespace when the rune is whitespace;
//   - ENDOFLINE when the rune is an end-of-line rune. A carriage return
//     directly followed by a newline is consumed as a single line ending;
//     a carriage return followed by anything else is reported through
//     aio.PrintMessage and still treated as a line ending;
//   - EOF when the reader is exhausted;
//   - MARKUP (literal "#") when the rune is '#';
//   - otherwise the result of scanIdent.
func (s *Scanner) Scan() (tok Token, lit string) {
	// Read the next rune.
	ch := s.read()

	// Whitespace: hand the whole contiguous run to scanWhitespace.
	if isWhitespace(ch) {
		s.unread()
		return s.scanWhitespace()
	}

	if isEndOfLine(ch) {
		if isCR(ch) {
			// Look one rune ahead to recognise "\r\n".
			if next := s.read(); !isNL(next) {
				aio.PrintMessage("\\r without \\n detected...")
				// The look-ahead rune belongs to the next token: push it
				// back instead of dropping the line ending and feeding the
				// rune blindly to scanIdent. At end of input there is
				// nothing to push back and the failed unread is ignored.
				s.unread()
			}
		}
		return ENDOFLINE, ""
	}

	switch ch {
	case eof:
		return EOF, ""
	case '#':
		return MARKUP, string(ch)
	}

	// Anything else starts an identifier-like token.
	s.unread()
	return s.scanIdent()
}

// scanWhitespace reads the current rune plus every whitespace rune that
// directly follows it, and returns them as a single WS token.
func (s *Scanner) scanWhitespace() (tok Token, lit string) {
	var run bytes.Buffer

	// The caller has already established that the current rune is part
	// of the run, so it is stored unconditionally.
	run.WriteRune(s.read())

	// Keep accumulating until EOF or the first non-whitespace rune,
	// which is pushed back for the next token.
	for ch := s.read(); ch != eof; ch = s.read() {
		if !isWhitespace(ch) {
			s.unread()
			break
		}
		run.WriteRune(ch)
	}

	return WS, run.String()
}

// scanIdent reads the current rune plus every ident rune that directly
// follows it, then classifies the resulting text:
//   - NUMERIC if it parses as a base-10 64-bit integer;
//   - STOCKHOLM if it equals "STOCKHOLM" once upper-cased;
//   - END if it is "//";
//   - IDENT otherwise.
func (s *Scanner) scanIdent() (tok Token, lit string) {
	var word bytes.Buffer

	// The current rune is always part of the token.
	word.WriteRune(s.read())

	// Accumulate until EOF or the first non-ident rune, which is pushed
	// back for the next token.
	for ch := s.read(); ch != eof; ch = s.read() {
		if !isIdent(ch) {
			s.unread()
			break
		}
		_, _ = word.WriteRune(ch)
	}

	lit = word.String()

	if _, err := strconv.ParseInt(lit, 10, 64); err == nil {
		return NUMERIC, lit
	}

	switch strings.ToUpper(lit) {
	case "STOCKHOLM":
		return STOCKHOLM, lit
	case "//":
		return END, lit
	}
	return IDENT, lit
}
158 changes: 158 additions & 0 deletions io/stockholm/stockholm_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package stockholm

import (
"fmt"
"io"
"strings"

"github.com/evolbioinfo/goalign/align"
)

// Parser reads an alignment in Stockholm format.
//
// The ignoreidentical field selects how duplicated sequences are handled:
//   - align.IGNORE_NONE: nothing is ignored;
//   - align.IGNORE_NAME: sequences having the same name are ignored (the
//     first one is kept, whatever their sequence);
//   - align.IGNORE_SEQUENCE: sequences having the same name and the same
//     sequence are ignored.
type Parser struct {
	s               *Scanner
	ignoreidentical int
	alphabet        int // one of align.BOTH, align.AMINOACIDS, align.NUCLEOTIDS
	buf             struct {
		tok Token  // most recently scanned token
		lit string // literal of the most recently scanned token
		n   int    // number of buffered tokens (at most 1)
	}
}

// NewParser creates a Parser reading from r. The parser starts with
// align.IGNORE_NONE as duplicate policy and align.BOTH as alphabet.
func NewParser(r io.Reader) *Parser {
	p := &Parser{
		s:               NewScanner(r),
		ignoreidentical: align.IGNORE_NONE,
		alphabet:        align.BOTH,
	}
	return p
}

// IgnoreIdentical sets the duplicate-sequence policy that Parse hands to
// the alignment it builds (expected values: align.IGNORE_NONE,
// align.IGNORE_NAME or align.IGNORE_SEQUENCE). The value is stored as
// given, without validation. The parser itself is returned so that calls
// can be chained.
func (p *Parser) IgnoreIdentical(ignore int) *Parser {
	p.ignoreidentical = ignore
	return p
}

// Alphabet sets the alphabet used for the parsed alignment:
//   - align.BOTH: the alphabet is auto-detected;
//   - align.NUCLEOTIDS: the alignment is considered as nucleotides;
//   - align.AMINOACIDS: the alignment is considered as amino acids.
//
// When the alphabet is not auto-detected, parsing can return an error if
// the alignment is not compatible with it. Any other value is replaced by
// align.BOTH. The parser itself is returned so that calls can be chained.
func (p *Parser) Alphabet(alphabet int) *Parser {
	known := alphabet == align.BOTH ||
		alphabet == align.NUCLEOTIDS ||
		alphabet == align.AMINOACIDS
	if !known {
		alphabet = align.BOTH
	}
	p.alphabet = alphabet
	return p
}

// scan returns the next token and its literal. If a token is pending in
// the one-slot buffer it is returned and the slot is marked empty;
// otherwise a fresh token is read from the scanner and remembered in the
// buffer.
func (p *Parser) scan() (tok Token, lit string) {
	if p.buf.n == 0 {
		// Nothing pending: pull from the scanner and keep a copy.
		tok, lit = p.s.Scan()
		p.buf.tok, p.buf.lit = tok, lit
		return tok, lit
	}

	// Hand out the pending token.
	p.buf.n = 0
	return p.buf.tok, p.buf.lit
}

// scanIgnoreWhitespace returns the next token, skipping at most one
// leading WS token.
func (p *Parser) scanIgnoreWhitespace() (tok Token, lit string) {
	if tok, lit = p.scan(); tok != WS {
		return tok, lit
	}
	return p.scan()
}

// Parse reads one Stockholm alignment from the underlying scanner.
//
// The input must start with the "# STOCKHOLM 1.0" header. After the
// header, each line is handled according to its first token:
//   - MARKUP ('#'): the rest of the line is skipped;
//   - END ("//") or EOF: parsing stops;
//   - IDENT or NUMERIC: the token is the sequence name and the next token
//     is its sequence, in which every '.' is replaced by align.GAP before
//     the sequence is added to the alignment.
//
// An error is returned when the header is malformed, an ILLEGAL token is
// found, a name is not followed by an IDENT token, adding a sequence
// fails, no sequence was read, or the requested alphabet cannot be set.
// When an error is returned the alignment may be nil or only partially
// filled and must not be used.
func (p *Parser) Parse() (al align.Alignment, err error) {
	// Gap character of the Stockholm format, converted to align.GAP.
	gap := '.'

	// The file must start with the three tokens "#", "STOCKHOLM", "1.0".
	tok, lit := p.scanIgnoreWhitespace()
	if tok != MARKUP {
		err = fmt.Errorf("found %q, expected # STOCKHOLM 1.0", lit)
		return
	}
	tok, lit = p.scanIgnoreWhitespace()
	if tok != STOCKHOLM {
		err = fmt.Errorf("found %q, expected # STOCKHOLM 1.0", lit)
		return
	}
	_, lit = p.scanIgnoreWhitespace()
	if lit != "1.0" {
		err = fmt.Errorf("found %q, expected # STOCKHOLM 1.0", lit)
		return
	}

	al = align.NewAlign(align.UNKNOWN)
	al.IgnoreIdentical(p.ignoreidentical)

	// Now we can parse the remainder of the file, one line at a time.
	for {
		tok, lit := p.scanIgnoreWhitespace()
		if tok == ILLEGAL {
			err = fmt.Errorf("found illegal token %q", lit)
			return
		}
		if tok == EOF || tok == END {
			break
		}
		if tok == ENDOFLINE {
			continue
		}

		if tok == MARKUP {
			// Skip the rest of the markup line. EOF must end the loop
			// too: a file finishing on a markup line without a trailing
			// newline would otherwise spin forever, since the scanner
			// keeps returning EOF.
			for tok != ENDOFLINE && tok != EOF {
				tok, _ = p.scanIgnoreWhitespace()
			}
			continue
		}

		if tok == IDENT || tok == NUMERIC {
			name := lit
			tok, lit = p.scanIgnoreWhitespace()
			if tok != IDENT {
				err = fmt.Errorf("found illegal sequence %q", lit)
				return
			}
			sequence := strings.Replace(lit, string(gap), string(align.GAP), -1)
			if err = al.AddSequence(name, sequence, ""); err != nil {
				return
			}
		}
	}

	if al.Length() == 0 {
		err = fmt.Errorf("no sequence in this Stockholm file")
		return
	}

	// With align.BOTH the alignment detects its own alphabet; otherwise
	// the requested alphabet is applied, which may fail.
	if p.alphabet == align.BOTH {
		al.AutoAlphabet()
		return
	}
	err = al.SetAlphabet(p.alphabet)
	return
}
Loading

0 comments on commit 05d8e38

Please sign in to comment.