diff --git a/align/align.go b/align/align.go
index f644801..6f1d876 100644
--- a/align/align.go
+++ b/align/align.go
@@ -16,6 +16,9 @@ const (
 
 )
 
+var stdaminoacid = []rune{'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'}
+var stdnucleotides = []rune{'A', 'C', 'G', 'T'}
+
 type Alignment interface {
 	AddSequence(name string, sequence string, comment string) error
 	AddSequenceChar(name string, sequence []rune, comment string) error
@@ -34,6 +37,7 @@ type Alignment interface {
 	Recombine(rate float64, lenprop float64)
 	TrimNames(size int) (map[string]string, error)
 	TrimSequences(trimsize int, fromStart bool) error
+	AppendSeqIdentifier(identifier string, right bool)
 	CharStats() map[rune]int64
 	Alphabet() int
 }
@@ -306,6 +310,21 @@ func (a *align) TrimSequences(trimsize int, fromStart bool) error {
 	return nil
 }
 
+// AppendSeqIdentifier adds a string to all sequence names of the alignment.
+// If right is true, the string is appended to the right of each name,
+// otherwise it is added to the left. An empty identifier is a no-op.
+func (a *align) AppendSeqIdentifier(identifier string, right bool) {
+	if len(identifier) != 0 {
+		for _, seq := range a.seqs {
+			if right {
+				seq.name = seq.name + identifier
+			} else {
+				seq.name = identifier + seq.name
+			}
+		}
+	}
+}
+
 // Samples randomly a subset of the sequences
 // And returns this new alignment
 // If nb < 1 or nb > nbsequences returns nil and an error
@@ -388,3 +407,22 @@ func (a *align) CharStats() map[rune]int64 {
 func (a *align) Alphabet() int {
 	return a.alphabet
 }
+
+// RandomAlignment builds an alignment of nbseq random sequences of the given
+// length, drawn from the given alphabet. Sequences are named Seq0000, Seq0001, ...
+// It returns a nil alignment and an error if a sequence cannot be generated or added.
+func RandomAlignment(alphabet, length, nbseq int) (Alignment, error) {
+	al := NewAlign(alphabet)
+	for i := 0; i < nbseq; i++ {
+		name := fmt.Sprintf("Seq%04d", i)
+		seq, err := RandomSequence(alphabet, length)
+		if err != nil {
+			return nil, err
+		}
+		// AddSequenceChar reports an error: propagate it instead of dropping it.
+		if err := al.AddSequenceChar(name, seq, ""); err != nil {
+			return nil, err
+		}
+	}
+	return al, nil
+}
diff --git a/align/align_test.go b/align/align_test.go
new file mode 100644
index 0000000..28a6ec8
--- /dev/null
+++ b/align/align_test.go
@@ -0,0 +1,50 @@
+package align
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+)
+
+// TestRandomAlignment checks that a random alignment has the requested
+// length and number of sequences.
+func TestRandomAlignment(t *testing.T) {
+	length := 3000
+	nbseqs := 500
+	a, err := RandomAlignment(AMINOACIDS, length, nbseqs)
+	if err != nil {
+		// a is nil on error: stop here instead of dereferencing it below.
+		t.Fatal(err)
+	}
+
+	if a.Length() != length {
+		t.Error(fmt.Sprintf("Length should be %d and is %d", length, a.Length()))
+	}
+	if a.NbSequences() != nbseqs {
+		t.Error(fmt.Sprintf("Nb sequences should be %d and is %d", nbseqs, a.NbSequences()))
+	}
+}
+
+// TestAppendIdentifier checks that AppendSeqIdentifier adds the identifier
+// to the left, then to the right, of every sequence name.
+func TestAppendIdentifier(t *testing.T) {
+	a, err := RandomAlignment(AMINOACIDS, 300, 50)
+	if err != nil {
+		// a is nil on error: stop here instead of dereferencing it below.
+		t.Fatal(err)
+	}
+	a.AppendSeqIdentifier("IDENT", false)
+
+	a.IterateChar(func(name string, sequence []rune) {
+		if !strings.HasPrefix(name, "IDENT") {
+			t.Error("Sequence name does not start with expected id: IDENT")
+		}
+	})
+
+	a.AppendSeqIdentifier("IDENT", true)
+	a.IterateChar(func(name string, sequence []rune) {
+		if !strings.HasSuffix(name, "IDENT") {
+			t.Error("Sequence name does not end with expected id: IDENT")
+		}
+	})
+}
diff --git a/align/sequence.go b/align/sequence.go
index 777e112..3deaebd 100644
--- a/align/sequence.go
+++ b/align/sequence.go
@@ -1,5 +1,10 @@
 package align
 
+import (
+	"errors"
+	"math/rand"
+)
+
 type Sequence interface {
 	Sequence() string
 	SequenceChar() []rune
@@ -35,3 +40,24 @@ func (s *seq) Name() string {
 func (s *seq) Comment() string {
 	return s.comment
 }
+
+// RandomSequence generates a sequence of the given length whose characters
+// are drawn with math/rand from the standard amino acids (AMINOACIDS) or the
+// standard nucleotides (NUCLEOTIDS). Any other alphabet yields an error.
+func RandomSequence(alphabet, length int) ([]rune, error) {
+	// Pick the character set once rather than on every iteration.
+	var chars []rune
+	switch alphabet {
+	case AMINOACIDS:
+		chars = stdaminoacid
+	case NUCLEOTIDS:
+		chars = stdnucleotides
+	default:
+		return nil, errors.New("Unexpected sequence alphabet type")
+	}
+	res := make([]rune, length)
+	for i := range res {
+		res[i] = chars[rand.Intn(len(chars))]
+	}
+	return res, nil
+}
diff --git a/cmd/addid.go b/cmd/addid.go
new file mode 100644
index 0000000..30a8c04
--- /dev/null
+++ b/cmd/addid.go
@@ -0,0 +1,37 @@
+package cmd
+
+import (
+	"github.com/spf13/cobra"
+)
+
+// Flag values of the addid command
+var addIdOutput string
+var addIdName string
+var addIdRight bool
+
+// addidCmd represents the addid command
+var addidCmd = &cobra.Command{
+	Use:   "addid",
+	Short: "Adds a string to each sequence identifier of the input alignment",
+	Long: `Adds a string to each sequence identifier of the input alignment.
+
+The string may be added to the left or to the right of each sequence identifier.
+`,
+	Run: func(cmd *cobra.Command, args []string) {
+		f := openWriteFile(addIdOutput)
+		// Rename and write each alignment obtained from rootaligns
+		for al := range rootaligns {
+			al.AppendSeqIdentifier(addIdName, addIdRight)
+			writeAlign(al, f)
+		}
+		f.Close()
+	},
+}
+
+// init registers the addid command on RootCmd and declares its flags
+func init() {
+	RootCmd.AddCommand(addidCmd)
+	addidCmd.PersistentFlags().StringVarP(&addIdOutput, "out-align", "o", "stdout", "Renamed alignment output file")
+	addidCmd.PersistentFlags().StringVarP(&addIdName, "name", "n", "none", "String to add to sequence names")
+	addidCmd.PersistentFlags().BoolVarP(&addIdRight, "right", "r", false, "Adds the String on the right of sequence names (otherwise, adds to left)")
+}
diff --git a/cmd/random.go b/cmd/random.go
new file mode 100644
index 0000000..dba2bdd
--- /dev/null
+++ b/cmd/random.go
@@ -0,0 +1,54 @@
+package cmd
+
+import (
+	"math/rand"
+	"time"
+
+	"github.com/fredericlemoine/goalign/align"
+	"github.com/fredericlemoine/goalign/io"
+	"github.com/spf13/cobra"
+)
+
+// Flag values of the random command
+var randomLength, randomSize int
+var randomAA bool
+var randomOutput string
+var randomSeed int64
+
+// randomCmd represents the random command
+var randomCmd = &cobra.Command{
+	Use:   "random",
+	Short: "Generate random sequences",
+	Long: `Generate random sequences.
+`,
+	// NOTE(review): empty hook; presumably overrides a PersistentPreRun inherited
+	// from RootCmd (this command does not consume rootaligns) - confirm.
+	PersistentPreRun: func(cmd *cobra.Command, args []string) {
+	},
+	Run: func(cmd *cobra.Command, args []string) {
+		// Use the flags of this command (--seed, --out-align), not the
+		// variables bound to the flags of other commands of the package.
+		rand.Seed(randomSeed)
+		f := openWriteFile(randomOutput)
+		alphabet := align.NUCLEOTIDS
+		if randomAA {
+			alphabet = align.AMINOACIDS
+		}
+		a, err := align.RandomAlignment(alphabet, randomLength, randomSize)
+		if err != nil {
+			io.ExitWithMessage(err)
+		}
+		writeAlign(a, f)
+		f.Close()
+	},
+}
+
+// init registers the random command on RootCmd and declares its flags
+func init() {
+	RootCmd.AddCommand(randomCmd)
+	randomCmd.PersistentFlags().IntVarP(&randomLength, "length", "l", 100, "Length of sequences to generate")
+	randomCmd.PersistentFlags().IntVarP(&randomSize, "nb-seqs", "n", 10, "Number of sequences to generate")
+	randomCmd.PersistentFlags().BoolVarP(&randomAA, "amino-acids", "a", false, "Aminoacid sequences (otherwise, nucleotides)")
+	randomCmd.PersistentFlags().StringVarP(&randomOutput, "out-align", "o", "stdout", "Random alignment output file")
+	randomCmd.PersistentFlags().Int64VarP(&randomSeed, "seed", "s", time.Now().UTC().UnixNano(), "Initial Random Seed")
+}