Skip to content

Commit

Permalink
Added option --nb-sequences to goalign divide command
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed May 4, 2021
1 parent 891777d commit 87ff7b0
Show file tree
Hide file tree
Showing 3 changed files with 453 additions and 33 deletions.
134 changes: 115 additions & 19 deletions cmd/divide.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,24 @@ import (

var divideOutput string
var divideoutputFasta bool
var divideNbSeqs int
var divideUnaligned bool

// divideCmd represents the divide command
var divideCmd = &cobra.Command{
Use: "divide",
Short: "Divide an input alignment in several output files",
Long: `Divide an input alignment in several output files
The default behavior is to take an input alignment file containing
potentially several alignments (e.g. with Phylip format ), and output
one alignment per output file.
If the alignment is in fasta format : will create 1 file
Otherwise, will create one file per alignment in the input file
Otherwise, will create one output file per input alignment.
if the option --nb-sequences <n> is given, then outputs n sequences
per output file.
-o : is the prefix of output files
if -o div, it will create files div_0.ph...div_n.ph
Expand All @@ -33,44 +42,131 @@ gotree divide -i align.ph -p -o out
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var aligns *align.AlignChannel
var tmpAlign align.Alignment
var tmpSeqs align.SeqBag

var f *os.File

if aligns, err = readalign(infile); err != nil {
io.LogError(err)
return
i := 0

ext := alignExtension()
if divideoutputFasta {
ext = ".fa"
}

i := 0
for al := range aligns.Achan {
if divideoutputFasta {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d.fa", divideOutput, i)); err != nil {
if divideUnaligned {
var seqs align.SeqBag
if seqs, err = readsequences(infile); err != nil {
io.LogError(err)
return
}

if divideNbSeqs == 0 {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil {
io.LogError(err)
return
}
writeAlignFasta(al, f)
writeSequences(seqs, f)
f.Close()
i++
} else {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d.ph", divideOutput, i)); err != nil {
io.LogError(err)
return
tmpSeqs = align.NewSeqBag(seqs.Alphabet())
nb := 0
seqs.IterateAll(func(name string, sequence []uint8, comment string) bool {
tmpSeqs.AddSequenceChar(name, sequence, comment)
nb++
if divideNbSeqs > 0 && nb%divideNbSeqs == 0 {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil {
io.LogError(err)
return true
}
writeSequences(tmpSeqs, f)
f.Close()
tmpSeqs = align.NewSeqBag(seqs.Alphabet())
i++
}
return false
})
if nb%divideNbSeqs > 0 {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil {
io.LogError(err)
}
writeSequences(tmpSeqs, f)
f.Close()
i++
}
}
} else {
if aligns, err = readalign(infile); err != nil {
io.LogError(err)
return
}

for al := range aligns.Achan {
if divideNbSeqs == 0 {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil {
io.LogError(err)
return
}
if divideoutputFasta {
writeAlignFasta(al, f)
} else {
writeAlign(al, f)
}
f.Close()
i++
} else {
tmpAlign = align.NewAlign(al.Alphabet())
nb := 0
al.IterateAll(func(name string, sequence []uint8, comment string) bool {
tmpAlign.AddSequenceChar(name, sequence, comment)
nb++
if nb%divideNbSeqs == 0 {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil {
io.LogError(err)
return true
}
if divideoutputFasta {
writeAlignFasta(tmpAlign, f)
} else {
writeAlign(tmpAlign, f)
}
f.Close()
i++
tmpAlign = align.NewAlign(al.Alphabet())
}
return false
})
if nb%divideNbSeqs > 0 {
if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil {
io.LogError(err)
}
if divideoutputFasta {
writeAlignFasta(tmpAlign, f)
} else {
writeAlign(tmpAlign, f)
}
f.Close()
i++
}
}
writeAlignPhylip(al, f)
f.Close()
}
i++
if aligns.Err != nil {
err = aligns.Err
io.LogError(err)
}
}

if aligns.Err != nil {
err = aligns.Err
io.LogError(err)
}
return
},
}

// init attaches divideCmd to RootCmd and declares the persistent flags of the
// divide command, binding each one to its package-level variable.
func init() {
RootCmd.AddCommand(divideCmd)
// -o/--output: prefix of the generated files (default "prefix").
divideCmd.PersistentFlags().StringVarP(&divideOutput, "output", "o", "prefix", "Divided alignment output files prefix")
// NOTE(review): --out-fasta/-f is registered twice on the same flag set (this
// line and the last registration below). This listing looks like a rendered
// diff in which this line is the removed version; confirm that only one
// registration exists in the real file, since a redefined flag is normally
// rejected by the flag library.
divideCmd.PersistentFlags().BoolVarP(&divideoutputFasta, "out-fasta", "f", false, "Output files in fasta format")
// --nb-sequences: number of sequences per output file; the default 0 keeps
// every sequence of an alignment in a single file.
divideCmd.PersistentFlags().IntVar(&divideNbSeqs, "nb-sequences", 0, "Number of sequences per output file (<=0 : all sequences, >0: each alignment will be additionnaly split in several alignments)")
// --unaligned: read the input as unaligned fasta sequences (default false).
divideCmd.PersistentFlags().BoolVar(&divideUnaligned, "unaligned", false, "Considers sequences as unaligned and format fasta (phylip, nexus,... options are ignored)")
// -f/--out-fasta: write fasta output instead of the input format (default false).
divideCmd.PersistentFlags().BoolVarP(&divideoutputFasta, "out-fasta", "f", false, "Forces output files to be in fasta format")

}
116 changes: 102 additions & 14 deletions docs/commands/divide.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,121 @@
## Commands

### divide
This command divides an input alignment file (phylip format) containing several alignments, in multiple files containing one alignment. If alignment is in Fasta, will create only one file. Option `-o` indicates prefix of output files.
The default behavior is to take an input alignment file containing potentially several alignments (e.g. with Phylip format), and output one alignment per output file.

If the alignment is in Fasta format, a single output file is created. Otherwise, one output file is created per input alignment.

If the option `--nb-sequences <n>` is given, each input alignment is additionally split so that every output file contains at most n sequences.

`-o` is the prefix of output files

Ex: if `-o div`, it will create files div_000.ph...div_n.ph

Output files will be in the same format as input files, or in fasta if `-f` is given.

Example:

goalign divide -i align.ph -p -o out


#### Usage
```
Usage:
goalign divide [flags]
Flags:
-f, --out-fasta Output files in fasta format (default, same as input)
-o, --output string Divided alignment output files prefix (default "prefix")
-h, --help help for divide
--nb-sequences int Number of sequences per output file (<=0 : all sequences, >0: each alignment will be additionnaly split in several alignments)
-f, --out-fasta Forces output files to be in fasta format
-o, --output string Divided alignment output files prefix (default "prefix")
--unaligned Considers sequences as unaligned and format fasta (phylip, nexus,... options are ignored)
Global Flags:
-i, --align string Alignment input file (default "stdin")
-p, --phylip Alignment is in phylip? False=Fasta
--input-strict Strict phylip input format (only used with -p)
--output-strict Strict phylip output format (only used with -p)
-i, --align string Alignment input file (default "stdin")
--auto-detect Auto detects input format (overrides -p, -x and -u)
-u, --clustal Alignment is in clustal? default fasta
--ignore-identical int Ignore duplicated sequences that have the same name and potentially have same sequences, 0 : Does not ignore anything, 1: Ignore sequences having the same name (keep the first one whatever their sequence), 2: Ignore sequences having the same name and the same sequence
--input-strict Strict phylip input format (only used with -p)
-x, --nexus Alignment is in nexus? default fasta
--no-block Write Phylip sequences without space separated blocks (only used with -p)
--one-line Write Phylip sequences on 1 line (only used with -p)
--output-strict Strict phylip output format (only used with -p)
-p, --phylip Alignment is in phylip? default fasta
```

#### Examples

* Generating a random tree with 5 tips ([Gotree](https://github.com/evolbioinfo/gotree)), simulating 3 alignments from this tree ([seq-gen](https://github.com/rambaut/Seq-Gen)), and writing them in 3 independent fasta files:
Input file, input.phylip:
```
gotree generate yuletree -l 5 --seed 1 -o true_tree.nw
seq-gen -op -mGTR -l500 -z 2 -n 3 true_tree.nw | goalign divide -p -f -o align
11 10
Seq0000 GATTAATTTG
Seq0001 CCGTAGGCCA
Seq0002 GAATCTGAAG
Seq0003 ATCGAACACT
Seq0004 TTAAGTTTTC
Seq0005 ACTTCTAATG
Seq0006 GAGAGGACTA
Seq0007 GTTCATACTT
Seq0008 TTTAAACACT
Seq0009 TTTACATCGA
Seq0010 TGTCGGACCT
3 10
Seq0000 GATTAATTTG
Seq0001 CCGTAGGCCA
Seq0002 GAATCTGAAG
```

There should be three files in the current directory:
* `align_000.fa`
* `align_001.fa`
* `align_002.fa`
* `goalign divide -i input.phylip -p -o divprefix -f --nb-sequences 2` will generate the following files:

* `divprefix_000.fa`
```
>Seq0000
GATTAATTTG
>Seq0001
CCGTAGGCCA
```
* `divprefix_001.fa`
```
>Seq0002
GAATCTGAAG
>Seq0003
ATCGAACACT
```
* `divprefix_002.fa`
```
>Seq0004
TTAAGTTTTC
>Seq0005
ACTTCTAATG
```
* `divprefix_003.fa`
```
>Seq0006
GAGAGGACTA
>Seq0007
GTTCATACTT
```
* `divprefix_004.fa`
```
>Seq0008
TTTAAACACT
>Seq0009
TTTACATCGA
```
* `divprefix_005.fa`
```
>Seq0010
TGTCGGACCT
```
* `divprefix_006.fa`
```
>Seq0000
GATTAATTTG
>Seq0001
CCGTAGGCCA
```
* `divprefix_007.fa`
```
>Seq0002
GAATCTGAAG
```
Loading

0 comments on commit 87ff7b0

Please sign in to comment.