diff --git a/cmd/divide.go b/cmd/divide.go index 94df020..dcea7e2 100644 --- a/cmd/divide.go +++ b/cmd/divide.go @@ -11,6 +11,8 @@ import ( var divideOutput string var divideoutputFasta bool +var divideNbSeqs int +var divideUnaligned bool // divideCmd represents the divide command var divideCmd = &cobra.Command{ @@ -18,8 +20,15 @@ var divideCmd = &cobra.Command{ Short: "Divide an input alignment in several output files", Long: `Divide an input alignment in several output files +The default behavior is to take an input alignment file containing +potentially several alignments (e.g. with Phylip format ), and output +one alignment per output file. + If the alignment is in fasta format : will create 1 file -Otherwise, will create one file per alignment in the input file +Otherwise, will create one output file per input alignment. + +if the option --nb-sequences is given, then outputs n sequences +per output file. -o : is the prefix of output files if -o div, it will create files div_0.ph...div_n.ph @@ -33,37 +42,122 @@ gotree divide -i align.ph -p -o out `, RunE: func(cmd *cobra.Command, args []string) (err error) { var aligns *align.AlignChannel + var tmpAlign align.Alignment + var tmpSeqs align.SeqBag + var f *os.File - if aligns, err = readalign(infile); err != nil { - io.LogError(err) - return + i := 0 + + ext := alignExtension() + if divideoutputFasta { + ext = ".fa" } - i := 0 - for al := range aligns.Achan { - if divideoutputFasta { - if f, err = openWriteFile(fmt.Sprintf("%s_%03d.fa", divideOutput, i)); err != nil { + if divideUnaligned { + var seqs align.SeqBag + if seqs, err = readsequences(infile); err != nil { + io.LogError(err) + return + } + + if divideNbSeqs == 0 { + if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil { io.LogError(err) return } - writeAlignFasta(al, f) + writeSequences(seqs, f) f.Close() + i++ } else { - if f, err = openWriteFile(fmt.Sprintf("%s_%03d.ph", divideOutput, i)); err != nil { - 
io.LogError(err) - return + tmpSeqs = align.NewSeqBag(seqs.Alphabet()) + nb := 0 + seqs.IterateAll(func(name string, sequence []uint8, comment string) bool { + tmpSeqs.AddSequenceChar(name, sequence, comment) + nb++ + if divideNbSeqs > 0 && nb%divideNbSeqs == 0 { + if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil { + io.LogError(err) + return true + } + writeSequences(tmpSeqs, f) + f.Close() + tmpSeqs = align.NewSeqBag(seqs.Alphabet()) + i++ + } + return false + }) + if nb%divideNbSeqs > 0 { + if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil { + io.LogError(err) + } + writeSequences(tmpSeqs, f) + f.Close() + i++ + } + } + } else { + if aligns, err = readalign(infile); err != nil { + io.LogError(err) + return + } + + for al := range aligns.Achan { + if divideNbSeqs == 0 { + if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil { + io.LogError(err) + return + } + if divideoutputFasta { + writeAlignFasta(al, f) + } else { + writeAlign(al, f) + } + f.Close() + i++ + } else { + tmpAlign = align.NewAlign(al.Alphabet()) + nb := 0 + al.IterateAll(func(name string, sequence []uint8, comment string) bool { + tmpAlign.AddSequenceChar(name, sequence, comment) + nb++ + if nb%divideNbSeqs == 0 { + if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil { + io.LogError(err) + return true + } + if divideoutputFasta { + writeAlignFasta(tmpAlign, f) + } else { + writeAlign(tmpAlign, f) + } + f.Close() + i++ + tmpAlign = align.NewAlign(al.Alphabet()) + } + return false + }) + if nb%divideNbSeqs > 0 { + if f, err = openWriteFile(fmt.Sprintf("%s_%03d%s", divideOutput, i, ext)); err != nil { + io.LogError(err) + } + if divideoutputFasta { + writeAlignFasta(tmpAlign, f) + } else { + writeAlign(tmpAlign, f) + } + f.Close() + i++ + } } - writeAlignPhylip(al, f) f.Close() } - i++ + if aligns.Err != nil { + err = aligns.Err + io.LogError(err) + } } - if 
aligns.Err != nil { - err = aligns.Err - io.LogError(err) - } return }, } @@ -71,6 +165,8 @@ gotree divide -i align.ph -p -o out func init() { RootCmd.AddCommand(divideCmd) divideCmd.PersistentFlags().StringVarP(&divideOutput, "output", "o", "prefix", "Divided alignment output files prefix") - divideCmd.PersistentFlags().BoolVarP(&divideoutputFasta, "out-fasta", "f", false, "Output files in fasta format") + divideCmd.PersistentFlags().IntVar(&divideNbSeqs, "nb-sequences", 0, "Number of sequences per output file (<=0 : all sequences, >0: each alignment will be additionnaly split in several alignments)") + divideCmd.PersistentFlags().BoolVar(&divideUnaligned, "unaligned", false, "Considers sequences as unaligned and format fasta (phylip, nexus,... options are ignored)") + divideCmd.PersistentFlags().BoolVarP(&divideoutputFasta, "out-fasta", "f", false, "Forces output files to be in fasta format") } diff --git a/docs/commands/divide.md b/docs/commands/divide.md index e546e9c..471b639 100644 --- a/docs/commands/divide.md +++ b/docs/commands/divide.md @@ -3,7 +3,22 @@ ## Commands ### divide -This command divides an input alignment file (phylip format) containing several alignments, in multiple files containing one alignment. If alignment is in Fasta, will create only one file. Option `-o` indicates prefix of output files. +The default behavior is to take an input alignment file containing potentially several alignments (e.g. with Phylip format), and output one alignment per output file. + +If the alignment is in fasta format : will create 1 file. Otherwise, will create one output file per input alignment. + +If the option `--nb-sequences <n>` is given, then will print n sequences per output file. + +`-o` is the prefix of output files + +Ex: if `-o div`, it will create files div_000.ph...div_n.ph + +Output files will be in the same format as input files, or in fasta if `-f` is given. 
+ +Example: + +goalign divide -i align.ph -p -o out + #### Usage ``` @@ -11,25 +26,98 @@ Usage: goalign divide [flags] Flags: - -f, --out-fasta Output files in fasta format (default, same as input) - -o, --output string Divided alignment output files prefix (default "prefix") + -h, --help help for divide + --nb-sequences int Number of sequences per output file (<=0 : all sequences, >0: each alignment will be additionnaly split in several alignments) + -f, --out-fasta Forces output files to be in fasta format + -o, --output string Divided alignment output files prefix (default "prefix") + --unaligned Considers sequences as unaligned and format fasta (phylip, nexus,... options are ignored) Global Flags: - -i, --align string Alignment input file (default "stdin") - -p, --phylip Alignment is in phylip? False=Fasta - --input-strict Strict phylip input format (only used with -p) - --output-strict Strict phylip output format (only used with -p) + -i, --align string Alignment input file (default "stdin") + --auto-detect Auto detects input format (overrides -p, -x and -u) + -u, --clustal Alignment is in clustal? default fasta + --ignore-identical int Ignore duplicated sequences that have the same name and potentially have same sequences, 0 : Does not ignore anything, 1: Ignore sequences having the same name (keep the first one whatever their sequence), 2: Ignore sequences having the same name and the same sequence + --input-strict Strict phylip input format (only used with -p) + -x, --nexus Alignment is in nexus? default fasta + --no-block Write Phylip sequences without space separated blocks (only used with -p) + --one-line Write Phylip sequences on 1 line (only used with -p) + --output-strict Strict phylip output format (only used with -p) + -p, --phylip Alignment is in phylip? 
default fasta ``` #### Examples -* Generating a random tree with 5 tips ([Gotree](https://github.com/evolbioinfo/gotree)), simulating 3 alignments from this tree ([seq-gen](https://github.com/rambaut/Seq-Gen)), and writing them in 3 independ fasta files: +Input file, input.phylip: ``` -gotree generate yuletree -l 5 --seed 1 -o true_tree.nw -seq-gen -op -mGTR -l500 -z 2 -n 3 true_tree.nw | goalign divide -p -f -o align + 11 10 +Seq0000 GATTAATTTG +Seq0001 CCGTAGGCCA +Seq0002 GAATCTGAAG +Seq0003 ATCGAACACT +Seq0004 TTAAGTTTTC +Seq0005 ACTTCTAATG +Seq0006 GAGAGGACTA +Seq0007 GTTCATACTT +Seq0008 TTTAAACACT +Seq0009 TTTACATCGA +Seq0010 TGTCGGACCT + 3 10 +Seq0000 GATTAATTTG +Seq0001 CCGTAGGCCA +Seq0002 GAATCTGAAG ``` -There should be three files in the current directory: -* `align_000.fa` -* `align_001.fa` -* `align_002.fa` +* `goalign divide -i input.phylip -p -o divprefix -f --nb-sequences 2` will generate the following files: + +* `divprefix_000.fa` +``` +>Seq0000 +GATTAATTTG +>Seq0001 +CCGTAGGCCA +``` +* `divprefix_001.fa` +``` +>Seq0002 +GAATCTGAAG +>Seq0003 +ATCGAACACT +``` +* `divprefix_002.fa` +``` +>Seq0004 +TTAAGTTTTC +>Seq0005 +ACTTCTAATG +``` +* `divprefix_003.fa` +``` +>Seq0006 +GAGAGGACTA +>Seq0007 +GTTCATACTT +``` +* `divprefix_004.fa` +``` +>Seq0008 +TTTAAACACT +>Seq0009 +TTTACATCGA +``` +* `divprefix_005.fa` +``` +>Seq0010 +TGTCGGACCT +``` +* `divprefix_006.fa` +``` +>Seq0000 +GATTAATTTG +>Seq0001 +CCGTAGGCCA +``` +* `divprefix_007.fa` +``` +>Seq0002 +GAATCTGAAG +``` diff --git a/test.sh b/test.sh index e2b207d..cf7055e 100755 --- a/test.sh +++ b/test.sh @@ -1675,6 +1675,242 @@ cat divprefix_* > result diff -q -b result expected rm -f expected result divprefix* input +echo "->goalign divide --nb-sequences" +cat > expected.0 <<EOF +>Seq0000 +GATTAATTTG +>Seq0001 +CCGTAGGCCA +EOF +cat > expected.1 <<EOF +>Seq0002 +GAATCTGAAG +>Seq0003 +ATCGAACACT +EOF +cat > expected.2 <<EOF +>Seq0004 +TTAAGTTTTC +>Seq0005 +ACTTCTAATG +EOF +cat > expected.3 <<EOF +>Seq0006 +GAGAGGACTA +>Seq0007 +GTTCATACTT 
+EOF +cat > expected.4 <<EOF +>Seq0008 +TTTAAACACT +>Seq0009 +TTTACATCGA +EOF +cat > expected.5 <<EOF +>Seq0010 +TGTCGGACCT +EOF + +rm -f input +${GOALIGN} random -n 11 -l 10 --seed 10 -p > input +${GOALIGN} divide -i input -p -o divprefix -f --nb-sequences 2 +for i in {0..5} +do + diff -q -b divprefix_00${i}.fa expected.${i} + rm divprefix_00${i}.fa expected.${i} +done +rm -f input + +echo "->goalign divide several aligns --nb-sequences" +cat > input <<EOF + 11 10 +Seq0000 GATTAATTTG +Seq0001 CCGTAGGCCA +Seq0002 GAATCTGAAG +Seq0003 ATCGAACACT +Seq0004 TTAAGTTTTC +Seq0005 ACTTCTAATG +Seq0006 GAGAGGACTA +Seq0007 GTTCATACTT +Seq0008 TTTAAACACT +Seq0009 TTTACATCGA +Seq0010 TGTCGGACCT + 3 10 +Seq0000 GATTAATTTG +Seq0001 CCGTAGGCCA +Seq0002 GAATCTGAAG +EOF +cat > expected.0 <<EOF +>Seq0000 +GATTAATTTG +>Seq0001 +CCGTAGGCCA +EOF +cat > expected.1 <<EOF +>Seq0002 +GAATCTGAAG +>Seq0003 +ATCGAACACT +EOF +cat > expected.2 <<EOF +>Seq0004 +TTAAGTTTTC +>Seq0005 +ACTTCTAATG +EOF +cat > expected.3 <<EOF +>Seq0006 +GAGAGGACTA +>Seq0007 +GTTCATACTT +EOF +cat > expected.4 <<EOF +>Seq0008 +TTTAAACACT +>Seq0009 +TTTACATCGA +EOF +cat > expected.5 <<EOF +>Seq0010 +TGTCGGACCT +EOF +cat > expected.6 <<EOF +>Seq0000 +GATTAATTTG +>Seq0001 +CCGTAGGCCA +EOF +cat > expected.7 <<EOF +>Seq0002 +GAATCTGAAG +EOF + +${GOALIGN} divide -i input -p -o divprefix -f --nb-sequences 2 +for i in {0..7} +do + diff -q -b divprefix_00${i}.fa expected.${i} + rm divprefix_00${i}.fa expected.${i} +done +rm -f input + + +echo "->goalign divide --unaligned" +cat > input <<EOF +>Seq0000 +G +>Seq0001 +CA +>Seq0002 +AAG +>Seq0003 +CACT +>Seq0004 +TTTTC +>Seq0005 +CTAATG +>Seq0006 +AGGACTA +>Seq0007 +TCATACTT +>Seq0008 +TTAAACACT +>Seq0009 +TTTACATCGA +EOF +cat > expected <<EOF +>Seq0000 +G +>Seq0001 +CA +>Seq0002 +AAG +>Seq0003 +CACT +>Seq0004 +TTTTC +>Seq0005 +CTAATG +>Seq0006 +AGGACTA +>Seq0007 +TCATACTT +>Seq0008 +TTAAACACT +>Seq0009 +TTTACATCGA +EOF +${GOALIGN} divide -i input -o divprefix --unaligned +diff -q -b divprefix_000.fa expected +rm -f expected input divprefix* + +echo "->goalign divide --nb-sequences --unaligned" +cat > input <<EOF +>Seq0000 +G +>Seq0001 +CA +>Seq0002 +AAG +>Seq0003 +CACT +>Seq0004 +TTTTC +>Seq0005 +CTAATG +>Seq0006 +AGGACTA +>Seq0007 +TCATACTT +>Seq0008 +TTAAACACT +>Seq0009 +TTTACATCGA +>Seq00010 +AAAAAAAAAA +EOF +cat > expected.0 <<EOF +>Seq0000 +G +>Seq0001 +CA +EOF +cat > 
expected.1 <<EOF +>Seq0002 +AAG +>Seq0003 +CACT +EOF +cat > expected.2 <<EOF +>Seq0004 +TTTTC +>Seq0005 +CTAATG +EOF +cat > expected.3 <<EOF +>Seq0006 +AGGACTA +>Seq0007 +TCATACTT +EOF +cat > expected.4 <<EOF +>Seq0008 +TTAAACACT +>Seq0009 +TTTACATCGA +EOF +cat > expected.5 <<EOF +>Seq00010 +AAAAAAAAAA +EOF + +${GOALIGN} divide -i input -o divprefix --unaligned --nb-sequences 2 +for i in {0..5} +do + diff -q -b divprefix_00${i}.fa expected.${i} + rm divprefix_00${i}.fa expected.${i} +done +rm -f input + echo "->goalign mutate gaps" cat > expected <