From 732ab49679c145ee1beb0e633a50f48381118d38 Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Tue, 14 Jan 2020 10:54:17 +0100 Subject: [PATCH] Corrected partition string + Added partitions to bootstrap --- align/align.go | 6 +- align/partition.go | 16 +++-- cmd/bootstrap.go | 167 ++++++++++++++++++++++++--------------------- cmd/root.go | 12 ---- cmd/split.go | 8 +-- test.sh | 60 +++++++++++++--- 6 files changed, 161 insertions(+), 108 deletions(-) diff --git a/align/align.go b/align/align.go index 1be1996..eb4805e 100644 --- a/align/align.go +++ b/align/align.go @@ -1389,7 +1389,11 @@ func (a *align) Split(part *PartitionSet) (als []Alignment, err error) { alsimpl[pi].seqs[si].sequence = append(alsimpl[pi].seqs[si].sequence, seq.sequence[pos]) } } - alsimpl[pi].length++ + if firstpos { + alsimpl[pi].length = 1 + } else { + alsimpl[pi].length++ + } firstpos = false } } diff --git a/align/partition.go b/align/partition.go index 8a16fbb..f9596ab 100644 --- a/align/partition.go +++ b/align/partition.go @@ -77,10 +77,6 @@ func (ps *PartitionSet) CheckSites() (err error) { func (ps *PartitionSet) String() string { var buffer bytes.Buffer - for j, p := range ps.partitions { - fmt.Println(j, p) - } - for i, pn := range ps.names { buffer.WriteString(ps.models[i]) buffer.WriteString(",") @@ -91,7 +87,6 @@ func (ps *PartitionSet) String() string { for j, p := range ps.partitions { if p == i { - fmt.Println(pn, i) if start == -1 { start = j end = j @@ -148,7 +143,7 @@ func (ps *PartitionSet) Partition(position int) int { return ps.partitions[position] } -// Returns the name of the partition associated to the given code +// Returns the name of the partition associated to the given index // If the code does not exist, then returns "" func (ps *PartitionSet) PartitionName(code int) string { if code < 0 || code > len(ps.names) { @@ -157,6 +152,15 @@ func (ps *PartitionSet) PartitionName(code int) string { return ps.names[code] } +// Returns the name of the modele associated to the given index +// If the code does not exist, then returns "" +func (ps *PartitionSet) ModeleName(code int) string { + if code < 0 || code > len(ps.models) { + return "" + } + return ps.models[code] +} + // returns the length of the alignment func (ps *PartitionSet) AliLength() int { return ps.length diff --git a/cmd/bootstrap.go b/cmd/bootstrap.go index 3175a33..737dcda 100644 --- a/cmd/bootstrap.go +++ b/cmd/bootstrap.go @@ -6,13 +6,12 @@ import ( "compress/gzip" "errors" "fmt" - "github.com/spf13/cobra" "os" - "sync" "time" "github.com/evolbioinfo/goalign/align" "github.com/evolbioinfo/goalign/io" + "github.com/spf13/cobra" ) var bootstrapNb int @@ -20,11 +19,8 @@ var bootstrapoutprefix string var bootstrapOrder bool var bootstraptar bool var bootstrapgz bool - -type outboot struct { - bootstr string - name string -} +var bootstrappartitionstr string +var bootstrapoutputpartitionstr string // seqbootCmd represents the bootstrap command var seqbootCmd = &cobra.Command{ @@ -44,8 +40,7 @@ The input may be a Phylip or Fasta file. files will be in fasta format as well. - It is possible to give a initial seed (--seed). In this case several runs of - the tool will give the exact same results (if run on 1 thread, an thus - computations are done on a single thread in this case). + the tool will give the exact same results. Example of usage: @@ -53,9 +48,18 @@ goalign build seqboot -i align.phylip -p -n 500 -o boot --tar-gz goalign build seqboot -i align.phylip -p -n 500 -o boot_ `, RunE: func(cmd *cobra.Command, args []string) (err error) { - var aligns *align.AlignChannel + var alignChan *align.AlignChannel + var aligns []align.Alignment + var al align.Alignment + var f *os.File + var tw *tar.Writer + var gw *gzip.Writer + var inputpartition, outputpartition *align.PartitionSet + var bootstring string + var boot, tmpboot align.Alignment - if aligns, err = readalign(infile); err != nil { + // We read input alignment + if alignChan, err = readalign(infile); err != nil { io.LogError(err) return } @@ -66,70 +70,34 @@ goalign build seqboot -i align.phylip -p -n 500 -o boot_ return } - var f *os.File - var tw *tar.Writer - var gw *gzip.Writer - - align, _ := <-aligns.Achan - if aligns.Err != nil { - err = aligns.Err + // We take the first alignment of the channel + al, _ = <-alignChan.Achan + if alignChan.Err != nil { + err = alignChan.Err io.LogError(err) return } - bootidx := make(chan int, 100) - outchan := make(chan outboot, 100) - - cpus := rootcpus - if bootstraptar { - cpus = min_int(1, cpus-1) - } - - go func() { - for i := 0; i < bootstrapNb; i++ { - bootidx <- i + // If a partition file is given, then we parse it + if bootstrappartitionstr != "none" { + if inputpartition, err = parsePartition(bootstrappartitionstr, al.Length()); err != nil { + io.LogError(err) + return } - close(bootidx) - }() - - var wg sync.WaitGroup // For waiting end of step computation - // Seed is set => 1 thread - if cmd.Flags().Changed("seed") { - cpus = 1 - } - for i := 0; i < cpus; i++ { - wg.Add(1) - go func() { - var bootstring string - for idx := range bootidx { - bootid := bootstrapoutprefix + fmt.Sprintf("%d", idx) - boot := align.BuildBootstrap() - if bootstrapOrder { - boot.ShuffleSequences() - } - - bootstring = writeAlignString(boot) - - // Output - if bootstraptar { - outchan <- outboot{bootstring, bootid} - } else { - if err2 := writenewfile(bootid, bootstrapgz, bootstring); err2 != nil { - io.LogError(err2) - err = err2 - return - } - } - } - wg.Done() - }() + if err = inputpartition.CheckSites(); err != nil { + io.LogError(err) + return + } + if aligns, err = al.Split(inputpartition); err != nil { + io.LogError(err) + return + } + outputpartition = align.NewPartitionSet(al.Length()) + //fmt.Println(bootstrappartition.String()) + } else { + aligns = []align.Alignment{al} } - go func() { - wg.Wait() - close(outchan) - }() - // Create new tar(/gz) file if bootstraptar { if bootstrapgz { @@ -152,16 +120,63 @@ goalign build seqboot -i align.phylip -p -n 500 -o boot_ defer tw.Close() } - idx := 0 - for oboot := range outchan { + for idx := 0; idx < bootstrapNb; idx++ { + boot = nil + bootid := bootstrapoutprefix + fmt.Sprintf("%d", idx) + // There may be several alignments to process if there are + // several partitions. We generate bootstrap replicates + // for each partition, and then concatenate them all. + for _, a := range aligns { + tmpboot = a.BuildBootstrap() + if boot == nil { + boot = tmpboot + } else { + if err = boot.Concat(tmpboot); err != nil { + io.LogError(err) + return + } + } + } + // We shuffle sequence order + if bootstrapOrder { + boot.ShuffleSequences() + } + + bootstring = writeAlignString(boot) + + // Output if bootstraptar { - if err = addstringtotargz(tw, oboot.name, oboot.bootstr); err != nil { + if err = addstringtotargz(tw, bootid, bootstring); err != nil { + io.LogError(err) + return + } + } else { + if err = writenewfile(bootid+alignExtension(), bootstrapgz, bootstring); err != nil { io.LogError(err) return } } - idx++ } + + var start, end int = 0, 0 + if outputpartition != nil { + for i, a := range aligns { + start = end + end = start + a.Length() + // We initialize an outputpartition + // Which will have all the sites of each + // partition grouped together. + outputpartition.AddRange( + inputpartition.PartitionName(i), + inputpartition.ModeleName(i), + start, end-1, 1) + } + if bootstrapoutputpartitionstr == "" { + bootstrapoutputpartitionstr = bootstrappartitionstr + "_boot" + } + writenewfile(bootstrapoutputpartitionstr, false, outputpartition.String()) + } + return }, } @@ -169,10 +184,8 @@ goalign build seqboot -i align.phylip -p -n 500 -o boot_ func writenewfile(name string, gz bool, bootstring string) (err error) { var f *os.File - ext := alignExtension() - if gz { - if f, err = os.Create(name + ext + ".gz"); err != nil { + if f, err = os.Create(name + ".gz"); err != nil { return } else { gw := gzip.NewWriter(f) @@ -183,7 +196,7 @@ func writenewfile(name string, gz bool, bootstring string) (err error) { f.Close() } } else { - if f, err = os.Create(name + ext); err != nil { + if f, err = os.Create(name); err != nil { return } else { f.WriteString(bootstring) @@ -228,5 +241,7 @@ func init() { seqbootCmd.PersistentFlags().BoolVar(&bootstraptar, "tar", false, "Will create a single tar file with all bootstrap alignments (one thread for tar, but not a bottleneck)") seqbootCmd.PersistentFlags().BoolVar(&bootstrapgz, "gz", false, "Will gzip output file(s). Maybe slow if combined with --tar (only one thread working for tar/gz)") seqbootCmd.PersistentFlags().IntVarP(&bootstrapNb, "nboot", "n", 1, "Number of bootstrap replicates to build") + seqbootCmd.PersistentFlags().StringVar(&bootstrappartitionstr, "partition", "none", "File containing definition of the partitions") + seqbootCmd.PersistentFlags().StringVar(&bootstrapoutputpartitionstr, "out-partition", "", "File containing output partitions (default: same name as input partition with _boot suffix)") seqbootCmd.PersistentFlags().StringVarP(&bootstrapoutprefix, "out-prefix", "o", "none", "Prefix of output bootstrap files") } diff --git a/cmd/root.go b/cmd/root.go index 9956472..245a642 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -227,18 +227,6 @@ func writeAlign(al align.Alignment, f *os.File) { } } -func extension() string { - if rootphylip { - return ".phy" - } else if rootnexus { - return ".nx" - } else if rootclustal { - return ".clustal" - } else { - return ".fasta" - } -} - func writeAlignString(al align.Alignment) (out string) { if rootphylip { out = phylip.WriteAlignment(al, rootoutputstrict, rootoutputoneline, rootoutputnoblock) diff --git a/cmd/split.go b/cmd/split.go index fc2fff6..17d5a17 100644 --- a/cmd/split.go +++ b/cmd/split.go @@ -2,9 +2,10 @@ package cmd import ( "fmt" - "github.com/spf13/cobra" "os" + "github.com/spf13/cobra" + "github.com/evolbioinfo/goalign/align" "github.com/evolbioinfo/goalign/io" ) @@ -64,7 +65,7 @@ goalign split -i align.phylip --partition partition.txt } for i, a := range splitAligns { - name := splitprefix + splitpartition.PartitionName(i) + extension() + name := splitprefix + splitpartition.PartitionName(i) + alignExtension() if f, err = openWriteFile(name); err != nil { io.LogError(err) return @@ -80,7 +81,6 @@ goalign split -i align.phylip --partition partition.txt func init() { RootCmd.AddCommand(splitCmd) - splitCmd.PersistentFlags().StringVar(&splitprefix, "prefix", "", "Prefix of output files") + splitCmd.PersistentFlags().StringVarP(&splitprefix, "out-prefix", "o", "", "Prefix of output files") splitCmd.PersistentFlags().StringVar(&splitpartitionstr, "partition", "none", "File containing definition of the partitions") - splitCmd.PersistentFlags().StringVarP(&bootstrapoutprefix, "out-prefix", "o", "none", "Prefix of output bootstrap files") } diff --git a/test.sh b/test.sh index 43474a3..a5fcf02 100755 --- a/test.sh +++ b/test.sh @@ -1716,7 +1716,7 @@ InexistantSeqName EOF ${GOALIGN} random -n 4000 --seed 10 -l 10 | ${GOALIGN} subset -f namefile > result diff -q -b result expected -rm -f expected result +rm -f expected result namefile echo "->goalign subset index" cat > expected <goalign build seqboot partition" +cat > partition < input <Seq0000 +ACGACGACGACGACGACGACGACG +>Seq0001 +ACGACGACGACGACGACGACGACG +>Seq0002 +ACGACGACGACGACGACGACGACG +>Seq0003 +ACGACGACGACGACGACGACGACG +EOF +cat > expected <Seq0000 +AAAAAAAACCCCCCCCGGGGGGGG +>Seq0001 +AAAAAAAACCCCCCCCGGGGGGGG +>Seq0002 +AAAAAAAACCCCCCCCGGGGGGGG +>Seq0003 +AAAAAAAACCCCCCCCGGGGGGGG +EOF +cat > expected_outpartition <goalign codonalign" cat > input.aa <Seq0000 @@ -3486,10 +3528,10 @@ CCCCC CCCCC EOF -${GOALIGN} split -i input --partition partitions --prefix ./ -diff -q -b exp_p1 p1.fasta -diff -q -b exp_p2 p2.fasta -rm -f input exp_p1 exp_p2 partitions +${GOALIGN} split -i input --partition partitions --out-prefix ./ +diff -q -b exp_p1 p1.fa +diff -q -b exp_p2 p2.fa +rm -f input exp_p1 exp_p2 partitions p1.fa p2.fa echo "->goalign split codons/2" cat > input <