From 110f0fe23d8ee28dbd6937b77aa4cac5d17474dc Mon Sep 17 00:00:00 2001 From: shiltemann Date: Mon, 23 May 2016 16:46:10 +0200 Subject: [PATCH 1/5] add mothur.tre datatype (subclass) --- config/datatypes_conf.xml.sample | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/config/datatypes_conf.xml.sample b/config/datatypes_conf.xml.sample index a2216a69f35e..87f79f614453 100644 --- a/config/datatypes_conf.xml.sample +++ b/config/datatypes_conf.xml.sample @@ -318,9 +318,9 @@ - - - + + + @@ -511,6 +511,7 @@ + From f63ab5c38446a2a43dd6532ec064f52eb8445f54 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Mon, 23 May 2016 16:47:10 +0200 Subject: [PATCH 2/5] scan entire file when setting metadata for large group files this would not find all groups --- lib/galaxy/datatypes/mothur.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/datatypes/mothur.py b/lib/galaxy/datatypes/mothur.py index a17a5f07cbac..6f6a266e525f 100644 --- a/lib/galaxy/datatypes/mothur.py +++ b/lib/galaxy/datatypes/mothur.py @@ -133,7 +133,7 @@ def init_meta(self, dataset, copy_from=None): def init_meta(self, dataset, copy_from=None): Otu.init_meta(self, dataset, copy_from=copy_from) - def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=100000, **kwd): + def set_meta(self, dataset, overwrite=True, skip=1, **kwd): # See if file starts with header line if dataset.has_data(): label_names = set() @@ -142,7 +142,7 @@ def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=100000, **kwd comment_lines = 0 ncols = 0 - headers = get_headers(dataset.file_name, sep='\t', count=max_data_lines) + headers = get_headers(dataset.file_name, sep='\t', count=-1) for line in headers: if line[0] == 'label' and line[1] == 'Group': skip = 1 @@ -516,7 +516,7 @@ def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kw Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) group_names = set() - headers = 
get_headers(dataset.file_name, sep='\t') + headers = get_headers(dataset.file_name, sep='\t', count=-1) for line in headers: if len(line) > 1: group_names.add(line[1]) @@ -897,7 +897,7 @@ def __init__(self, **kwd): def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd): Tabular.set_meta(self, dataset, overwrite, 1, max_data_lines) - headers = get_headers(dataset.file_name, sep='\t') + headers = get_headers(dataset.file_name, sep='\t', count=-1) try: flow_values = int(headers[0][0]) dataset.metadata.flow_values = flow_values From b83f1b8d24536fc79625cc11da9e47d90e47752a Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 27 May 2016 15:06:58 +0200 Subject: [PATCH 3/5] add otulabels metadata to mothur.otu datatype --- lib/galaxy/datatypes/mothur.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/galaxy/datatypes/mothur.py b/lib/galaxy/datatypes/mothur.py index 6f6a266e525f..202e55cf7b78 100644 --- a/lib/galaxy/datatypes/mothur.py +++ b/lib/galaxy/datatypes/mothur.py @@ -16,6 +16,7 @@ class Otu(Text): file_ext = 'mothur.otu' MetadataElement(name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0) MetadataElement(name="labels", default=[], desc="Label Names", readonly=True, visible=True, no_value=[]) + MetadataElement(name="otulabels", default=[], desc="OTU Names", readonly=True, visible=True, no_value=[]) def __init__(self, **kwd): Text.__init__(self, **kwd) @@ -23,11 +24,16 @@ def __init__(self, **kwd): def set_meta(self, dataset, overwrite=True, **kwd): if dataset.has_data(): label_names = set() + otulabel_names = set() ncols = 0 data_lines = 0 comment_lines = 0 headers = get_headers(dataset.file_name, sep='\t', count=-1) + # set otulabels + if len(headers[0]) > 2: + otulabel_names = headers[0][2:] + # set label names and number of lines for line in headers: if len(line) >= 2 and not line[0].startswith('@'): data_lines += 1 @@ -40,6 +46,8 @@ def set_meta(self, dataset, overwrite=True, 
**kwd): dataset.metadata.columns = ncols dataset.metadata.labels = list(label_names) dataset.metadata.labels.sort() + dataset.metadata.otulabels = list(otulabel_names) + dataset.metadata.otulabels.sort() def sniff(self, filename): """ From 61589b378ace13a853a952b892cae4d8ceaea92b Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 31 May 2016 14:25:29 +0200 Subject: [PATCH 4/5] minimize number of lines read in set_meta functions --- lib/galaxy/datatypes/mothur.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/lib/galaxy/datatypes/mothur.py b/lib/galaxy/datatypes/mothur.py index 202e55cf7b78..826770ab5df7 100644 --- a/lib/galaxy/datatypes/mothur.py +++ b/lib/galaxy/datatypes/mothur.py @@ -265,14 +265,11 @@ def __init__(self, **kwd): self.comment_lines = 1 def set_meta(self, dataset, overwrite=True, **kwd): - data_lines = 0 - headers = get_headers(dataset.file_name, sep='\t', count=-1) - for line in headers: - data_lines += 1 - dataset.metadata.comment_lines = 1 - dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0 + Tabular.set_meta(self, dataset, overwrite=overwrite, **kwd) dataset.metadata.column_names = self.column_names dataset.metadata.column_types = self.column_types + dataset.metadata.comment_lines = self.comment_lines + dataset.metadata.data_lines -= self.comment_lines class AlignReport(Tabular): @@ -302,7 +299,7 @@ def init_meta(self, dataset, copy_from=None): def set_meta(self, dataset, overwrite=True, skip=0, **kwd): Text.set_meta(self, dataset, overwrite=overwrite, skip=skip, **kwd) - headers = get_headers(dataset.file_name, sep='\t', count=-1) + headers = get_headers(dataset.file_name, sep='\t') for line in headers: if not line[0].startswith('@'): try: @@ -714,7 +711,7 @@ def __init__(self, **kwd): U68595 1 U68600 1 # Example 2 (with group columns): - Representative_Sequence total forest pastur + Representative_Sequence total forest pasture U68630 1 1 0 U68595 1 1 0 U68600 1 1 0 @@ -725,19 
+722,17 @@ def __init__(self, **kwd): self.column_names = ['name', 'total'] def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd): - data_lines = 0 - headers = get_headers(dataset.file_name, sep='\t', count=-1) + Tabular.set_meta(self, dataset, overwrite=overwrite, **kwd) + headers = get_headers(dataset.file_name, sep='\t', count=1) colnames = headers[0] dataset.metadata.column_types = ['str'] + (['int'] * ( len(headers[0]) - 1)) if len(colnames) > 1: dataset.metadata.columns = len(colnames) if len(colnames) > 2: dataset.metadata.groups = colnames[2:] - for line in headers[1:]: - data_lines += 1 dataset.metadata.comment_lines = 1 - dataset.metadata.data_lines = data_lines + dataset.metadata.data_lines -= 1 class RefTaxonomy(Tabular): @@ -905,7 +900,7 @@ def __init__(self, **kwd): def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd): Tabular.set_meta(self, dataset, overwrite, 1, max_data_lines) - headers = get_headers(dataset.file_name, sep='\t', count=-1) + headers = get_headers(dataset.file_name, sep='\t', count=1) try: flow_values = int(headers[0][0]) dataset.metadata.flow_values = flow_values From abdae9539b2e2c7fc4b11973d13fd3148c49db2f Mon Sep 17 00:00:00 2001 From: shiltemann Date: Thu, 2 Jun 2016 13:19:56 +0200 Subject: [PATCH 5/5] update superclass accessing --- lib/galaxy/datatypes/mothur.py | 72 ++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/lib/galaxy/datatypes/mothur.py b/lib/galaxy/datatypes/mothur.py index 826770ab5df7..a7b3cdb4dffc 100644 --- a/lib/galaxy/datatypes/mothur.py +++ b/lib/galaxy/datatypes/mothur.py @@ -19,9 +19,11 @@ class Otu(Text): MetadataElement(name="otulabels", default=[], desc="OTU Names", readonly=True, visible=True, no_value=[]) def __init__(self, **kwd): - Text.__init__(self, **kwd) + super(Otu, self).__init__(**kwd) def set_meta(self, dataset, overwrite=True, **kwd): + super(Otu, self).set_meta(dataset, overwrite=overwrite, 
**kwd) + if dataset.has_data(): label_names = set() otulabel_names = set() @@ -88,10 +90,10 @@ def __init__(self, **kwd): """ http://www.mothur.org/wiki/Sabund_file """ - Otu.__init__(self, **kwd) + super(Sabund, self).__init__(**kwd) def init_meta(self, dataset, copy_from=None): - Otu.init_meta(self, dataset, copy_from=copy_from) + super(Sabund, self).init_meta(dataset, copy_from=copy_from) def sniff(self, filename): """ @@ -132,16 +134,18 @@ class GroupAbund(Otu): MetadataElement(name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[]) def __init__(self, **kwd): - Otu.__init__(self, **kwd) + super(GroupAbund, self).__init__(**kwd) """ def init_meta(self, dataset, copy_from=None): Otu.init_meta(self, dataset, copy_from=copy_from) """ def init_meta(self, dataset, copy_from=None): - Otu.init_meta(self, dataset, copy_from=copy_from) + super(GroupAbund, self).init_meta(dataset, copy_from=copy_from) def set_meta(self, dataset, overwrite=True, skip=1, **kwd): + super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd) + # See if file starts with header line if dataset.has_data(): label_names = set() @@ -215,7 +219,7 @@ class SecondaryStructureMap(Tabular): def __init__(self, **kwd): """Initialize secondary structure map datatype""" - Tabular.__init__(self, **kwd) + super(SecondaryStructureMap, self).__init__(**kwd) self.column_names = ['Map'] def sniff(self, filename): @@ -259,13 +263,14 @@ class AlignCheck(Tabular): def __init__(self, **kwd): """Initialize AlignCheck datatype""" - Tabular.__init__(self, **kwd) + super(AlignCheck, self).__init__(**kwd) self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total'] self.column_types = ['str', 'int', 'int', 'int', 'int', 'int', 'int', 'int'] self.comment_lines = 1 def set_meta(self, dataset, overwrite=True, **kwd): - Tabular.set_meta(self, dataset, overwrite=overwrite, **kwd) + super(AlignCheck, self).set_meta(dataset, overwrite=overwrite, **kwd) + 
dataset.metadata.column_names = self.column_names dataset.metadata.column_types = self.column_types dataset.metadata.comment_lines = self.comment_lines @@ -281,7 +286,7 @@ class AlignReport(Tabular): def __init__(self, **kwd): """Initialize AlignCheck datatype""" - Tabular.__init__(self, **kwd) + super(AlignReport, self).__init__(**kwd) self.column_names = ['QueryName', 'QueryLength', 'TemplateName', 'TemplateLength', 'SearchMethod', 'SearchScore', 'AlignmentMethod', 'QueryStart', 'QueryEnd', 'TemplateStart', 'TemplateEnd', 'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate', 'LongestInsert', 'SimBtwnQuery&Template' @@ -294,10 +299,10 @@ class DistanceMatrix(Text): MetadataElement(name="sequence_count", default=0, desc="Number of sequences", readonly=True, visible=True, optional=True, no_value='?') def init_meta(self, dataset, copy_from=None): - Text.init_meta(self, dataset, copy_from=copy_from) + super(DistanceMatrix, self).init_meta(dataset, copy_from=copy_from) def set_meta(self, dataset, overwrite=True, skip=0, **kwd): - Text.set_meta(self, dataset, overwrite=overwrite, skip=skip, **kwd) + super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd) headers = get_headers(dataset.file_name, sep='\t') for line in headers: @@ -314,10 +319,10 @@ class LowerTriangleDistanceMatrix(DistanceMatrix): def __init__(self, **kwd): """Initialize secondary structure map datatype""" - DistanceMatrix.__init__(self, **kwd) + super(LowerTriangleDistanceMatrix, self).__init__(**kwd) def init_meta(self, dataset, copy_from=None): - DistanceMatrix.init_meta(self, dataset, copy_from=copy_from) + super(LowerTriangleDistanceMatrix, self).init_meta(dataset, copy_from=copy_from) def sniff(self, filename): """ @@ -376,10 +381,10 @@ class SquareDistanceMatrix(DistanceMatrix): file_ext = 'mothur.square.dist' def __init__(self, **kwd): - DistanceMatrix.__init__(self, **kwd) + super(SquareDistanceMatrix, self).__init__(**kwd) def init_meta(self, dataset,
copy_from=None): - DistanceMatrix.init_meta(self, dataset, copy_from=copy_from) + super(SquareDistanceMatrix, self).init_meta(dataset, copy_from=copy_from) def sniff(self, filename): """ @@ -437,12 +442,12 @@ class PairwiseDistanceMatrix(DistanceMatrix, Tabular): def __init__(self, **kwd): """Initialize secondary structure map datatype""" - Tabular.__init__(self, **kwd) + super(PairwiseDistanceMatrix, self).__init__(**kwd) self.column_names = ['Sequence', 'Sequence', 'Distance'] self.column_types = ['str', 'str', 'float'] def set_meta(self, dataset, overwrite=True, skip=None, **kwd): - Tabular.set_meta(self, dataset, overwrite=overwrite, skip=skip, **kwd) + super(PairwiseDistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd) def sniff(self, filename): """ @@ -489,7 +494,7 @@ def __init__(self, **kwd): http://www.mothur.org/wiki/Name_file Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2) """ - Tabular.__init__(self, **kwd) + super(Names, self).__init__(**kwd) self.column_names = ['name', 'representatives'] self.columns = 2 @@ -499,7 +504,7 @@ class Summary(Tabular): def __init__(self, **kwd): """summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file""" - Tabular.__init__(self, **kwd) + super(Summary, self).__init__(**kwd) self.column_names = ['seqname', 'start', 'end', 'nbases', 'ambigs', 'polymer'] self.columns = 6 @@ -513,14 +518,14 @@ def __init__(self, **kwd): http://www.mothur.org/wiki/Groups_file Group file assigns sequence (col 1) to a group (col 2) """ - Tabular.__init__(self, **kwd) + super(Group, self).__init__(**kwd) self.column_names = ['name', 'group'] self.columns = 2 def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd): - Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) - group_names = set() + super(Group, self).set_meta(dataset, overwrite, skip,
max_data_lines) + group_names = set() headers = get_headers(dataset.file_name, sep='\t', count=-1) for line in headers: if len(line) > 1: @@ -533,7 +538,7 @@ class AccNos(Tabular): def __init__(self, **kwd): """A list of names""" - Tabular.__init__(self, **kwd) + super(AccNos, self).__init__(**kwd) self.column_names = ['name'] self.columns = 1 @@ -577,7 +582,7 @@ class Frequency(Tabular): def __init__(self, **kwd): """A list of names""" - Tabular.__init__(self, **kwd) + super(Frequency, self).__init__(**kwd) self.column_names = ['position', 'frequency'] self.column_types = ['int', 'float'] @@ -629,7 +634,7 @@ class Quantile(Tabular): def __init__(self, **kwd): """Quantiles for chimera analysis""" - Tabular.__init__(self, **kwd) + super(Quantile, self).__init__(**kwd) self.column_names = ['num', 'ten', 'twentyfive', 'fifty', 'seventyfive', 'ninetyfive', 'ninetynine'] self.column_types = ['int', 'float', 'float', 'float', 'float', 'float', 'float'] @@ -718,11 +723,12 @@ def __init__(self, **kwd): U68591 1 1 0 U68647 1 0 1 """ - Tabular.__init__(self, **kwd) + super(CountTable, self).__init__(**kwd) self.column_names = ['name', 'total'] def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd): - Tabular.set_meta(self, dataset, overwrite=overwrite, **kwd) + super(CountTable, self).set_meta(dataset, overwrite=overwrite, **kwd) + headers = get_headers(dataset.file_name, sep='\t', count=1) colnames = headers[0] dataset.metadata.column_types = ['str'] + (['int'] * ( len(headers[0]) - 1)) @@ -739,7 +745,7 @@ class RefTaxonomy(Tabular): file_ext = 'mothur.ref.taxonomy' def __init__(self, **kwd): - Tabular.__init__(self, **kwd) + super(RefTaxonomy, self).__init__(**kwd) self.column_names = ['name', 'taxonomy'] def sniff(self, filename): @@ -799,7 +805,7 @@ class ConsensusTaxonomy(Tabular): def __init__(self, **kwd): """A list of names""" - Tabular.__init__(self, **kwd) + super(ConsensusTaxonomy, self).__init__(**kwd) self.column_names = ['OTU', 'count', 
'taxonomy'] @@ -808,7 +814,7 @@ class TaxonomySummary(Tabular): def __init__(self, **kwd): """A Summary of taxon classification""" - Tabular.__init__(self, **kwd) + super(TaxonomySummary, self).__init__(**kwd) self.column_names = ['taxlevel', 'rankID', 'taxon', 'daughterlevels', 'total'] @@ -817,7 +823,7 @@ class Axes(Tabular): def __init__(self, **kwd): """Initialize axes datatype""" - Tabular.__init__(self, **kwd) + super(Axes, self).__init__(**kwd) def sniff(self, filename): """ @@ -895,10 +901,10 @@ class SffFlow(Tabular): GQY1XT001CF5YW 88 1.02 0.02 1.01 0.04 0.06 1.02 0.03 ... """ def __init__(self, **kwd): - Tabular.__init__(self, **kwd) + super(SffFlow, self).__init__(**kwd) def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd): - Tabular.set_meta(self, dataset, overwrite, 1, max_data_lines) + super(SffFlow, self).set_meta(dataset, overwrite, 1, max_data_lines) headers = get_headers(dataset.file_name, sep='\t', count=1) try: