diff --git a/README.md b/README.md index 979f071..238cad5 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Code repository corresponding to paper "Detecting differential growth of microbi * python (virtualenv recommended) #### Download this repository -`git clone https://github.com/ptonner/gp_growth_phenotype.git` +`git clone https://github.com/SirRujak/gp_growth_phenotype.git` #### Setup a local python environment (optional) `virtualenv .` @@ -24,6 +24,8 @@ Code repository corresponding to paper "Detecting differential growth of microbi ## Examples +For a basic example that does not use jupyter, look at example.py in the notebooks folder. + Input to the B-GREAT method should come from two dataframes, *data* (n x p) and *meta* (p x k) where n, p, and k are: * n: number of time-points * p: number of time-course measurements (replicates) diff --git a/bgreat.py b/bgreat.py index f5621d9..e81ca99 100644 --- a/bgreat.py +++ b/bgreat.py @@ -1,8 +1,12 @@ +from __future__ import absolute_import +from __future__ import print_function import matplotlib.pyplot as plt import numpy as np import scipy, GPy, os, ast import pandas as pd import matplotlib as mpl +from six.moves import range +from six.moves import zip meta = data = parent = condition = control = None @@ -14,13 +18,13 @@ def plotSamples(samples,x=None,color='b',colors=None,plotMethod=None,label=None,*args,**kwargs): if x is None: - x = range(samples.shape[0]) + x = list(range(samples.shape[0])) if colors is None: colors = [color]*samples.shape[1] if plotMethod is None: plotMethod = plt.plot - for i,c in zip(range(samples.shape[1]),colors): + for i,c in zip(list(range(samples.shape[1])),colors): if not label is None: plotMethod(x,samples[:,i],color=c,label=label,*args,**kwargs) label = None @@ -38,7 +42,7 @@ def setGlobals(_data=None,_meta=None,_parent=None,_condition=None,_control=None) global data data = _data - data.columns = range(data.shape[1]) + data.columns = list(range(data.shape[1])) if not _meta is None: global meta @@ -234,7 +238,7 @@ def testMutants(mutants,numPerm=10,timeThin=4,dims=[],nullDim='strain-regression for i,m in enumerate(mutants): if (i + 1)%10 == 0: - print 1.*i/len(mutants),m + print(1.*i/len(mutants),m) select = ((meta.Condition==control) | (meta.Condition==condition)) & (meta.strain.isin([parent,m])) if m in results: diff --git a/gp_growth/analysis.py b/gp_growth/analysis.py index 0fc5398..5898918 100644 --- a/gp_growth/analysis.py +++ b/gp_growth/analysis.py @@ -1,9 +1,13 @@ +from __future__ import absolute_import +from __future__ import print_function import pandas as pd import numpy as np -from ConfigParser import ConfigParser +from six.moves.configparser import ConfigParser from registry import registry from gp_growth import storage import os, gc +from six.moves import range +from six.moves import zip class Analysis(): @@ -111,16 +115,16 @@ def run(self,verbosity=None): continue # build a dictionary of key-value pairs for selecting data - kwargs = dict(zip(self.groupby,vals)) + kwargs = dict(list(zip(self.groupby,vals))) temp = self.data.select(**kwargs) if verbosity > 0: - print kwargs, temp.key.shape[0] + print(kwargs, temp.key.shape[0]) if temp.key.shape[0] > self.maxSize: if verbosity > 0: - print "Number of samples to large (%d > %d)" % (temp.key.shape[0],self.maxSize) + print("Number of samples to large (%d > %d)" % (temp.key.shape[0],self.maxSize)) continue regression = temp.getData("gp") @@ -133,7 +137,7 @@ def run(self,verbosity=None): # del temp collected = gc.collect() if verbosity > 0: - print "Garbage collector: collected %d objects." % (collected) + print("Garbage collector: collected %d objects." % (collected)) if __name__ == "__main__": import sys, getopt diff --git a/gp_growth/categorical.py b/gp_growth/categorical.py index bfe87d9..b930932 100644 --- a/gp_growth/categorical.py +++ b/gp_growth/categorical.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import +from __future__ import print_function from GPy.kern import Kern,RBF from GPy.core.parameterization.transformations import Logexp from GPy.core.parameterization import Param @@ -37,15 +39,15 @@ def update_gradients_full(self, dL_dK, X, X2=None): self.variance.gradient = np.nansum(dL_dK * np.where(X == X2.T,1,0)) def update_gradients_diag(self, dL_dKdiag, X): - print "update_gradients_diag" + print("update_gradients_diag") self.variance.gradient = np.sum(dL_dKdiag * np.ones(X.shape[0])) def gradients_X(self, dL_dK, X, X2): - print "gradients_X" + print("gradients_X") if X2 is None: X2 = X return dL_dK * np.where(X == X2.T,1,0) def gradients_X_diag(self, dL_dKdiag, X): - print "gradients_X_diag" + print("gradients_X_diag") return dL_dKdiag * np.ones(X.shape[0]) \ No newline at end of file diff --git a/gp_growth/changepoint.py b/gp_growth/changepoint.py index 4bd49f4..b7f86c6 100644 --- a/gp_growth/changepoint.py +++ b/gp_growth/changepoint.py @@ -1,9 +1,12 @@ +from __future__ import absolute_import +from __future__ import print_function from gp_growth import factory from gp_growth.categorical import Categorical import GPy from GPy.core.parameterization import Param from GPy.core.parameterization.transformations import Logexp import numpy as np +from six.moves import range CP_TOL = 1e-5 @@ -130,7 +133,7 @@ class ChangepointCross(GPy.kern._src.kern.CombinationKernel): """Kernel for points across a changepoint. K(X,Y) = kc * K1(X,cp) * K2(Y,cp) """ def __init__(self,cp,kc = 1,k=1,**kwargs): - ad = [0] + range(2,k+1) + ad = [0] + list(range(2,k+1)) super(ChangepointCross,self).__init__([GPy.kern.RBF(k,active_dims=ad,ARD=True,**kwargs),GPy.kern.RBF(k,active_dims=ad,ARD=True,**kwargs)],"changepoint") self.k = k @@ -157,7 +160,7 @@ def Kdiag(self,X): return self.kc * self.parts[0].Kdiag(X) * self.parts[1].Kdiag(X) def update_gradients_full(self, dL_dK, X, X2=None): - print "cp_cross update_gradients_full" + print("cp_cross update_gradients_full") k = self.K(X,X2)*dL_dK # try: for p in self.parts: @@ -208,7 +211,7 @@ def buildKernelPreprocess(self,): def buildKernel(self,): k = len(self.inputDimensions) - 1 - ad = [0] + range(2,k+1) + ad = [0] + list(range(2,k+1)) cp = self.cp if self.normalize: diff --git a/gp_growth/changepoint_multidim.py b/gp_growth/changepoint_multidim.py index 0215d03..6d2e9a3 100644 --- a/gp_growth/changepoint_multidim.py +++ b/gp_growth/changepoint_multidim.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import +from __future__ import print_function from gp_growth import factory from gp_growth.categorical import Categorical import GPy @@ -5,6 +7,7 @@ from GPy.core.parameterization.transformations import Logexp from GPy.kern import Kern, RBF import numpy as np +from six.moves import range CP_TOL = 1e-5 @@ -51,7 +54,7 @@ def __init__(self,input_dim=1,k1=None,k2=None,kc=None,xc=None,cpDim=None,*args,* self.cpDim = cpDim if self.cpDim is None: self.cpDim = 0 - self.otherDims = range(self.input_dim) + self.otherDims = list(range(self.input_dim)) self.otherDims.remove(self.cpDim) self.otherDims = np.array(self.otherDims) @@ -298,7 +301,7 @@ class ChangepointCross(GPy.kern._src.kern.CombinationKernel): """Kernel for points across a changepoint. K(X,Y) = kc * K1(X,cp) * K2(Y,cp) """ def __init__(self,cp,kc = 1,k=1,**kwargs): - ad = [0] + range(2,k+1) + ad = [0] + list(range(2,k+1)) super(ChangepointCross,self).__init__([GPy.kern.RBF(k,active_dims=ad,ARD=True,**kwargs),GPy.kern.RBF(k,active_dims=ad,ARD=True,**kwargs)],"changepoint") self.k = k @@ -325,7 +328,7 @@ def Kdiag(self,X): return self.kc * self.parts[0].Kdiag(X) * self.parts[1].Kdiag(X) def update_gradients_full(self, dL_dK, X, X2=None): - print "cp_cross update_gradients_full" + print("cp_cross update_gradients_full") k = self.K(X,X2)*dL_dK # try: for p in self.parts: @@ -376,7 +379,7 @@ def buildKernelPreprocess(self,): def buildKernel(self,): k = len(self.inputDimensions) - 1 - ad = [0] + range(2,k+1) + ad = [0] + list(range(2,k+1)) cp = self.cp if self.normalize: diff --git a/gp_growth/classical.py b/gp_growth/classical.py index a4e68e3..4da4e8d 100644 --- a/gp_growth/classical.py +++ b/gp_growth/classical.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import numpy as np from scipy.optimize import curve_fit @@ -26,7 +27,7 @@ def meanSquareError(time,od,model=gompertz): predict = np.array([gompertz(t,*popt) for t in time]) mse = 1./n * sum(((od - predict)**2),) - except RuntimeError, e: + except RuntimeError as e: mse = np.nan# 1./n * sum(((od - np.mean(od))**2),) return mse @@ -62,7 +63,7 @@ def optimize(time,od,model=gompertz,p0=None,cv=None): predict = np.array([model(t,*popt) for t in test_time]) mse = 1./test_od.shape[0] * sum(((test_od - predict)**2),) - except RuntimeError, e: + except RuntimeError as e: m,A,l = np.nan, np.nan, np.nan mse = np.nan diff --git a/gp_growth/data/filter.py b/gp_growth/data/filter.py index 53e6edf..be07475 100644 --- a/gp_growth/data/filter.py +++ b/gp_growth/data/filter.py @@ -1,4 +1,5 @@ -import growth +from __future__ import absolute_import +from . import growth class DataFilter(object): diff --git a/gp_growth/data/growth.py b/gp_growth/data/growth.py index edda9ce..713119e 100644 --- a/gp_growth/data/growth.py +++ b/gp_growth/data/growth.py @@ -1,9 +1,14 @@ +from __future__ import absolute_import +from __future__ import print_function import matplotlib.pyplot as plt import seaborn as sns import os, time, datetime import pandas as pd import numpy as np import itertools +import six +from six.moves import range +from six.moves import zip class GrowthData(object): @@ -64,7 +69,7 @@ def plot(self,output="",fixed_y=True,title_index=None,colorby=None,fixed_colors= condition = condition.tolist() if title_index is None: - title_index = range(len(condition)) + title_index = list(range(len(condition))) if newFig is None: newFig = True @@ -82,9 +87,9 @@ def plot(self,output="",fixed_y=True,title_index=None,colorby=None,fixed_colors= if colorby: categories = tuple(self.key[colorby].unique()) if colors is None: - color_lookup = dict(zip(categories,sns.color_palette("hls", len(categories)))) + color_lookup = dict(list(zip(categories,sns.color_palette("hls", len(categories))))) else: - color_lookup = dict(zip(categories,colors)) + color_lookup = dict(list(zip(categories,colors))) ncols = min(ncols,groups.ngroups) nrows = 1 @@ -97,12 +102,12 @@ def plot(self,output="",fixed_y=True,title_index=None,colorby=None,fixed_colors= plt.figure(figsize=(ncols*figWidth,nrows*figHeight)) if groups.ngroups > maxgroups: - print "Error, number of groups (%d) is greater than max (%d)." % (groups.ngroups,maxgroups) + print("Error, number of groups (%d) is greater than max (%d)." % (groups.ngroups,maxgroups)) return - groupList = list(groups.groups.iteritems()) + groupList = list(six.iteritems(groups.groups)) if groupOrder is None: - groupOrder = range(len(groupList)) + groupOrder = list(range(len(groupList))) for i,j in enumerate(groupOrder): val = groupList[j] @@ -119,10 +124,10 @@ def plot(self,output="",fixed_y=True,title_index=None,colorby=None,fixed_colors= if not fixed_colors: categories = tuple(temp_key[colorby].unique()) - color_lookup = dict(zip(categories,sns.color_palette("hls", len(categories)))) + color_lookup = dict(list(zip(categories,sns.color_palette("hls", len(categories))))) if colorby: - color_usage = dict(zip(categories,[False]*len(categories))) + color_usage = dict(list(zip(categories,[False]*len(categories)))) #print temp_key.Well,temp_data.head() @@ -132,7 +137,7 @@ def plot(self,output="",fixed_y=True,title_index=None,colorby=None,fixed_colors= ylim = (min(temp_data.min()),max(temp_data.max())) if colorby: - for well,temp_od in temp_data.iteritems(): + for well,temp_od in six.iteritems(temp_data): #remove na's select = ~temp_od.isnull() @@ -152,7 +157,7 @@ def plot(self,output="",fixed_y=True,title_index=None,colorby=None,fixed_colors= if len(categories) <= max_legend: plt.legend(loc="best",**legend_kwargs) else: - for well,temp_od in temp_data.iteritems(): + for well,temp_od in six.iteritems(temp_data): #remove na's select = ~temp_od.isnull() @@ -263,7 +268,7 @@ def _stack_data(self,meta=None,groupby=None,thinning=None): x = [] y = [] - for vals,ind in g.groups.iteritems(): + for vals,ind in six.iteritems(g.groups): select = reg[groupby] == vals temp = reg[select] x.append(reg[['time']+meta]) @@ -308,7 +313,7 @@ def select(self,**kwargs): selection = [True]*self.key.shape[0] - for k,v in kwargs.iteritems(): + for k,v in six.iteritems(kwargs): if k in key_copy.columns: # if v is np.nan: # if np.isnan(v): @@ -317,7 +322,7 @@ def select(self,**kwargs): if np.isnan(v): selection = np.all((selection,key_copy[k].isnull()),0) checked = True - except TypeError,e: + except TypeError as e: pass if not checked: selection = np.all((selection,key_copy[k]==v),0) @@ -368,7 +373,7 @@ def poly_scale(self,p,ind=None,groupby=None): time = self.data.time.iloc[:ind] - for k,index in group.groups.iteritems(): + for k,index in six.iteritems(group.groups): temp = self.data.loc[:,index] od = temp.values[:ind,:].ravel() @@ -380,13 +385,13 @@ def poly_scale(self,p,ind=None,groupby=None): def _parse_time(t): try: return time.struct_time(time.strptime(t,'%H:%M:%S')) - except ValueError, e: + except ValueError as e: try: t = time.strptime(t,'%d %H:%M:%S') t = list(t) t[2]+=1 return time.struct_time(t) - except ValueError, e: + except ValueError as e: raise Exception("Time format unknown") def _expand_data_row(r,thinning=0): diff --git a/gp_growth/factory.py b/gp_growth/factory.py index 65c86b2..291f2d3 100644 --- a/gp_growth/factory.py +++ b/gp_growth/factory.py @@ -1,9 +1,12 @@ # Author: Peter Tonner (peter.tonner@duke.edu) +from __future__ import absolute_import import GPy import numpy as np import pandas as pd -from categorical import Categorical +from .categorical import Categorical +import six +from six.moves import range class Factory(object): """Abstract factory class for defining how to contruct a GP. @@ -88,7 +91,7 @@ def convertCategoricalDimension(self,x): self.seenCategories[name] = {} for v in vals: if not v in self.seenCategories[name]: - self.seenCategories[name][v] = len(self.seenCategories[name].keys()) + self.seenCategories[name][v] = len(list(self.seenCategories[name].keys())) temp = np.zeros(x.shape) - 1 for i in range(temp.shape[0]): @@ -123,9 +126,9 @@ def buildInputFixed(self,size=None,convert=True,**kwargs): elif k.rstrip("_min") in inputColumns: dynamicMin = kwargs[k] - if len(fixedParams.keys()) < len(inputColumns) - 1: + if len(list(fixedParams.keys())) < len(inputColumns) - 1: raise ValueError("not enough fixed inputs!") - elif len(fixedParams.keys()) >= len(inputColumns): + elif len(list(fixedParams.keys())) >= len(inputColumns): raise ValueError("too many fixed inputs!") x = np.zeros((size,len(inputColumns)),dtype=object) @@ -190,7 +193,7 @@ def build(self,data,optimize=False,useSaved=False,renormalize=True,**kwargs): # get previously optimized hyperparameters if they exist if (not optimize) and useSaved: #(not name is None) and (name in self.savedParameters.index): select = [True] * self.savedParameters.shape[0] - for k,v in kwargs.iteritems(): + for k,v in six.iteritems(kwargs): select = np.all((select,self.savedParameters[k]==v),0) @@ -248,7 +251,7 @@ def plot(self,edata,gp=None,name=None,output=None,logged=None,colspan=None,rowsp for ind,col in enumerate(self.inputColumns()): plt.subplot2grid((nrows,colspan+1),[ind,0],colspan=colspan) - plt.plot(range(self.buildInput(edata).shape[0]),self.buildInput(edata)[:,ind]) + plt.plot(list(range(self.buildInput(edata).shape[0])),self.buildInput(edata)[:,ind]) plt.ylim(min(self.buildInput(edata)[:,ind])-.5,max(self.buildInput(edata)[:,ind])+.5) plt.xticks([]) plt.yticks([]) @@ -256,7 +259,7 @@ def plot(self,edata,gp=None,name=None,output=None,logged=None,colspan=None,rowsp # plot optical density plt.subplot2grid((nrows,colspan+1),[ind+1,0],colspan=colspan) - plt.plot(range(self.buildInput(edata).shape[0]),self.buildOutput(edata)) + plt.plot(list(range(self.buildInput(edata).shape[0])),self.buildOutput(edata)) plt.xticks([]) plt.yticks([]) plt.ylabel("OD",fontsize=15) @@ -377,7 +380,7 @@ def _get_params(gp,name=None,**kwargs): # params.append(p.values[ind]) ret["_".join([gp.likelihood.name,n])] = p.values[ind] - for k,v in kwargs.iteritems(): + for k,v in six.iteritems(kwargs): ret[k] = v #return pd.Series(params,index=param_names) diff --git a/gp_growth/gompertz.py b/gp_growth/gompertz.py index 14b4d5d..f1eac21 100644 --- a/gp_growth/gompertz.py +++ b/gp_growth/gompertz.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import numpy as np from scipy.optimize import curve_fit @@ -16,7 +17,7 @@ def meanSquareError(time,od): predict = np.array([gompertz(t,*popt) for t in time]) mse = 1./n * sum(((od - predict)**2),) - except RuntimeError, e: + except RuntimeError as e: mse = 1./n * sum(((od - np.mean(od))**2),) return mse @@ -37,7 +38,7 @@ def optimize(time,od): predict = np.array([gompertz(t,*popt) for t in time]) mse = 1./n * sum(((od - predict)**2),) - except RuntimeError, e: + except RuntimeError as e: m,A,l = np.nan, np.nan, np.nan mse = 1./n * sum(((od - np.mean(od))**2),) diff --git a/gp_growth/likelihoods.py b/gp_growth/likelihoods.py index 0f1d482..0e47ac7 100644 --- a/gp_growth/likelihoods.py +++ b/gp_growth/likelihoods.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import from GPy import likelihoods import numpy as np diff --git a/gp_growth/metric.py b/gp_growth/metric.py index 3551353..dd351c3 100644 --- a/gp_growth/metric.py +++ b/gp_growth/metric.py @@ -1,6 +1,8 @@ +from __future__ import absolute_import import numpy as np from scipy import stats -from data import growth +from .data import growth +from six.moves import zip ######### # utility functions diff --git a/gp_growth/models.py b/gp_growth/models.py index 6fa2529..595f36e 100644 --- a/gp_growth/models.py +++ b/gp_growth/models.py @@ -1,7 +1,8 @@ +from __future__ import absolute_import import numpy as np from GPy.core import GP from GPy import likelihoods,kern, util, models -from likelihoods import MixedNoise_twoSide +from .likelihoods import MixedNoise_twoSide class GPHeteroscedasticRegression_twoSided(GP): """ diff --git a/gp_growth/normal.py b/gp_growth/normal.py index d5d1112..25e2057 100644 --- a/gp_growth/normal.py +++ b/gp_growth/normal.py @@ -1,6 +1,8 @@ +from __future__ import absolute_import import numpy as np import warnings import scipy +from six.moves import range class MultivariateNormal(object): @@ -46,7 +48,7 @@ def plot(self,x=None,confidence=None,label="",color="b",alpha=.1): import matplotlib.pyplot as plt if x is None: - x = range(self.n) + x = list(range(self.n)) if confidence is None: confidence = .95 diff --git a/gp_growth/plot.py b/gp_growth/plot.py index adf6810..d6392ea 100644 --- a/gp_growth/plot.py +++ b/gp_growth/plot.py @@ -1,11 +1,15 @@ +from __future__ import absolute_import +from __future__ import print_function import matplotlib.pyplot as plt import numpy as np import itertools +from six.moves import range +from six.moves import zip def multivariate_gaussian(x,mean,var=None,line_col="g",fill_col="g",label="",alpha=.5): if not var is None and not mean.shape == var.shape: - print "Error mean and variance not same shape (",str(mean.shape),",",str(var.shape),")" + print("Error mean and variance not same shape (",str(mean.shape),",",str(var.shape),")") return plt.plot(x,mean,c=line_col,label=label) @@ -53,7 +57,7 @@ def scatterplot_matrix(data, names, **kwargs): ha='center', va='center') # Turn on the proper x or y axes ticks. - for i, j in zip(range(numvars), itertools.cycle((-1, 0))): + for i, j in zip(list(range(numvars)), itertools.cycle((-1, 0))): axes[j,i].xaxis.set_visible(True) axes[i,j].yaxis.set_visible(True) diff --git a/gp_growth/stats.py b/gp_growth/stats.py index 5b3ab31..21dd903 100644 --- a/gp_growth/stats.py +++ b/gp_growth/stats.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import numpy as np def mvn_chisquare(x,mu,cov): diff --git a/gp_growth/storage/datatypes.py b/gp_growth/storage/datatypes.py index 2fb1a72..36a8535 100644 --- a/gp_growth/storage/datatypes.py +++ b/gp_growth/storage/datatypes.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import from tables import * METADATA_TYPES = ["str","int","float"] diff --git a/gp_growth/storage/mongo.py b/gp_growth/storage/mongo.py index dfe9827..91aa3ff 100644 --- a/gp_growth/storage/mongo.py +++ b/gp_growth/storage/mongo.py @@ -1,6 +1,8 @@ +from __future__ import absolute_import from pymongo import MongoClient import pandas as pd from ..data.growth import GrowthData +from six.moves import range class MongoDB(): @@ -188,5 +190,5 @@ def getExperimentalDesigns(self,name,**kwargs): if not w[name] in ret: ret[w[name]] = None - return ret.keys() + return list(ret.keys()) diff --git a/gp_growth/storage/searcher.py b/gp_growth/storage/searcher.py index f7e1896..45ed8f1 100644 --- a/gp_growth/storage/searcher.py +++ b/gp_growth/storage/searcher.py @@ -1,4 +1,6 @@ -import template +from __future__ import absolute_import +from __future__ import print_function +from . import template class Searcher(template.Template): """A utility class for defining a templated search and preprocessing of data.""" @@ -16,7 +18,7 @@ def search(self,verbose=False,**kwargs): searchKwargs[k] = kwargs[k] if verbose: - print "Searching on",searchKwargs + print("Searching on",searchKwargs) data = self.db.getData(**searchKwargs) data = self.process(data) diff --git a/gp_growth/storage/storage.py b/gp_growth/storage/storage.py index f75793e..84e822b 100644 --- a/gp_growth/storage/storage.py +++ b/gp_growth/storage/storage.py @@ -1,9 +1,14 @@ #from tables import * +from __future__ import absolute_import +from __future__ import print_function from numpy import * import pandas as pd import os,sys from ..data.growth import GrowthData -from datatypes import * +from .datatypes import * +import six +from six.moves import range +from six.moves import input def open(fname,overwrite=False,warn=True): global h5file @@ -12,8 +17,8 @@ def open(fname,overwrite=False,warn=True): h5file = open_file(fname, mode = "r+", title = "Growth data") else: if os.path.isfile(fname) and warn: - print fname,"already exists, overwrite?" - select = raw_input() + print(fname,"already exists, overwrite?") + select = input() if select.lower() != "y": return h5file = open_file(fname, mode = "w", title = "Growth data") @@ -48,9 +53,9 @@ def createData(plate,data,plate_type=PLATE_TYPES[0]): raise NotImplementedError("No implmentation for %s" % plate_type) row = table.row - for i in xrange(data.shape[0]): + for i in range(data.shape[0]): row['time'] = data.iloc[i,0] - for j in xrange(1,data.shape[1]): + for j in range(1,data.shape[1]): row['well%d'%(j-1)] = data.iloc[i,j] row.append() @@ -87,7 +92,7 @@ def createMetadata(plate,meta,types={}): elif meta_row['type'] == METADATA_TYPES.index("float"): try: meta_row['val'] = float(meta_row['val']) - except ValueError, e: + except ValueError as e: meta_row['val'] = nan # else: # meta_row['type'] = METADATA_TYPES[0] @@ -209,11 +214,11 @@ def getWells(plates=None,verbose=False,**kwargs): hits[ed['name']] = hits[ed['name']].union(w_temp) # did we hit everything? - success = all([len(v)>0 for k,v in hits.iteritems()]) + success = all([len(v)>0 for k,v in six.iteritems(hits)]) # add the intersection of all wells if success: - keys = hits.keys() + keys = list(hits.keys()) if len(keys) > 0: w = hits[keys[0]] for k in keys[1:]: @@ -228,7 +233,7 @@ def getWells(plates=None,verbose=False,**kwargs): if verbose: perc = (100.*count)/len(plates) - print "\rFinding wells...%.2f%% (%d wells)"% (perc,sum([len(x) for y,x in wells])), + print("\rFinding wells...%.2f%% (%d wells)"% (perc,sum([len(x) for y,x in wells])), end=' ') sys.stdout.flush() return wells @@ -245,7 +250,7 @@ def getData(plates=None,verbose=False,logged=None,subtract=None,*args,**kwargs): wells = getWells(plates,verbose,**kwargs) if verbose: - print "" + print("") # return _get_data(wells) data = _generate_data(wells,verbose) @@ -294,7 +299,7 @@ def _get_experimentalDesign(wells,verbose=False): elif row['type'] == METADATA_TYPES.index("float"): temp.iloc[where([w in array_indexes for w in w_ind])[0],col_index] = temp.iloc[where([w in array_indexes for w in w_ind])[0],col_index].astype(float) - new_indexes = range(ind,ind+len(w_ind)) + new_indexes = list(range(ind,ind+len(w_ind))) temp['well'] = new_indexes temp['batch'] = plate._v_name @@ -309,7 +314,7 @@ def _get_experimentalDesign(wells,verbose=False): if verbose: perc = (100.*count)/len(wells) - print "\rBuilding meta table...%.2f%%"%perc, + print("\rBuilding meta table...%.2f%%"%perc, end=' ') sys.stdout.flush() return key @@ -352,13 +357,13 @@ def _get_data(wells,verbose=False): if verbose: perc = (100.*count)/len(wells) - print "\rBuilding data table...%.2f%%"%perc, + print("\rBuilding data table...%.2f%%"%perc, end=' ') sys.stdout.flush() # print plate._v_name,data.shape,temp.shape if verbose: - print "" + print("") key = _get_experimentalDesign(wells,verbose) # count = 0 diff --git a/gp_growth/storage/template.py b/gp_growth/storage/template.py index 7393baa..810363c 100644 --- a/gp_growth/storage/template.py +++ b/gp_growth/storage/template.py @@ -1,4 +1,6 @@ +from __future__ import absolute_import from copy import copy +from six.moves import range def replaceTemplateStrings(rep,**kwargs): rep = copy(rep) diff --git a/gp_growth/testStatistic.py b/gp_growth/testStatistic.py index 4fbb508..bd3e095 100644 --- a/gp_growth/testStatistic.py +++ b/gp_growth/testStatistic.py @@ -1,6 +1,9 @@ +from __future__ import absolute_import import numpy as np import scipy.stats import normal, GPy +from six.moves import range +from six.moves import zip class TestStatistic(object): @@ -8,9 +11,9 @@ class TestStatistic(object): def buildTestMatrix(n): A = np.zeros((n-1,2*n)) A[:,0] = 1 - A[range(n-1),range(1,n)] = -1 + A[list(range(n-1)),list(range(1,n))] = -1 A[:,n] = -1 - A[range(n-1),n+np.arange(1,n)] = 1 + A[list(range(n-1)),n+np.arange(1,n)] = 1 return A diff --git a/gp_growth/utils.py b/gp_growth/utils.py index 959cb03..93cefb0 100644 --- a/gp_growth/utils.py +++ b/gp_growth/utils.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import +from __future__ import print_function import numpy as np import pandas as pd import os @@ -6,6 +8,9 @@ import datetime import re import matplotlib.pyplot as plt +import six +from six.moves import range +from six.moves import zip ######################################################## ## Plotting functions @@ -14,7 +19,7 @@ def gauss_plot(x,mean,var=None,line_col="g",fill_col="g",label=""): if not var is None and not mean.shape == var.shape: - print "Error mean and variance not same shape (",str(mean.shape),",",str(var.shape),")" + print("Error mean and variance not same shape (",str(mean.shape),",",str(var.shape),")") return plt.plot(x,mean,c=line_col,label=label) @@ -74,7 +79,7 @@ def load_iron(data,mode=1,subsample=-1): cols['replicate'] = 0 for exp,temp in group: #temp['replicate'] = range(temp.shape[0]) - cols.loc[group.groups[exp],'replicate'] = range(temp.shape[0]) + cols.loc[group.groups[exp],'replicate'] = list(range(temp.shape[0])) #print exp #print temp @@ -132,7 +137,7 @@ def metadata_condition_parse(cond,defaultValue=0): cond = cond.str.rstrip(suffix) split = cond.str.split(delim) - print split + print(split) ret = pd.DataFrame(split.apply(lambda x: delim.join(x[:-1]))) ret = ret.rename(columns={'Condition':'media'}) ret[suffix] = split.apply( lambda x: float(x[-1].rstrip(suffix)) if len(x) > 1 and suffix in x[-1] else defaultValue ) @@ -163,7 +168,7 @@ def expand_row(time,od,params=None): ret['od'] = od.values if not params is None: - for c,v in params.iteritems(): + for c,v in six.iteritems(params): ret[c] = v return ret @@ -180,13 +185,13 @@ def expand_data_row(r): def parse_time(t): try: return time.struct_time(time.strptime(t,'%H:%M:%S')) - except ValueError, e: + except ValueError as e: try: t = time.strptime(t,'%d %H:%M:%S') t = list(t) t[2]+=1 return time.struct_time(t) - except ValueError, e: + except ValueError as e: raise Exception("Time format unknown") def convert_encoding(f,encoding,outcoding): @@ -198,8 +203,8 @@ def load_bioscreen(folder,convert=False,removeBlank=True,removeEmpty=True): files = os.listdir(folder) - key_file = filter(lambda x: "key.xlsx" in x, files) - data_file = filter(lambda x: ".csv" in x, files) + key_file = [x for x in files if "key.xlsx" in x] + data_file = [x for x in files if ".csv" in x] assert len(data_file)==1, "No data file or more than one data file: "+ str(data_file) assert len(key_file)==1, "No key file or more than one key file: "+ str(key_file) @@ -282,9 +287,9 @@ def plot_bioscreen(key,data,title_index=[0],fixed_y=True,output=""): plt.figure(figsize=(5*4,groups.ngroups/5*4)) - for i,val in enumerate(groups.groups.iteritems()): + for i,val in enumerate(six.iteritems(groups.groups)): k,ind = val - print k,ind + print(k,ind) ax = plt.subplot(groups.ngroups/5+1,5,i+1) temp_key = groups.get_group(k) temp_data = od.ix[:,temp_key.Well.astype(str)] @@ -385,11 +390,11 @@ def expand_bioscreen(key,data): return expand_data def load_all_bioscreen(folders,convert=False): - print folders[0] + print(folders[0]) key,data = load_bioscreen(folders[0],convert) expand_data = expand_bioscreen(key,data) for folder in folders[1:]: - print folder + print(folder) key,data = load_bioscreen(folder,convert) expand_data = expand_data.append(expand_bioscreen(key,data)) @@ -410,7 +415,7 @@ def convert_bioscreen_key(f,condition,**kwargs): key['Heat Shift Time'] = 16 key['Media'] = np.where(key.Condition.str.contains("mev"),"CM+mev","CM") - for k,v in kwargs.iteritems(): + for k,v in six.iteritems(kwargs): key[k] = v key.to_csv(f.rstrip(".xls")+".csv",index=False) diff --git a/halo_serial.py b/halo_serial.py index 1ba154a..f3e5adf 100755 --- a/halo_serial.py +++ b/halo_serial.py @@ -1,9 +1,12 @@ +from __future__ import absolute_import +from __future__ import print_function import pandas as pd from gp_growth import factory,metric,gompertz from gp_growth.data.growth import GrowthData from gp_growth.storage import mongo import os import numpy as np +from six.moves import range # from matplotlib.pyplot import * # from scipy.optimize import curve_fit # from lib.gompertz import gompertz @@ -41,7 +44,7 @@ temp = GrowthData(temp,params) - print p,i,temp.data.shape + print(p,i,temp.data.shape) edata = temp.getData("gp") @@ -64,7 +67,7 @@ params['ss_tot'] = np.sum((edata.od - edata.od.mean())**2)/edata.shape[0] - print params + print(params) if output.shape[0]==0: # output = pd.DataFrame(params) diff --git a/notebooks/example.py b/notebooks/example.py new file mode 100644 index 0000000..1db5e87 --- /dev/null +++ b/notebooks/example.py @@ -0,0 +1,48 @@ +import pandas as pd +import numpy as np +import scipy +import seaborn as sns +from scipy.optimize import curve_fit +from gp_growth import gompertz, factory, metric,plot, normal +from gp_growth.storage import mongo +import GPy +import matplotlib.pyplot as plt +import bgreat + + +data = pd.read_csv("data/example/data.csv",index_col=0) +meta = pd.read_csv("data/example/meta.csv") + +meta.head() + +meta.shape +data.shape +assert data.shape[1] == meta.shape[0] + +parent = 'parent' +control = 'control' +condition = 'stress' + +meta['strain-regression'] = (meta.strain!=parent).astype(int) +meta['condition'] = (meta.Condition!=control).astype(int) +meta['interaction'] = meta['strain-regression']*meta.condition + +plt.figure(figsize=(16,6)) +plt.suptitle("Before Formatting") +plt.subplot(121) +plt.title('control') + +bgreat.plotSamples(data.values[:,np.where((meta.condition==0)& (meta.strain==parent))[0]],color='k',label='parent') +bgreat.plotSamples(data.values[:,np.where((meta.condition==0)& (meta.strain!=parent))[0]],color='g',label='mutant') +plt.legend(loc='best') +plt.ylim(0.05,.7) +plt.subplot(122) +plt.title('stress') +bgreat.plotSamples(data.values[:,np.where((meta.condition==1)& + (meta.strain==parent))[0]],color='k',label='parent') +bgreat.plotSamples(data.values[:,np.where((meta.condition==1)& + (meta.strain!=parent))[0]],color='g',label='mutant') +plt.legend(loc='best') +plt.ylim(0.05,.7) + +plt.show() \ No newline at end of file diff --git a/notebooks/example3.py b/notebooks/example3.py new file mode 100644 index 0000000..d1a4316 --- /dev/null +++ b/notebooks/example3.py @@ -0,0 +1,113 @@ +import matplotlib.pyplot as plt +import numpy as np +import bgreat + + + +import pandas as pd +data = pd.read_csv("data/example/data.csv",index_col=0) +meta = pd.read_csv("data/example/meta.csv") + +data.head() + +meta.head() + +assert data.shape[1] == meta.shape[0] + +parent = 'parent' +control = 'control' +condition = 'stress' + +meta['strain-regression'] = (meta.strain!=parent).astype(int) +meta['condition'] = (meta.Condition!=control).astype(int) +meta['interaction'] = meta['strain-regression']*meta.condition + +plt.figure(figsize=(16,6)) +plt.suptitle("Before Formatting") + +plt.subplot(121) +plt.title('control') +bgreat.plotSamples(data.values[:,np.where((meta.condition==0)& (meta.strain==parent))[0]],color='k',label=parent) +bgreat.plotSamples(data.values[:,np.where((meta.condition==0)& (meta.strain!=parent))[0]],color='g',label='mutant') +plt.legend(loc='best') +# plt.ylim(0.05,.7) + +plt.subplot(122) +plt.title('stress') +bgreat.plotSamples(data.values[:,np.where((meta.condition==1)& (meta.strain==parent))[0]],color='k',label=parent) +bgreat.plotSamples(data.values[:,np.where((meta.condition==1)& (meta.strain!=parent))[0]],color='g',label='mutant') +plt.legend(loc='best') +plt.show() + + + +data = data.iloc[4:,:] +data = np.log2(data) +g = meta.groupby(['strain','Condition']) +for k,ind in enumerate(g.groups): + print(k) + print(ind) + data.iloc[:,g.groups[ind]] -= data.iloc[0,g.groups[ind]].mean() + + + +plt.figure(figsize=(16,6)) +plt.suptitle("After Formatting") + +plt.subplot(121) +plt.title('control') +bgreat.plotSamples(data.values[:,np.where((meta.condition==0)& (meta.strain==parent))[0]],color='k',label=parent) +bgreat.plotSamples(data.values[:,np.where((meta.condition==0)& (meta.strain!=parent))[0]],color='g',label='mutant') +plt.legend(loc='best') +plt.ylim(-.4,2.4) + +plt.subplot(122) +plt.title('stress') +bgreat.plotSamples(data.values[:,np.where((meta.condition==1)& (meta.strain==parent))[0]],color='k',label='parent') +bgreat.plotSamples(data.values[:,np.where((meta.condition==1)& (meta.strain!=parent))[0]],color='g',label='mutant') +plt.legend(loc='best') +plt.ylim(-.4,2.4) +plt.show() + + + + +##import bgreat +##bgreat.setGlobals(_data=data, _meta=meta) +##bgreat.setGlobals(_parent=parent,_control=control) + +bgreat.setGlobals(_data=data,_meta=meta,_parent=parent,_control=control,_condition=None) + +mutants = ['mutant'] + +results = bgreat.testMutantControl(mutants,numPerm=20,dims=['time','strain-regression']) + +results + +plt.hist(results.permuted.values[0]) +plt.show() + +gp = bgreat.buildGP(bgreat.selectStrain('mutant')) + +gp + +xpred = np.zeros((100,2)) +xpred[:50,0] = np.linspace(data.index.min(),data.index.max()) +xpred[50:,0] = np.linspace(data.index.min(),data.index.max()) + +xpred[50:,1] = 1 + +mu,cov = gp.predict(xpred,full_cov=True) +var = np.diag(cov) +mu = mu[:,0] + +plt.figure(figsize=(10,5)) + +plt.plot(xpred[:50,0],mu[:50],label='parent'); +plt.fill_between(xpred[:50,0],mu[:50]-2*np.sqrt(var[:50]),mu[:50]+2*np.sqrt(var[:50]),alpha=.1) + +plt.plot(xpred[:50,0],mu[50:],label='mutant') +plt.fill_between(xpred[:50,0],mu[50:]-2*np.sqrt(var[50:]),mu[50:]+2*np.sqrt(var[50:]),alpha=.1) + +plt.legend(fontsize=20) +plt.show() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 89bd5ed..82e6453 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,44 +1,10 @@ -GPy==1.2.1 -Jinja2==2.7.3 -MarkupSafe==0.23 -Pygments==2.0.2 -Sphinx==1.3.1 -alabaster==0.7.3 -argparse==1.2.1 -backports.ssl-match-hostname==3.4.0.2 -certifi==14.05.14 -decorator==4.0.10 -docutils==0.12 -ipython==3.1.0 -jdcal==1.2 -jsonschema==2.4.0 -matplotlib==1.4.3 -mistune==0.5.1 -mock==1.0.1 -nose==1.3.6 -numexpr==2.4.2 -numpy==1.9.2 -numpydoc==0.5 -openpyxl==2.2.2 -pandas==0.16.0 -paramz==0.5.6 -ptyprocess==0.4 -pymongo==3.0.3 -pyparsing==2.0.3 -python-dateutil==2.4.2 -pytz==2015.2 -pyzmq==14.6.0 -requests==2.6.2 -runipy==0.1.3 -scikit-learn==0.16.1 -scipy==0.15.1 -seaborn==0.6.0 -six==1.9.0 -snowballstemmer==1.2.0 -sphinx-rtd-theme==0.1.7 -terminado==0.5 -tornado==4.1 -weave==0.15.0 -wsgiref==0.1.2 -xlrd==0.9.3 -xlwt==1.0.0 +gpy==1.9.8 +matplotlib==3.1.1 +numpy==1.17.2 +pandas==0.25.1 +pygments==2.4.2 +pymongo==3.9.0 +scikit-learn==0.21.3 +scipy==1.3.1 +seaborn==0.9.0 +sympy==1.4 diff --git a/setup_data.py b/setup_data.py index 0f534b4..dc4e4b7 100644 --- a/setup_data.py +++ b/setup_data.py @@ -1,8 +1,11 @@ +from __future__ import absolute_import +from __future__ import print_function import os,re,sys import getopt import numpy as np from gp_growth import utils import pandas as pd +from six.moves import range hsal_dir = "data/raw/hsal_ko" plates = os.listdir(hsal_dir) @@ -12,14 +15,14 @@ for plate in plates: - print plate + print(plate) try: key,data = utils.load_bioscreen(os.path.join(hsal_dir,plate),removeBlank=False) - except ValueError, e: + except ValueError as e: key,data = utils.load_bioscreen(os.path.join(hsal_dir,plate),convert=True,removeBlank=False) - print key.shape + print(key.shape) if meta.shape[0] > 0: meta = pd.concat((meta,key)) @@ -28,9 +31,9 @@ meta = key od = data - print meta.shape,od.shape + print(meta.shape,od.shape) -meta.index=range(meta.shape[0]) +meta.index=list(range(meta.shape[0])) od.columns=['time']+meta.index.tolist() combined = pd.merge(meta,od.T,left_index=True,right_index=True)