diff --git a/thinkplot/thinkplot.py b/thinkplot/thinkplot.py deleted file mode 100644 index e68f24670..000000000 --- a/thinkplot/thinkplot.py +++ /dev/null @@ -1,890 +0,0 @@ -"""This file contains code for use with "Think Stats", -by Allen B. Downey, available from greenteapress.com - -Copyright 2014 Allen B. Downey -License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html -""" - -from __future__ import print_function - -import math -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -import warnings - -# customize some matplotlib attributes -#matplotlib.rc('figure', figsize=(4, 3)) - -#matplotlib.rc('font', size=14.0) -#matplotlib.rc('axes', labelsize=22.0, titlesize=22.0) -#matplotlib.rc('legend', fontsize=20.0) - -#matplotlib.rc('xtick.major', size=6.0) -#matplotlib.rc('xtick.minor', size=3.0) - -#matplotlib.rc('ytick.major', size=6.0) -#matplotlib.rc('ytick.minor', size=3.0) - - -class _Brewer(object): - """Encapsulates a nice sequence of colors. - - Shades of blue that look good in color and can be distinguished - in grayscale (up to a point). - - Borrowed from http://colorbrewer2.org/ - """ - color_iter = None - - colors = ['#f7fbff', '#deebf7', '#c6dbef', - '#9ecae1', '#6baed6', '#4292c6', - '#2171b5','#08519c','#08306b'][::-1] - - # lists that indicate which colors to use depending on how many are used - which_colors = [[], - [1], - [1, 3], - [0, 2, 4], - [0, 2, 4, 6], - [0, 2, 3, 5, 6], - [0, 2, 3, 4, 5, 6], - [0, 1, 2, 3, 4, 5, 6], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7, 8], - ] - - current_figure = None - - @classmethod - def Colors(cls): - """Returns the list of colors. - """ - return cls.colors - - @classmethod - def ColorGenerator(cls, num): - """Returns an iterator of color strings. - - n: how many colors will be used - """ - for i in cls.which_colors[num]: - yield cls.colors[i] - raise StopIteration('Ran out of colors in _Brewer.') - - @classmethod - def InitIter(cls, num): - """Initializes the color iterator with the given number of colors.""" - cls.color_iter = cls.ColorGenerator(num) - fig = plt.gcf() - cls.current_figure = fig - - @classmethod - def ClearIter(cls): - """Sets the color iterator to None.""" - cls.color_iter = None - cls.current_figure = None - - @classmethod - def GetIter(cls, num): - """Gets the color iterator.""" - fig = plt.gcf() - if fig != cls.current_figure: - cls.InitIter(num) - cls.current_figure = fig - - if cls.color_iter is None: - cls.InitIter(num) - - return cls.color_iter - - -def _UnderrideColor(options): - """If color is not in the options, chooses a color. - """ - if 'color' in options: - return options - - # get the current color iterator; if there is none, init one - color_iter = _Brewer.GetIter(5) - - try: - options['color'] = next(color_iter) - except StopIteration: - # if you run out of colors, initialize the color iterator - # and try again - warnings.warn('Ran out of colors. Starting over.') - _Brewer.ClearIter() - _UnderrideColor(options) - - return options - - -def PrePlot(num=None, rows=None, cols=None): - """Takes hints about what's coming. - - num: number of lines that will be plotted - rows: number of rows of subplots - cols: number of columns of subplots - """ - if num: - _Brewer.InitIter(num) - - if rows is None and cols is None: - return - - if rows is not None and cols is None: - cols = 1 - - if cols is not None and rows is None: - rows = 1 - - # resize the image, depending on the number of rows and cols - size_map = {(1, 1): (8, 6), - (1, 2): (12, 6), - (1, 3): (12, 6), - (1, 4): (12, 5), - (1, 5): (12, 4), - (2, 2): (10, 10), - (2, 3): (16, 10), - (3, 1): (8, 10), - (4, 1): (8, 12), - } - - if (rows, cols) in size_map: - fig = plt.gcf() - fig.set_size_inches(*size_map[rows, cols]) - - # create the first subplot - if rows > 1 or cols > 1: - ax = plt.subplot(rows, cols, 1) - global SUBPLOT_ROWS, SUBPLOT_COLS - SUBPLOT_ROWS = rows - SUBPLOT_COLS = cols - else: - ax = plt.gca() - - return ax - - -def SubPlot(plot_number, rows=None, cols=None, **options): - """Configures the number of subplots and changes the current plot. - - rows: int - cols: int - plot_number: int - options: passed to subplot - """ - rows = rows or SUBPLOT_ROWS - cols = cols or SUBPLOT_COLS - return plt.subplot(rows, cols, plot_number, **options) - - -def _Underride(d, **options): - """Add key-value pairs to d only if key is not in d. - - If d is None, create a new dictionary. - - d: dictionary - options: keyword args to add to d - """ - if d is None: - d = {} - - for key, val in options.items(): - d.setdefault(key, val) - - return d - - -def Clf(): - """Clears the figure and any hints that have been set.""" - global LOC - LOC = None - _Brewer.ClearIter() - plt.clf() - fig = plt.gcf() - fig.set_size_inches(8, 6) - - -def Figure(**options): - """Sets options for the current figure.""" - _Underride(options, figsize=(6, 8)) - plt.figure(**options) - - -def Plot(obj, ys=None, style='', **options): - """Plots a line. - - Args: - obj: sequence of x values, or Series, or anything with Render() - ys: sequence of y values - style: style string passed along to plt.plot - options: keyword args passed to plt.plot - """ - options = _UnderrideColor(options) - label = getattr(obj, 'label', '_nolegend_') - options = _Underride(options, linewidth=3, alpha=0.7, label=label) - - xs = obj - if ys is None: - if hasattr(obj, 'Render'): - xs, ys = obj.Render() - if isinstance(obj, pd.Series): - ys = obj.values - xs = obj.index - - if ys is None: - plt.plot(xs, style, **options) - else: - plt.plot(xs, ys, style, **options) - - -def Vlines(xs, y1, y2, **options): - """Plots a set of vertical lines. - - Args: - xs: sequence of x values - y1: sequence of y values - y2: sequence of y values - options: keyword args passed to plt.vlines - """ - options = _UnderrideColor(options) - options = _Underride(options, linewidth=1, alpha=0.5) - plt.vlines(xs, y1, y2, **options) - - -def Hlines(ys, x1, x2, **options): - """Plots a set of horizontal lines. - - Args: - ys: sequence of y values - x1: sequence of x values - x2: sequence of x values - options: keyword args passed to plt.vlines - """ - options = _UnderrideColor(options) - options = _Underride(options, linewidth=1, alpha=0.5) - plt.hlines(ys, x1, x2, **options) - - -def axvline(x, **options): - """Plots a vertical line. - - Args: - x: x location - options: keyword args passed to plt.axvline - """ - options = _UnderrideColor(options) - options = _Underride(options, linewidth=1, alpha=0.5) - plt.axvline(x, **options) - - -def axhline(y, **options): - """Plots a horizontal line. - - Args: - y: y location - options: keyword args passed to plt.axhline - """ - options = _UnderrideColor(options) - options = _Underride(options, linewidth=1, alpha=0.5) - plt.axhline(y, **options) - - -def tight_layout(**options): - """Adjust subplots to minimize padding and margins. - """ - options = _Underride(options, - wspace=0.1, hspace=0.1, - left=0, right=1, - bottom=0, top=1) - plt.tight_layout() - plt.subplots_adjust(**options) - - -def FillBetween(xs, y1, y2=None, where=None, **options): - """Fills the space between two lines. - - Args: - xs: sequence of x values - y1: sequence of y values - y2: sequence of y values - where: sequence of boolean - options: keyword args passed to plt.fill_between - """ - options = _UnderrideColor(options) - options = _Underride(options, linewidth=0, alpha=0.5) - plt.fill_between(xs, y1, y2, where, **options) - - -def Bar(xs, ys, **options): - """Plots a line. - - Args: - xs: sequence of x values - ys: sequence of y values - options: keyword args passed to plt.bar - """ - options = _UnderrideColor(options) - options = _Underride(options, linewidth=0, alpha=0.6) - plt.bar(xs, ys, **options) - - -def Scatter(xs, ys=None, **options): - """Makes a scatter plot. - - xs: x values - ys: y values - options: options passed to plt.scatter - """ - options = _Underride(options, color='blue', alpha=0.2, - s=30, edgecolors='none') - - if ys is None and isinstance(xs, pd.Series): - ys = xs.values - xs = xs.index - - plt.scatter(xs, ys, **options) - - -def HexBin(xs, ys, **options): - """Makes a scatter plot. - - xs: x values - ys: y values - options: options passed to plt.scatter - """ - options = _Underride(options, cmap=matplotlib.cm.Blues) - plt.hexbin(xs, ys, **options) - - -def Pdf(pdf, **options): - """Plots a Pdf, Pmf, or Hist as a line. - - Args: - pdf: Pdf, Pmf, or Hist object - options: keyword args passed to plt.plot - """ - low, high = options.pop('low', None), options.pop('high', None) - n = options.pop('n', 101) - xs, ps = pdf.Render(low=low, high=high, n=n) - options = _Underride(options, label=pdf.label) - Plot(xs, ps, **options) - - -def Pdfs(pdfs, **options): - """Plots a sequence of PDFs. - - Options are passed along for all PDFs. If you want different - options for each pdf, make multiple calls to Pdf. - - Args: - pdfs: sequence of PDF objects - options: keyword args passed to plt.plot - """ - for pdf in pdfs: - Pdf(pdf, **options) - - -def Hist(hist, **options): - """Plots a Pmf or Hist with a bar plot. - - The default width of the bars is based on the minimum difference - between values in the Hist. If that's too small, you can override - it by providing a width keyword argument, in the same units - as the values. - - Args: - hist: Hist or Pmf object - options: keyword args passed to plt.bar - """ - # find the minimum distance between adjacent values - xs, ys = hist.Render() - - # see if the values support arithmetic - try: - xs[0] - xs[0] - except TypeError: - # if not, replace values with numbers - labels = [str(x) for x in xs] - xs = np.arange(len(xs)) - plt.xticks(xs+0.5, labels) - - if 'width' not in options: - try: - options['width'] = 0.9 * np.diff(xs).min() - except TypeError: - warnings.warn("Hist: Can't compute bar width automatically." - "Check for non-numeric types in Hist." - "Or try providing width option." - ) - - options = _Underride(options, label=hist.label) - options = _Underride(options, align='center') - if options['align'] == 'left': - options['align'] = 'edge' - elif options['align'] == 'right': - options['align'] = 'edge' - options['width'] *= -1 - - Bar(xs, ys, **options) - - -def Hists(hists, **options): - """Plots two histograms as interleaved bar plots. - - Options are passed along for all PMFs. If you want different - options for each pmf, make multiple calls to Pmf. - - Args: - hists: list of two Hist or Pmf objects - options: keyword args passed to plt.plot - """ - for hist in hists: - Hist(hist, **options) - - -def Pmf(pmf, **options): - """Plots a Pmf or Hist as a line. - - Args: - pmf: Hist or Pmf object - options: keyword args passed to plt.plot - """ - xs, ys = pmf.Render() - low, high = min(xs), max(xs) - - width = options.pop('width', None) - if width is None: - try: - width = np.diff(xs).min() - except TypeError: - warnings.warn("Pmf: Can't compute bar width automatically." - "Check for non-numeric types in Pmf." - "Or try providing width option.") - points = [] - - lastx = np.nan - lasty = 0 - for x, y in zip(xs, ys): - if (x - lastx) > 1e-5: - points.append((lastx, 0)) - points.append((x, 0)) - - points.append((x, lasty)) - points.append((x, y)) - points.append((x+width, y)) - - lastx = x + width - lasty = y - points.append((lastx, 0)) - pxs, pys = zip(*points) - - align = options.pop('align', 'center') - if align == 'center': - pxs = np.array(pxs) - width/2.0 - if align == 'right': - pxs = np.array(pxs) - width - - options = _Underride(options, label=pmf.label) - Plot(pxs, pys, **options) - - -def Pmfs(pmfs, **options): - """Plots a sequence of PMFs. - - Options are passed along for all PMFs. If you want different - options for each pmf, make multiple calls to Pmf. - - Args: - pmfs: sequence of PMF objects - options: keyword args passed to plt.plot - """ - for pmf in pmfs: - Pmf(pmf, **options) - - -def Diff(t): - """Compute the differences between adjacent elements in a sequence. - - Args: - t: sequence of number - - Returns: - sequence of differences (length one less than t) - """ - diffs = [t[i+1] - t[i] for i in range(len(t)-1)] - return diffs - - -def Cdf(cdf, complement=False, transform=None, **options): - """Plots a CDF as a line. - - Args: - cdf: Cdf object - complement: boolean, whether to plot the complementary CDF - transform: string, one of 'exponential', 'pareto', 'weibull', 'gumbel' - options: keyword args passed to plt.plot - - Returns: - dictionary with the scale options that should be passed to - Config, Show or Save. - """ - xs, ps = cdf.Render() - xs = np.asarray(xs) - ps = np.asarray(ps) - - scale = dict(xscale='linear', yscale='linear') - - for s in ['xscale', 'yscale']: - if s in options: - scale[s] = options.pop(s) - - if transform == 'exponential': - complement = True - scale['yscale'] = 'log' - - if transform == 'pareto': - complement = True - scale['yscale'] = 'log' - scale['xscale'] = 'log' - - if complement: - ps = [1.0-p for p in ps] - - if transform == 'weibull': - xs = np.delete(xs, -1) - ps = np.delete(ps, -1) - ps = [-math.log(1.0-p) for p in ps] - scale['xscale'] = 'log' - scale['yscale'] = 'log' - - if transform == 'gumbel': - xs = np.delete(xs, 0) - ps = np.delete(ps, 0) - ps = [-math.log(p) for p in ps] - scale['yscale'] = 'log' - - options = _Underride(options, label=cdf.label) - Plot(xs, ps, **options) - return scale - - -def Cdfs(cdfs, complement=False, transform=None, **options): - """Plots a sequence of CDFs. - - cdfs: sequence of CDF objects - complement: boolean, whether to plot the complementary CDF - transform: string, one of 'exponential', 'pareto', 'weibull', 'gumbel' - options: keyword args passed to plt.plot - """ - for cdf in cdfs: - Cdf(cdf, complement, transform, **options) - - -def Contour(obj, pcolor=False, contour=True, imshow=False, **options): - """Makes a contour plot. - - d: map from (x, y) to z, or object that provides GetDict - pcolor: boolean, whether to make a pseudocolor plot - contour: boolean, whether to make a contour plot - imshow: boolean, whether to use plt.imshow - options: keyword args passed to plt.pcolor and/or plt.contour - """ - try: - d = obj.GetDict() - except AttributeError: - d = obj - - _Underride(options, linewidth=3, cmap=matplotlib.cm.Blues) - - xs, ys = zip(*d.keys()) - xs = sorted(set(xs)) - ys = sorted(set(ys)) - - X, Y = np.meshgrid(xs, ys) - func = lambda x, y: d.get((x, y), 0) - func = np.vectorize(func) - Z = func(X, Y) - - x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False) - axes = plt.gca() - axes.xaxis.set_major_formatter(x_formatter) - - if pcolor: - plt.pcolormesh(X, Y, Z, **options) - if contour: - cs = plt.contour(X, Y, Z, **options) - plt.clabel(cs, inline=1, fontsize=10) - if imshow: - extent = xs[0], xs[-1], ys[0], ys[-1] - plt.imshow(Z, extent=extent, **options) - - -def Pcolor(xs, ys, zs, pcolor=True, contour=False, **options): - """Makes a pseudocolor plot. - - xs: - ys: - zs: - pcolor: boolean, whether to make a pseudocolor plot - contour: boolean, whether to make a contour plot - options: keyword args passed to plt.pcolor and/or plt.contour - """ - _Underride(options, linewidth=3, cmap=matplotlib.cm.Blues) - - X, Y = np.meshgrid(xs, ys) - Z = zs - - x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False) - axes = plt.gca() - axes.xaxis.set_major_formatter(x_formatter) - - if pcolor: - plt.pcolormesh(X, Y, Z, **options) - - if contour: - cs = plt.contour(X, Y, Z, **options) - plt.clabel(cs, inline=1, fontsize=10) - - -def Text(x, y, s, **options): - """Puts text in a figure. - - x: number - y: number - s: string - options: keyword args passed to plt.text - """ - options = _Underride(options, - fontsize=16, - verticalalignment='top', - horizontalalignment='left') - plt.text(x, y, s, **options) - - -LEGEND = True -LOC = None - -def Config(**options): - """Configures the plot. - - Pulls options out of the option dictionary and passes them to - the corresponding plt functions. - """ - names = ['title', 'xlabel', 'ylabel', 'xscale', 'yscale', - 'xticks', 'yticks', 'axis', 'xlim', 'ylim'] - - for name in names: - if name in options: - getattr(plt, name)(options[name]) - - global LEGEND - LEGEND = options.get('legend', LEGEND) - - # see if there are any elements with labels; - # if not, don't draw a legend - ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() - - if LEGEND and len(labels) > 0: - global LOC - LOC = options.get('loc', LOC) - frameon = options.get('frameon', True) - - try: - plt.legend(loc=LOC, frameon=frameon) - except UserWarning: - pass - - # x and y ticklabels can be made invisible - val = options.get('xticklabels', None) - if val is not None: - if val == 'invisible': - ax = plt.gca() - labels = ax.get_xticklabels() - plt.setp(labels, visible=False) - - val = options.get('yticklabels', None) - if val is not None: - if val == 'invisible': - ax = plt.gca() - labels = ax.get_yticklabels() - plt.setp(labels, visible=False) - -def set_font_size(title_size=16, label_size=16, ticklabel_size=14, legend_size=14): - """Set font sizes for the title, labels, ticklabels, and legend. - """ - def set_text_size(texts, size): - for text in texts: - text.set_size(size) - - ax = plt.gca() - - # TODO: Make this function more robust if any of these elements - # is missing. - - # title - ax.title.set_size(title_size) - - # x axis - ax.xaxis.label.set_size(label_size) - set_text_size(ax.xaxis.get_ticklabels(), ticklabel_size) - - # y axis - ax.yaxis.label.set_size(label_size) - set_text_size(ax.yaxis.get_ticklabels(), ticklabel_size) - - # legend - legend = ax.get_legend() - if legend is not None: - set_text_size(legend.texts, legend_size) - - -def bigger_text(): - sizes = dict(title_size=16, label_size=16, ticklabel_size=14, legend_size=14) - set_font_size(**sizes) - - -def Show(**options): - """Shows the plot. - - For options, see Config. - - options: keyword args used to invoke various plt functions - """ - clf = options.pop('clf', True) - Config(**options) - plt.show() - if clf: - Clf() - - -def Plotly(**options): - """Shows the plot. - - For options, see Config. - - options: keyword args used to invoke various plt functions - """ - clf = options.pop('clf', True) - Config(**options) - import plotly.plotly as plotly - url = plotly.plot_mpl(plt.gcf()) - if clf: - Clf() - return url - - -def Save(root=None, formats=None, **options): - """Saves the plot in the given formats and clears the figure. - - For options, see Config. - - Note: With a capital S, this is the original save, maintained for - compatibility. New code should use save(), which works better - with my newer code, especially in Jupyter notebooks. - - Args: - root: string filename root - formats: list of string formats - options: keyword args used to invoke various plt functions - """ - clf = options.pop('clf', True) - - save_options = {} - for option in ['bbox_inches', 'pad_inches']: - if option in options: - save_options[option] = options.pop(option) - - # TODO: falling Config inside Save was probably a mistake, but removing - # it will require some work - Config(**options) - - if formats is None: - formats = ['pdf', 'png'] - - try: - formats.remove('plotly') - Plotly(clf=False) - except ValueError: - pass - - if root: - for fmt in formats: - SaveFormat(root, fmt, **save_options) - if clf: - Clf() - - -def save(root, formats=None, **options): - """Saves the plot in the given formats and clears the figure. - - For options, see plt.savefig. - - Args: - root: string filename root - formats: list of string formats - options: keyword args passed to plt.savefig - """ - if formats is None: - formats = ['pdf', 'png'] - - try: - formats.remove('plotly') - Plotly(clf=False) - except ValueError: - pass - - for fmt in formats: - SaveFormat(root, fmt, **options) - - -def SaveFormat(root, fmt='eps', **options): - """Writes the current figure to a file in the given format. - - Args: - root: string filename root - fmt: string format - """ - _Underride(options, dpi=300) - filename = '%s.%s' % (root, fmt) - print('Writing', filename) - plt.savefig(filename, format=fmt, **options) - - -# provide aliases for calling functions with lower-case names -preplot = PrePlot -subplot = SubPlot -clf = Clf -figure = Figure -plot = Plot -vlines = Vlines -hlines = Hlines -fill_between = FillBetween -text = Text -scatter = Scatter -pmf = Pmf -pmfs = Pmfs -hist = Hist -hists = Hists -diff = Diff -cdf = Cdf -cdfs = Cdfs -contour = Contour -pcolor = Pcolor -config = Config -show = Show - - -def main(): - color_iter = _Brewer.ColorGenerator(7) - for color in color_iter: - print(color) - - -if __name__ == '__main__': - main() diff --git a/thinkstats2/thinkstats2.py b/thinkstats2/thinkstats2.py deleted file mode 100644 index f85e3796a..000000000 --- a/thinkstats2/thinkstats2.py +++ /dev/null @@ -1,3040 +0,0 @@ -"""This file contains code for use with "Think Stats" and -"Think Bayes", both by Allen B. Downey, available from greenteapress.com - -Copyright 2014 Allen B. Downey -License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html -""" - - -"""This file contains class definitions for: - -Hist: represents a histogram (map from values to integer frequencies). - -Pmf: represents a probability mass function (map from values to probs). - -_DictWrapper: private parent class for Hist and Pmf. - -Cdf: represents a discrete cumulative distribution function - -Pdf: represents a continuous probability density function - -""" - -import bisect -import copy -import logging -import math -import random -import re - -from collections import Counter -from operator import itemgetter - -import thinkplot - -import numpy as np -import pandas - -import scipy -from scipy import stats -from scipy import special -from scipy import ndimage - -from scipy.special import gamma - -from io import open - -ROOT2 = math.sqrt(2) - -def RandomSeed(x): - """Initialize the random and np.random generators. - - x: int seed - """ - random.seed(x) - np.random.seed(x) - - -def Odds(p): - """Computes odds for a given probability. - - Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor. - - Note: when p=1, the formula for odds divides by zero, which is - normally undefined. But I think it is reasonable to define Odds(1) - to be infinity, so that's what this function does. - - p: float 0-1 - - Returns: float odds - """ - if p == 1: - return float('inf') - return p / (1 - p) - - -def Probability(o): - """Computes the probability corresponding to given odds. - - Example: o=2 means 2:1 odds in favor, or 2/3 probability - - o: float odds, strictly positive - - Returns: float probability - """ - return o / (o + 1) - - -def Probability2(yes, no): - """Computes the probability corresponding to given odds. - - Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability. - - yes, no: int or float odds in favor - """ - return yes / (yes + no) - - -class Interpolator(object): - """Represents a mapping between sorted sequences; performs linear interp. - - Attributes: - xs: sorted list - ys: sorted list - """ - - def __init__(self, xs, ys): - self.xs = xs - self.ys = ys - - def Lookup(self, x): - """Looks up x and returns the corresponding value of y.""" - return self._Bisect(x, self.xs, self.ys) - - def Reverse(self, y): - """Looks up y and returns the corresponding value of x.""" - return self._Bisect(y, self.ys, self.xs) - - def _Bisect(self, x, xs, ys): - """Helper function.""" - if x <= xs[0]: - return ys[0] - if x >= xs[-1]: - return ys[-1] - i = bisect.bisect(xs, x) - frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1]) - y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1]) - return y - - -# When we plot Hist, Pmf and Cdf objects, they don't appear in -# the legend unless we override the default label. -DEFAULT_LABEL = '_nolegend_' - - -class _DictWrapper(object): - """An object that contains a dictionary.""" - - def __init__(self, obj=None, label=None): - """Initializes the distribution. - - obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs - label: string label - """ - self.label = label if label is not None else DEFAULT_LABEL - self.d = {} - - # flag whether the distribution is under a log transform - self.log = False - - if obj is None: - return - - if isinstance(obj, (_DictWrapper, Cdf, Pdf)): - self.label = label if label is not None else obj.label - - if isinstance(obj, dict): - self.d.update(obj.items()) - elif isinstance(obj, (_DictWrapper, Cdf, Pdf)): - self.d.update(obj.Items()) - elif isinstance(obj, pandas.Series): - self.d.update(obj.value_counts().iteritems()) - else: - # finally, treat it like a list - self.d.update(Counter(obj)) - - if len(self) > 0 and isinstance(self, Pmf): - self.Normalize() - - def __hash__(self): - return id(self) - - def __str__(self): - cls = self.__class__.__name__ - if self.label == DEFAULT_LABEL: - return '%s(%s)' % (cls, str(self.d)) - else: - return self.label - - def __repr__(self): - cls = self.__class__.__name__ - if self.label == DEFAULT_LABEL: - return '%s(%s)' % (cls, repr(self.d)) - else: - return '%s(%s, %s)' % (cls, repr(self.d), repr(self.label)) - - def __eq__(self, other): - try: - return self.d == other.d - except AttributeError: - return False - - def __len__(self): - return len(self.d) - - def __iter__(self): - return iter(self.d) - - def iterkeys(self): - """Returns an iterator over keys.""" - return iter(self.d) - - def __contains__(self, value): - return value in self.d - - def __getitem__(self, value): - return self.d.get(value, 0) - - def __setitem__(self, value, prob): - self.d[value] = prob - - def __delitem__(self, value): - del self.d[value] - - def Copy(self, label=None): - """Returns a copy. - - Make a shallow copy of d. If you want a deep copy of d, - use copy.deepcopy on the whole object. - - label: string label for the new Hist - - returns: new _DictWrapper with the same type - """ - new = copy.copy(self) - new.d = copy.copy(self.d) - new.label = label if label is not None else self.label - return new - - def Scale(self, factor): - """Multiplies the values by a factor. - - factor: what to multiply by - - Returns: new object - """ - new = self.Copy() - new.d.clear() - - for val, prob in self.Items(): - new.Set(val * factor, prob) - return new - - def Log(self, m=None): - """Log transforms the probabilities. - - Removes values with probability 0. - - Normalizes so that the largest logprob is 0. - """ - if self.log: - raise ValueError("Pmf/Hist already under a log transform") - self.log = True - - if m is None: - m = self.MaxLike() - - for x, p in self.d.items(): - if p: - self.Set(x, math.log(p / m)) - else: - self.Remove(x) - - def Exp(self, m=None): - """Exponentiates the probabilities. - - m: how much to shift the ps before exponentiating - - If m is None, normalizes so that the largest prob is 1. - """ - if not self.log: - raise ValueError("Pmf/Hist not under a log transform") - self.log = False - - if m is None: - m = self.MaxLike() - - for x, p in self.d.items(): - self.Set(x, math.exp(p - m)) - - def GetDict(self): - """Gets the dictionary.""" - return self.d - - def SetDict(self, d): - """Sets the dictionary.""" - self.d = d - - def Values(self): - """Gets an unsorted sequence of values. - - Note: one source of confusion is that the keys of this - dictionary are the values of the Hist/Pmf, and the - values of the dictionary are frequencies/probabilities. - """ - return self.d.keys() - - def Items(self): - """Gets an unsorted sequence of (value, freq/prob) pairs.""" - return self.d.items() - - def SortedItems(self): - """Gets a sorted sequence of (value, freq/prob) pairs. - - It items are unsortable, the result is unsorted. - """ - def isnan(x): - try: - return math.isnan(x) - except TypeError: - return False - - if any([isnan(x) for x in self.Values()]): - msg = 'Keys contain NaN, may not sort correctly.' - logging.warning(msg) - - try: - return sorted(self.d.items()) - except TypeError: - return self.d.items() - - def Render(self, **options): - """Generates a sequence of points suitable for plotting. - - Note: options are ignored - - Returns: - tuple of (sorted value sequence, freq/prob sequence) - """ - return zip(*self.SortedItems()) - - def MakeCdf(self, label=None): - """Makes a Cdf.""" - label = label if label is not None else self.label - return Cdf(self, label=label) - - def Print(self): - """Prints the values and freqs/probs in ascending order.""" - for val, prob in self.SortedItems(): - print(val, prob) - - def Set(self, x, y=0): - """Sets the freq/prob associated with the value x. - - Args: - x: number value - y: number freq or prob - """ - self.d[x] = y - - def Incr(self, x, term=1): - """Increments the freq/prob associated with the value x. - - Args: - x: number value - term: how much to increment by - """ - self.d[x] = self.d.get(x, 0) + term - - def Mult(self, x, factor): - """Scales the freq/prob associated with the value x. - - Args: - x: number value - factor: how much to multiply by - """ - self.d[x] = self.d.get(x, 0) * factor - - def Remove(self, x): - """Removes a value. - - Throws an exception if the value is not there. - - Args: - x: value to remove - """ - del self.d[x] - - def Total(self): - """Returns the total of the frequencies/probabilities in the map.""" - total = sum(self.d.values()) - return total - - def MaxLike(self): - """Returns the largest frequency/probability in the map.""" - return max(self.d.values()) - - def Largest(self, n=10): - """Returns the largest n values, with frequency/probability. - - n: number of items to return - """ - return sorted(self.d.items(), reverse=True)[:n] - - def Smallest(self, n=10): - """Returns the smallest n values, with frequency/probability. - - n: number of items to return - """ - return sorted(self.d.items(), reverse=False)[:n] - - -class Hist(_DictWrapper): - """Represents a histogram, which is a map from values to frequencies. - - Values can be any hashable type; frequencies are integer counters. - """ - def Freq(self, x): - """Gets the frequency associated with the value x. - - Args: - x: number value - - Returns: - int frequency - """ - return self.d.get(x, 0) - - def Freqs(self, xs): - """Gets frequencies for a sequence of values.""" - return [self.Freq(x) for x in xs] - - def IsSubset(self, other): - """Checks whether the values in this histogram are a subset of - the values in the given histogram.""" - for val, freq in self.Items(): - if freq > other.Freq(val): - return False - return True - - def Subtract(self, other): - """Subtracts the values in the given histogram from this histogram.""" - for val, freq in other.Items(): - self.Incr(val, -freq) - - -class Pmf(_DictWrapper): - """Represents a probability mass function. - - Values can be any hashable type; probabilities are floating-point. - Pmfs are not necessarily normalized. - """ - - def Prob(self, x, default=0): - """Gets the probability associated with the value x. - - Args: - x: number value - default: value to return if the key is not there - - Returns: - float probability - """ - return self.d.get(x, default) - - def Probs(self, xs): - """Gets probabilities for a sequence of values.""" - return [self.Prob(x) for x in xs] - - def Percentile(self, percentage): - """Computes a percentile of a given Pmf. - - Note: this is not super efficient. If you are planning - to compute more than a few percentiles, compute the Cdf. - - percentage: float 0-100 - - returns: value from the Pmf - """ - p = percentage / 100 - total = 0 - for val, prob in sorted(self.Items()): - total += prob - if total >= p: - return val - - def ProbGreater(self, x): - """Probability that a sample from this Pmf exceeds x. - - x: number - - returns: float probability - """ - if isinstance(x, _DictWrapper): - return PmfProbGreater(self, x) - else: - t = [prob for (val, prob) in self.d.items() if val > x] - return sum(t) - - def ProbLess(self, x): - """Probability that a sample from this Pmf is less than x. - - x: number - - returns: float probability - """ - if isinstance(x, _DictWrapper): - return PmfProbLess(self, x) - else: - t = [prob for (val, prob) in self.d.items() if val < x] - return sum(t) - - def ProbEqual(self, x): - """Probability that a sample from this Pmf is exactly x. - - x: number - - returns: float probability - """ - if isinstance(x, _DictWrapper): - return PmfProbEqual(self, x) - else: - return self[x] - - # NOTE: I've decided to remove the magic comparators because they - # have the side-effect of making Pmf sortable, but in fact they - # don't support sorting. - - def Normalize(self, fraction=1): - """Normalizes this PMF so the sum of all probs is fraction. - - Args: - fraction: what the total should be after normalization - - Returns: the total probability before normalizing - """ - if self.log: - raise ValueError("Normalize: Pmf is under a log transform") - - total = self.Total() - if total == 0: - raise ValueError('Normalize: total probability is zero.') - - factor = fraction / total - for x in self.d: - self.d[x] *= factor - - return total - - def Random(self): - """Chooses a random element from this PMF. - - Note: this is not very efficient. If you plan to call - this more than a few times, consider converting to a CDF. - - Returns: - float value from the Pmf - """ - target = random.random() - total = 0 - for x, p in self.d.items(): - total += p - if total >= target: - return x - - # we shouldn't get here - raise ValueError('Random: Pmf might not be normalized.') - - def Sample(self, n): - """Generates a random sample from this distribution. - - n: int length of the sample - returns: NumPy array - """ - return self.MakeCdf().Sample(n) - - def Mean(self): - """Computes the mean of a PMF. - - Returns: - float mean - """ - return sum(p * x for x, p in self.Items()) - - def Median(self): - """Computes the median of a PMF. - - Returns: - float median - """ - return self.MakeCdf().Percentile(50) - - def Var(self, mu=None): - """Computes the variance of a PMF. - - mu: the point around which the variance is computed; - if omitted, computes the mean - - returns: float variance - """ - if mu is None: - mu = self.Mean() - - return sum(p * (x-mu)**2 for x, p in self.Items()) - - def Expect(self, func): - """Computes the expectation of func(x). - - Returns: - expectation - """ - return np.sum(p * func(x) for x, p in self.Items()) - - def Std(self, mu=None): - """Computes the standard deviation of a PMF. - - mu: the point around which the variance is computed; - if omitted, computes the mean - - returns: float standard deviation - """ - var = self.Var(mu) - return math.sqrt(var) - - def Mode(self): - """Returns the value with the highest probability. - - Returns: float probability - """ - _, val = max((prob, val) for val, prob in self.Items()) - return val - - # The mode of a posterior is the maximum aposteori probability (MAP) - MAP = Mode - - # If the distribution contains likelihoods only, the peak is the - # maximum likelihood estimator. - MaximumLikelihood = Mode - - def CredibleInterval(self, percentage=90): - """Computes the central credible interval. - - If percentage=90, computes the 90% CI. - - Args: - percentage: float between 0 and 100 - - Returns: - sequence of two floats, low and high - """ - cdf = self.MakeCdf() - return cdf.CredibleInterval(percentage) - - def __add__(self, other): - """Computes the Pmf of the sum of values drawn from self and other. - - other: another Pmf or a scalar - - returns: new Pmf - """ - try: - return self.AddPmf(other) - except AttributeError: - return self.AddConstant(other) - - __radd__ = __add__ - - def AddPmf(self, other): - """Computes the Pmf of the sum of values drawn from self and other. - - other: another Pmf - - returns: new Pmf - """ - pmf = Pmf() - for v1, p1 in self.Items(): - for v2, p2 in other.Items(): - pmf[v1 + v2] += p1 * p2 - return pmf - - def AddConstant(self, other): - """Computes the Pmf of the sum a constant and values from self. - - other: a number - - returns: new Pmf - """ - if other == 0: - return self.Copy() - - pmf = Pmf() - for v1, p1 in self.Items(): - pmf.Set(v1 + other, p1) - return pmf - - def __sub__(self, other): - """Computes the Pmf of the diff of values drawn from self and other. - - other: another Pmf - - returns: new Pmf - """ - try: - return self.SubPmf(other) - except AttributeError: - return self.AddConstant(-other) - - def SubPmf(self, other): - """Computes the Pmf of the diff of values drawn from self and other. - - other: another Pmf - - returns: new Pmf - """ - pmf = Pmf() - for v1, p1 in self.Items(): - for v2, p2 in other.Items(): - pmf.Incr(v1 - v2, p1 * p2) - return pmf - - def __mul__(self, other): - """Computes the Pmf of the product of values drawn from self and other. - - other: another Pmf - - returns: new Pmf - """ - try: - return self.MulPmf(other) - except AttributeError: - return self.MulConstant(other) - - def MulPmf(self, other): - """Computes the Pmf of the diff of values drawn from self and other. - - other: another Pmf - - returns: new Pmf - """ - pmf = Pmf() - for v1, p1 in self.Items(): - for v2, p2 in other.Items(): - pmf.Incr(v1 * v2, p1 * p2) - return pmf - - def MulConstant(self, other): - """Computes the Pmf of the product of a constant and values from self. - - other: a number - - returns: new Pmf - """ - pmf = Pmf() - for v1, p1 in self.Items(): - pmf.Set(v1 * other, p1) - return pmf - - def __div__(self, other): - """Computes the Pmf of the ratio of values drawn from self and other. - - other: another Pmf - - returns: new Pmf - """ - try: - return self.DivPmf(other) - except AttributeError: - return self.MulConstant(1/other) - - __truediv__ = __div__ - - def DivPmf(self, other): - """Computes the Pmf of the ratio of values drawn from self and other. - - other: another Pmf - - returns: new Pmf - """ - pmf = Pmf() - for v1, p1 in self.Items(): - for v2, p2 in other.Items(): - pmf.Incr(v1 / v2, p1 * p2) - return pmf - - def Max(self, k): - """Computes the CDF of the maximum of k selections from this dist. - - k: int - - returns: new Cdf - """ - cdf = self.MakeCdf() - cdf.ps **= k - return cdf - - -class Joint(Pmf): - """Represents a joint distribution. - - The values are sequences (usually tuples) - """ - - def Marginal(self, i, label=None): - """Gets the marginal distribution of the indicated variable. - - i: index of the variable we want - - Returns: Pmf - """ - pmf = Pmf(label=label) - for vs, prob in self.Items(): - pmf.Incr(vs[i], prob) - return pmf - - def Conditional(self, i, j, val, label=None): - """Gets the conditional distribution of the indicated variable. - - Distribution of vs[i], conditioned on vs[j] = val. - - i: index of the variable we want - j: which variable is conditioned on - val: the value the jth variable has to have - - Returns: Pmf - """ - pmf = Pmf(label=label) - for vs, prob in self.Items(): - if vs[j] != val: - continue - pmf.Incr(vs[i], prob) - - pmf.Normalize() - return pmf - - def MaxLikeInterval(self, percentage=90): - """Returns the maximum-likelihood credible interval. - - If percentage=90, computes a 90% CI containing the values - with the highest likelihoods. - - percentage: float between 0 and 100 - - Returns: list of values from the suite - """ - interval = [] - total = 0 - - t = [(prob, val) for val, prob in self.Items()] - t.sort(reverse=True) - - for prob, val in t: - interval.append(val) - total += prob - if total >= percentage / 100: - break - - return interval - - -def MakeJoint(pmf1, pmf2): - """Joint distribution of values from pmf1 and pmf2. - - Assumes that the PMFs represent independent random variables. - - Args: - pmf1: Pmf object - pmf2: Pmf object - - Returns: - Joint pmf of value pairs - """ - joint = Joint() - for v1, p1 in pmf1.Items(): - for v2, p2 in pmf2.Items(): - joint.Set((v1, v2), p1 * p2) - return joint - - -def MakeHistFromList(t, label=None): - """Makes a histogram from an unsorted sequence of values. - - Args: - t: sequence of numbers - label: string label for this histogram - - Returns: - Hist object - """ - return Hist(t, label=label) - - -def MakeHistFromDict(d, label=None): - """Makes a histogram from a map from values to frequencies. - - Args: - d: dictionary that maps values to frequencies - label: string label for this histogram - - Returns: - Hist object - """ - return Hist(d, label) - - -def MakePmfFromList(t, label=None): - """Makes a PMF from an unsorted sequence of values. - - Args: - t: sequence of numbers - label: string label for this PMF - - Returns: - Pmf object - """ - return Pmf(t, label=label) - - -def MakePmfFromDict(d, label=None): - """Makes a PMF from a map from values to probabilities. - - Args: - d: dictionary that maps values to probabilities - label: string label for this PMF - - Returns: - Pmf object - """ - return Pmf(d, label=label) - - -def MakePmfFromItems(t, label=None): - """Makes a PMF from a sequence of value-probability pairs - - Args: - t: sequence of value-probability pairs - label: string label for this PMF - - Returns: - Pmf object - """ - return Pmf(dict(t), label=label) - - -def MakePmfFromHist(hist, label=None): - """Makes a normalized PMF from a Hist object. - - Args: - hist: Hist object - label: string label - - Returns: - Pmf object - """ - if label is None: - label = hist.label - - return Pmf(hist, label=label) - - -def MakeMixture(metapmf, label='mix'): - """Make a mixture distribution. - - Args: - metapmf: Pmf that maps from Pmfs to probs. - label: string label for the new Pmf. - - Returns: Pmf object. - """ - mix = Pmf(label=label) - for pmf, p1 in metapmf.Items(): - for x, p2 in pmf.Items(): - mix[x] += p1 * p2 - return mix - - -def MakeUniformPmf(low, high, n): - """Make a uniform Pmf. - - low: lowest value (inclusive) - high: highest value (inclusize) - n: number of values - """ - pmf = Pmf() - for x in np.linspace(low, high, n): - pmf.Set(x, 1) - pmf.Normalize() - return pmf - - -class Cdf: - """Represents a cumulative distribution function. - - Attributes: - xs: sequence of values - ps: sequence of probabilities - label: string used as a graph label. - """ - def __init__(self, obj=None, ps=None, label=None): - """Initializes. - - If ps is provided, obj must be the corresponding list of values. - - obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs - ps: list of cumulative probabilities - label: string label - """ - self.label = label if label is not None else DEFAULT_LABEL - - if isinstance(obj, (_DictWrapper, Cdf, Pdf)): - if not label: - self.label = label if label is not None else obj.label - - if obj is None: - # caller does not provide obj, make an empty Cdf - self.xs = np.asarray([]) - self.ps = np.asarray([]) - if ps is not None: - logging.warning("Cdf: can't pass ps without also passing xs.") - return - else: - # if the caller provides xs and ps, just store them - if ps is not None: - if isinstance(ps, str): - logging.warning("Cdf: ps can't be a string") - - self.xs = np.asarray(obj) - self.ps = np.asarray(ps) - return - - # caller has provided just obj, not ps - if isinstance(obj, Cdf): - self.xs = copy.copy(obj.xs) - self.ps = copy.copy(obj.ps) - return - - if isinstance(obj, _DictWrapper): - dw = obj - else: - dw = Hist(obj) - - if len(dw) == 0: - self.xs = np.asarray([]) - self.ps = np.asarray([]) - return - - xs, freqs = zip(*sorted(dw.Items())) - self.xs = np.asarray(xs) - self.ps = np.cumsum(freqs, dtype=np.float) - self.ps /= self.ps[-1] - - def __str__(self): - cls = self.__class__.__name__ - if self.label == DEFAULT_LABEL: - return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps)) - else: - return self.label - - def __repr__(self): - cls = self.__class__.__name__ - if self.label == DEFAULT_LABEL: - return '%s(%s, %s)' % (cls, str(self.xs), str(self.ps)) - else: - return '%s(%s, %s, %s)' % (cls, str(self.xs), str(self.ps), - repr(self.label)) - - def __len__(self): - return len(self.xs) - - def __getitem__(self, x): - return self.Prob(x) - - def __setitem__(self): - raise UnimplementedMethodException() - - def __delitem__(self): - raise UnimplementedMethodException() - - def __eq__(self, other): - return np.all(self.xs == other.xs) and np.all(self.ps == other.ps) - - def Print(self): - """Prints the values and freqs/probs in ascending order.""" - for val, prob in zip(self.xs, self.ps): - print(val, prob) - - def Copy(self, label=None): - """Returns a copy of this Cdf. - - label: string label for the new Cdf - """ - if label is None: - label = self.label - return Cdf(list(self.xs), list(self.ps), label=label) - - def MakePmf(self, label=None): - """Makes a Pmf.""" - if label is None: - label = self.label - return Pmf(self, label=label) - - def Items(self): - """Returns a sorted sequence of (value, probability) pairs. - - Note: in Python3, returns an iterator. - """ - a = self.ps - b = np.roll(a, 1) - b[0] = 0 - return zip(self.xs, a-b) - - def Shift(self, term): - """Adds a term to the xs. - - term: how much to add - """ - new = self.Copy() - # don't use +=, or else an int array + float yields int array - new.xs = new.xs + term - return new - - def Scale(self, factor): - """Multiplies the xs by a factor. - - factor: what to multiply by - """ - new = self.Copy() - # don't use *=, or else an int array * float yields int array - new.xs = new.xs * factor - return new - - def Prob(self, x): - """Returns CDF(x), the probability that corresponds to value x. - - Args: - x: number - - Returns: - float probability - """ - if x < self.xs[0]: - return 0 - index = bisect.bisect(self.xs, x) - p = self.ps[index-1] - return p - - def Probs(self, xs): - """Gets probabilities for a sequence of values. - - xs: any sequence that can be converted to NumPy array - - returns: NumPy array of cumulative probabilities - """ - xs = np.asarray(xs) - index = np.searchsorted(self.xs, xs, side='right') - ps = self.ps[index-1] - ps[xs < self.xs[0]] = 0 - return ps - - ProbArray = Probs - - def Value(self, p): - """Returns InverseCDF(p), the value that corresponds to probability p. - - Args: - p: number in the range [0, 1] - - Returns: - number value - """ - if p < 0 or p > 1: - raise ValueError('Probability p must be in range [0, 1]') - - index = bisect.bisect_left(self.ps, p) - return self.xs[index] - - def Values(self, ps=None): - """Returns InverseCDF(p), the value that corresponds to probability p. - - If ps is not provided, returns all values. - - Args: - ps: NumPy array of numbers in the range [0, 1] - - Returns: - NumPy array of values - """ - if ps is None: - return self.xs - - ps = np.asarray(ps) - if np.any(ps < 0) or np.any(ps > 1): - raise ValueError('Probability p must be in range [0, 1]') - - index = np.searchsorted(self.ps, ps, side='left') - return self.xs[index] - - ValueArray = Values - - def Percentile(self, p): - """Returns the value that corresponds to percentile p. - - Args: - p: number in the range [0, 100] - - Returns: - number value - """ - return self.Value(p / 100) - - def Percentiles(self, ps): - """Returns the value that corresponds to percentiles ps. - - Args: - ps: numbers in the range [0, 100] - - Returns: - array of values - """ - ps = np.asarray(ps) - return self.Values(ps / 100) - - def PercentileRank(self, x): - """Returns the percentile rank of the value x. - - x: potential value in the CDF - - returns: percentile rank in the range 0 to 100 - """ - return self.Prob(x) * 100 - - def PercentileRanks(self, xs): - """Returns the percentile ranks of the values in xs. - - xs: potential value in the CDF - - returns: array of percentile ranks in the range 0 to 100 - """ - return self.Probs(xs) * 100 - - def Random(self): - """Chooses a random value from this distribution.""" - return self.Value(random.random()) - - def Sample(self, n): - """Generates a random sample from this distribution. - - n: int length of the sample - returns: NumPy array - """ - ps = np.random.random(n) - return self.ValueArray(ps) - - def Mean(self): - """Computes the mean of a CDF. - - Returns: - float mean - """ - old_p = 0 - total = 0 - for x, new_p in zip(self.xs, self.ps): - p = new_p - old_p - total += p * x - old_p = new_p - return total - - def CredibleInterval(self, percentage=90): - """Computes the central credible interval. - - If percentage=90, computes the 90% CI. - - Args: - percentage: float between 0 and 100 - - Returns: - sequence of two floats, low and high - """ - prob = (1 - percentage / 100) / 2 - interval = self.Value(prob), self.Value(1 - prob) - return interval - - ConfidenceInterval = CredibleInterval - - def _Round(self, multiplier=1000): - """ - An entry is added to the cdf only if the percentile differs - from the previous value in a significant digit, where the number - of significant digits is determined by multiplier. The - default is 1000, which keeps log10(1000) = 3 significant digits. - """ - # TODO(write this method) - raise UnimplementedMethodException() - - def Render(self, **options): - """Generates a sequence of points suitable for plotting. - - An empirical CDF is a step function; linear interpolation - can be misleading. - - Note: options are ignored - - Returns: - tuple of (xs, ps) - """ - def interleave(a, b): - c = np.empty(a.shape[0] + b.shape[0]) - c[::2] = a - c[1::2] = b - return c - - a = np.array(self.xs) - xs = interleave(a, a) - shift_ps = np.roll(self.ps, 1) - shift_ps[0] = 0 - ps = interleave(shift_ps, self.ps) - return xs, ps - - def Max(self, k): - """Computes the CDF of the maximum of k selections from this dist. - - k: int - - returns: new Cdf - """ - cdf = self.Copy() - cdf.ps **= k - return cdf - - -def MakeCdfFromItems(items, label=None): - """Makes a cdf from an unsorted sequence of (value, frequency) pairs. - - Args: - items: unsorted sequence of (value, frequency) pairs - label: string label for this CDF - - Returns: - cdf: list of (value, fraction) pairs - """ - return Cdf(dict(items), label=label) - - -def MakeCdfFromDict(d, label=None): - """Makes a CDF from a dictionary that maps values to frequencies. - - Args: - d: dictionary that maps values to frequencies. - label: string label for the data. - - Returns: - Cdf object - """ - return Cdf(d, label=label) - - -def MakeCdfFromList(seq, label=None): - """Creates a CDF from an unsorted sequence. - - Args: - seq: unsorted sequence of sortable values - label: string label for the cdf - - Returns: - Cdf object - """ - return Cdf(seq, label=label) - - -def MakeCdfFromHist(hist, label=None): - """Makes a CDF from a Hist object. - - Args: - hist: Pmf.Hist object - label: string label for the data. - - Returns: - Cdf object - """ - if label is None: - label = hist.label - - return Cdf(hist, label=label) - - -def MakeCdfFromPmf(pmf, label=None): - """Makes a CDF from a Pmf object. - - Args: - pmf: Pmf.Pmf object - label: string label for the data. - - Returns: - Cdf object - """ - if label is None: - label = pmf.label - - return Cdf(pmf, label=label) - - -class UnimplementedMethodException(Exception): - """Exception if someone calls a method that should be overridden.""" - - -class Suite(Pmf): - """Represents a suite of hypotheses and their probabilities.""" - - def Update(self, data): - """Updates each hypothesis based on the data. - - data: any representation of the data - - returns: the normalizing constant - """ - for hypo in self.Values(): - like = self.Likelihood(data, hypo) - self.Mult(hypo, like) - return self.Normalize() - - def LogUpdate(self, data): - """Updates a suite of hypotheses based on new data. - - Modifies the suite directly; if you want to keep the original, make - a copy. - - Note: unlike Update, LogUpdate does not normalize. - - Args: - data: any representation of the data - """ - for hypo in self.Values(): - like = self.LogLikelihood(data, hypo) - self.Incr(hypo, like) - - def UpdateSet(self, dataset): - """Updates each hypothesis based on the dataset. - - This is more efficient than calling Update repeatedly because - it waits until the end to Normalize. - - Modifies the suite directly; if you want to keep the original, make - a copy. - - dataset: a sequence of data - - returns: the normalizing constant - """ - for data in dataset: - for hypo in self.Values(): - like = self.Likelihood(data, hypo) - self.Mult(hypo, like) - return self.Normalize() - - def LogUpdateSet(self, dataset): - """Updates each hypothesis based on the dataset. - - Modifies the suite directly; if you want to keep the original, make - a copy. - - dataset: a sequence of data - - returns: None - """ - for data in dataset: - self.LogUpdate(data) - - def Likelihood(self, data, hypo): - """Computes the likelihood of the data under the hypothesis. - - hypo: some representation of the hypothesis - data: some representation of the data - """ - raise UnimplementedMethodException() - - def LogLikelihood(self, data, hypo): - """Computes the log likelihood of the data under the hypothesis. - - hypo: some representation of the hypothesis - data: some representation of the data - """ - raise UnimplementedMethodException() - - def Print(self): - """Prints the hypotheses and their probabilities.""" - for hypo, prob in sorted(self.Items()): - print(hypo, prob) - - def MakeOdds(self): - """Transforms from probabilities to odds. - - Values with prob=0 are removed. - """ - for hypo, prob in self.Items(): - if prob: - self.Set(hypo, Odds(prob)) - else: - self.Remove(hypo) - - def MakeProbs(self): - """Transforms from odds to probabilities.""" - for hypo, odds in self.Items(): - self.Set(hypo, Probability(odds)) - - -def MakeSuiteFromList(t, label=None): - """Makes a suite from an unsorted sequence of values. - - Args: - t: sequence of numbers - label: string label for this suite - - Returns: - Suite object - """ - hist = MakeHistFromList(t, label=label) - d = hist.GetDict() - return MakeSuiteFromDict(d) - - -def MakeSuiteFromHist(hist, label=None): - """Makes a normalized suite from a Hist object. - - Args: - hist: Hist object - label: string label - - Returns: - Suite object - """ - if label is None: - label = hist.label - - # make a copy of the dictionary - d = dict(hist.GetDict()) - return MakeSuiteFromDict(d, label) - - -def MakeSuiteFromDict(d, label=None): - """Makes a suite from a map from values to probabilities. - - Args: - d: dictionary that maps values to probabilities - label: string label for this suite - - Returns: - Suite object - """ - suite = Suite(label=label) - suite.SetDict(d) - suite.Normalize() - return suite - - -class Pdf(object): - """Represents a probability density function (PDF).""" - - def Density(self, x): - """Evaluates this Pdf at x. - - Returns: float or NumPy array of probability density - """ - raise UnimplementedMethodException() - - def GetLinspace(self): - """Get a linspace for plotting. - - Not all subclasses of Pdf implement this. - - Returns: numpy array - """ - raise UnimplementedMethodException() - - def MakePmf(self, **options): - """Makes a discrete version of this Pdf. - - options can include - label: string - low: low end of range - high: high end of range - n: number of places to evaluate - - Returns: new Pmf - """ - label = options.pop('label', '') - xs, ds = self.Render(**options) - return Pmf(dict(zip(xs, ds)), label=label) - - def Render(self, **options): - """Generates a sequence of points suitable for plotting. - - If options includes low and high, it must also include n; - in that case the density is evaluated an n locations between - low and high, including both. - - If options includes xs, the density is evaluate at those location. - - Otherwise, self.GetLinspace is invoked to provide the locations. - - Returns: - tuple of (xs, densities) - """ - low, high = options.pop('low', None), options.pop('high', None) - if low is not None and high is not None: - n = options.pop('n', 101) - xs = np.linspace(low, high, n) - else: - xs = options.pop('xs', None) - if xs is None: - xs = self.GetLinspace() - - ds = self.Density(xs) - return xs, ds - - def Items(self): - """Generates a sequence of (value, probability) pairs. - """ - return zip(*self.Render()) - - -class NormalPdf(Pdf): - """Represents the PDF of a Normal distribution.""" - - def __init__(self, mu=0, sigma=1, label=None): - """Constructs a Normal Pdf with given mu and sigma. - - mu: mean - sigma: standard deviation - label: string - """ - self.mu = mu - self.sigma = sigma - self.label = label if label is not None else '_nolegend_' - - def __str__(self): - return 'NormalPdf(%f, %f)' % (self.mu, self.sigma) - - def GetLinspace(self): - """Get a linspace for plotting. - - Returns: numpy array - """ - low, high = self.mu-3*self.sigma, self.mu+3*self.sigma - return np.linspace(low, high, 101) - - def Density(self, xs): - """Evaluates this Pdf at xs. - - xs: scalar or sequence of floats - - returns: float or NumPy array of probability density - """ - return stats.norm.pdf(xs, self.mu, self.sigma) - - -class ExponentialPdf(Pdf): - """Represents the PDF of an exponential distribution.""" - - def __init__(self, lam=1, label=None): - """Constructs an exponential Pdf with given parameter. - - lam: rate parameter - label: string - """ - self.lam = lam - self.label = label if label is not None else '_nolegend_' - - def __str__(self): - return 'ExponentialPdf(%f)' % (self.lam) - - def GetLinspace(self): - """Get a linspace for plotting. - - Returns: numpy array - """ - low, high = 0, 5.0/self.lam - return np.linspace(low, high, 101) - - def Density(self, xs): - """Evaluates this Pdf at xs. - - xs: scalar or sequence of floats - - returns: float or NumPy array of probability density - """ - return stats.expon.pdf(xs, scale=1.0/self.lam) - - -class EstimatedPdf(Pdf): - """Represents a PDF estimated by KDE.""" - - def __init__(self, sample, label=None): - """Estimates the density function based on a sample. - - sample: sequence of data - label: string - """ - self.label = label if label is not None else '_nolegend_' - self.kde = stats.gaussian_kde(sample) - low = min(sample) - high = max(sample) - self.linspace = np.linspace(low, high, 101) - - def __str__(self): - return 'EstimatedPdf(label=%s)' % str(self.label) - - def GetLinspace(self): - """Get a linspace for plotting. - - Returns: numpy array - """ - return self.linspace - - def Density(self, xs): - """Evaluates this Pdf at xs. - - returns: float or NumPy array of probability density - """ - return self.kde.evaluate(xs) - - def Sample(self, n): - """Generates a random sample from the estimated Pdf. - - n: size of sample - """ - # NOTE: we have to flatten because resample returns a 2-D - # array for some reason. - return self.kde.resample(n).flatten() - - -def CredibleInterval(pmf, percentage=90): - """Computes a credible interval for a given distribution. - - If percentage=90, computes the 90% CI. - - Args: - pmf: Pmf object representing a posterior distribution - percentage: float between 0 and 100 - - Returns: - sequence of two floats, low and high - """ - cdf = pmf.MakeCdf() - prob = (1 - percentage / 100) / 2 - interval = cdf.Value(prob), cdf.Value(1 - prob) - return interval - - -def PmfProbLess(pmf1, pmf2): - """Probability that a value from pmf1 is less than a value from pmf2. - - Args: - pmf1: Pmf object - pmf2: Pmf object - - Returns: - float probability - """ - total = 0 - for v1, p1 in pmf1.Items(): - for v2, p2 in pmf2.Items(): - if v1 < v2: - total += p1 * p2 - return total - - -def PmfProbGreater(pmf1, pmf2): - """Probability that a value from pmf1 is less than a value from pmf2. - - Args: - pmf1: Pmf object - pmf2: Pmf object - - Returns: - float probability - """ - total = 0 - for v1, p1 in pmf1.Items(): - for v2, p2 in pmf2.Items(): - if v1 > v2: - total += p1 * p2 - return total - - -def PmfProbEqual(pmf1, pmf2): - """Probability that a value from pmf1 equals a value from pmf2. - - Args: - pmf1: Pmf object - pmf2: Pmf object - - Returns: - float probability - """ - total = 0 - for v1, p1 in pmf1.Items(): - for v2, p2 in pmf2.Items(): - if v1 == v2: - total += p1 * p2 - return total - - -def RandomSum(dists): - """Chooses a random value from each dist and returns the sum. - - dists: sequence of Pmf or Cdf objects - - returns: numerical sum - """ - total = sum(dist.Random() for dist in dists) - return total - - -def SampleSum(dists, n): - """Draws a sample of sums from a list of distributions. - - dists: sequence of Pmf or Cdf objects - n: sample size - - returns: new Pmf of sums - """ - pmf = Pmf(RandomSum(dists) for i in range(n)) - return pmf - - -def EvalNormalPdf(x, mu, sigma): - """Computes the unnormalized PDF of the normal distribution. - - x: value - mu: mean - sigma: standard deviation - - returns: float probability density - """ - return stats.norm.pdf(x, mu, sigma) - - -def MakeNormalPmf(mu, sigma, num_sigmas, n=201): - """Makes a PMF discrete approx to a Normal distribution. - - mu: float mean - sigma: float standard deviation - num_sigmas: how many sigmas to extend in each direction - n: number of values in the Pmf - - returns: normalized Pmf - """ - pmf = Pmf() - low = mu - num_sigmas * sigma - high = mu + num_sigmas * sigma - - for x in np.linspace(low, high, n): - p = EvalNormalPdf(x, mu, sigma) - pmf.Set(x, p) - pmf.Normalize() - return pmf - - -def EvalBinomialPmf(k, n, p): - """Evaluates the binomial PMF. - - Returns the probabily of k successes in n trials with probability p. - """ - return stats.binom.pmf(k, n, p) - - -def MakeBinomialPmf(n, p): - """Evaluates the binomial PMF. - - Returns the distribution of successes in n trials with probability p. - """ - pmf = Pmf() - for k in range(n+1): - pmf[k] = stats.binom.pmf(k, n, p) - return pmf - - -def EvalGammaPdf(x, a): - """Computes the Gamma PDF. - - x: where to evaluate the PDF - a: parameter of the gamma distribution - - returns: float probability - """ - return x**(a-1) * np.exp(-x) / gamma(a) - - -def MakeGammaPmf(xs, a): - """Makes a PMF discrete approx to a Gamma distribution. - - lam: parameter lambda in events per unit time - xs: upper bound of the Pmf - - returns: normalized Pmf - """ - xs = np.asarray(xs) - ps = EvalGammaPdf(xs, a) - pmf = Pmf(dict(zip(xs, ps))) - pmf.Normalize() - return pmf - - -def EvalGeometricPmf(k, p, loc=0): - """Evaluates the geometric PMF. - - With loc=0: Probability of `k` trials to get one success. - With loc=-1: Probability of `k` trials before first success. - - k: number of trials - p: probability of success on each trial - """ - return stats.geom.pmf(k, p, loc=loc) - - -def MakeGeometricPmf(p, loc=0, high=10): - """Evaluates the binomial PMF. - - With loc=0: PMF of trials to get one success. - With loc=-1: PMF of trials before first success. - - p: probability of success - high: upper bound where PMF is truncated - """ - pmf = Pmf() - for k in range(high): - pmf[k] = stats.geom.pmf(k, p, loc=loc) - pmf.Normalize() - return pmf - - -def EvalHypergeomPmf(k, N, K, n): - """Evaluates the hypergeometric PMF. - - Returns the probabily of k successes in n trials from a population - N with K successes in it. - """ - return stats.hypergeom.pmf(k, N, K, n) - - -def EvalPoissonPmf(k, lam): - """Computes the Poisson PMF. - - k: number of events - lam: parameter lambda in events per unit time - - returns: float probability - """ - return stats.poisson.pmf(k, lam) - - -def MakePoissonPmf(lam, high, step=1): - """Makes a PMF discrete approx to a Poisson distribution. - - lam: parameter lambda in events per unit time - high: upper bound of the Pmf - - returns: normalized Pmf - """ - pmf = Pmf() - for k in range(0, high + 1, step): - p = stats.poisson.pmf(k, lam) - pmf.Set(k, p) - pmf.Normalize() - return pmf - - -def EvalExponentialPdf(x, lam): - """Computes the exponential PDF. - - x: value - lam: parameter lambda in events per unit time - - returns: float probability density - """ - return lam * math.exp(-lam * x) - - -def EvalExponentialCdf(x, lam): - """Evaluates CDF of the exponential distribution with parameter lam.""" - return 1 - math.exp(-lam * x) - - -def MakeExponentialPmf(lam, high, n=200): - """Makes a PMF discrete approx to an exponential distribution. - - lam: parameter lambda in events per unit time - high: upper bound - n: number of values in the Pmf - - returns: normalized Pmf - """ - pmf = Pmf() - for x in np.linspace(0, high, n): - p = EvalExponentialPdf(x, lam) - pmf.Set(x, p) - pmf.Normalize() - return pmf - - -def EvalWeibullPdf(x, lam, k): - """Computes the Weibull PDF. - - x: value - lam: parameter lambda in events per unit time - k: parameter - - returns: float probability density - """ - arg = (x / lam) - return k / lam * arg**(k-1) * np.exp(-arg**k) - - -def EvalWeibullCdf(x, lam, k): - """Evaluates CDF of the Weibull distribution.""" - arg = (x / lam) - return 1 - np.exp(-arg**k) - - -def MakeWeibullPmf(lam, k, high, n=200): - """Makes a PMF discrete approx to a Weibull distribution. - - lam: parameter lambda in events per unit time - k: parameter - high: upper bound - n: number of values in the Pmf - - returns: normalized Pmf - """ - xs = np.linspace(0, high, n) - ps = EvalWeibullPdf(xs, lam, k) - ps[np.isinf(ps)] = 0 - return Pmf(dict(zip(xs, ps))) - - -def EvalParetoPdf(x, xm, alpha): - """Computes the Pareto. - - xm: minimum value (scale parameter) - alpha: shape parameter - - returns: float probability density - """ - return stats.pareto.pdf(x, alpha, scale=xm) - - -def MakeParetoPmf(xm, alpha, high, num=101): - """Makes a PMF discrete approx to a Pareto distribution. - - xm: minimum value (scale parameter) - alpha: shape parameter - high: upper bound value - num: number of values - - returns: normalized Pmf - """ - xs = np.linspace(xm, high, num) - ps = stats.pareto.pdf(xs, alpha, scale=xm) - pmf = Pmf(dict(zip(xs, ps))) - return pmf - -def StandardNormalCdf(x): - """Evaluates the CDF of the standard Normal distribution. - - See http://en.wikipedia.org/wiki/Normal_distribution - #Cumulative_distribution_function - - Args: - x: float - - Returns: - float - """ - return (math.erf(x / ROOT2) + 1) / 2 - - -def EvalNormalCdf(x, mu=0, sigma=1): - """Evaluates the CDF of the normal distribution. - - Args: - x: float - - mu: mean parameter - - sigma: standard deviation parameter - - Returns: - float - """ - return stats.norm.cdf(x, loc=mu, scale=sigma) - - -def EvalNormalCdfInverse(p, mu=0, sigma=1): - """Evaluates the inverse CDF of the normal distribution. - - See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function - - Args: - p: float - - mu: mean parameter - - sigma: standard deviation parameter - - Returns: - float - """ - return stats.norm.ppf(p, loc=mu, scale=sigma) - - -def EvalLognormalCdf(x, mu=0, sigma=1): - """Evaluates the CDF of the lognormal distribution. - - x: float or sequence - mu: mean parameter - sigma: standard deviation parameter - - Returns: float or sequence - """ - return stats.lognorm.cdf(x, loc=mu, scale=sigma) - - -def RenderExpoCdf(lam, low, high, n=101): - """Generates sequences of xs and ps for an exponential CDF. - - lam: parameter - low: float - high: float - n: number of points to render - - returns: numpy arrays (xs, ps) - """ - xs = np.linspace(low, high, n) - ps = 1 - np.exp(-lam * xs) - #ps = stats.expon.cdf(xs, scale=1.0/lam) - return xs, ps - - -def RenderNormalCdf(mu, sigma, low, high, n=101): - """Generates sequences of xs and ps for a Normal CDF. - - mu: parameter - sigma: parameter - low: float - high: float - n: number of points to render - - returns: numpy arrays (xs, ps) - """ - xs = np.linspace(low, high, n) - ps = stats.norm.cdf(xs, mu, sigma) - return xs, ps - - -def RenderParetoCdf(xmin, alpha, low, high, n=50): - """Generates sequences of xs and ps for a Pareto CDF. - - xmin: parameter - alpha: parameter - low: float - high: float - n: number of points to render - - returns: numpy arrays (xs, ps) - """ - if low < xmin: - low = xmin - xs = np.linspace(low, high, n) - ps = 1 - (xs / xmin) ** -alpha - #ps = stats.pareto.cdf(xs, scale=xmin, b=alpha) - return xs, ps - - -class Beta: - """Represents a Beta distribution. - - See http://en.wikipedia.org/wiki/Beta_distribution - """ - def __init__(self, alpha=1, beta=1, label=None): - """Initializes a Beta distribution.""" - self.alpha = alpha - self.beta = beta - self.label = label if label is not None else '_nolegend_' - - def Update(self, data): - """Updates a Beta distribution. - - data: pair of int (heads, tails) - """ - heads, tails = data - self.alpha += heads - self.beta += tails - - def Mean(self): - """Computes the mean of this distribution.""" - return self.alpha / (self.alpha + self.beta) - - def MAP(self): - """Computes the value with maximum a posteori probability.""" - a = self.alpha - 1 - b = self.beta - 1 - return a / (a + b) - - def Random(self): - """Generates a random variate from this distribution.""" - return random.betavariate(self.alpha, self.beta) - - def Sample(self, n): - """Generates a random sample from this distribution. - - n: int sample size - """ - size = n, - return np.random.beta(self.alpha, self.beta, size) - - def EvalPdf(self, x): - """Evaluates the PDF at x.""" - return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1) - - def MakePmf(self, steps=101, label=None): - """Returns a Pmf of this distribution. - - Note: Normally, we just evaluate the PDF at a sequence - of points and treat the probability density as a probability - mass. - - But if alpha or beta is less than one, we have to be - more careful because the PDF goes to infinity at x=0 - and x=1. In that case we evaluate the CDF and compute - differences. - - The result is a little funny, because the values at 0 and 1 - are not symmetric. Nevertheless, it is a reasonable discrete - model of the continuous distribution, and behaves well as - the number of values increases. - """ - if label is None and self.label is not None: - label = self.label - - if self.alpha < 1 or self.beta < 1: - cdf = self.MakeCdf() - pmf = cdf.MakePmf() - return pmf - - xs = [i / (steps - 1.0) for i in range(steps)] - probs = [self.EvalPdf(x) for x in xs] - pmf = Pmf(dict(zip(xs, probs)), label=label) - return pmf - - def MakeCdf(self, steps=101): - """Returns the CDF of this distribution.""" - xs = [i / (steps - 1.0) for i in range(steps)] - ps = special.betainc(self.alpha, self.beta, xs) - cdf = Cdf(xs, ps) - return cdf - - def Percentile(self, ps): - """Returns the given percentiles from this distribution. - - ps: scalar, array, or list of [0-100] - """ - ps = np.asarray(ps) / 100 - xs = special.betaincinv(self.alpha, self.beta, ps) - return xs - - -class Dirichlet(object): - """Represents a Dirichlet distribution. - - See http://en.wikipedia.org/wiki/Dirichlet_distribution - """ - - def __init__(self, n, conc=1, label=None): - """Initializes a Dirichlet distribution. - - n: number of dimensions - conc: concentration parameter (smaller yields more concentration) - label: string label - """ - if n < 2: - raise ValueError('A Dirichlet distribution with ' - 'n<2 makes no sense') - - self.n = n - self.params = np.ones(n, dtype=np.float) * conc - self.label = label if label is not None else '_nolegend_' - - def Update(self, data): - """Updates a Dirichlet distribution. - - data: sequence of observations, in order corresponding to params - """ - m = len(data) - self.params[:m] += data - - def Random(self): - """Generates a random variate from this distribution. - - Returns: normalized vector of fractions - """ - p = np.random.gamma(self.params) - return p / p.sum() - - def Likelihood(self, data): - """Computes the likelihood of the data. - - Selects a random vector of probabilities from this distribution. - - Returns: float probability - """ - m = len(data) - if self.n < m: - return 0 - - x = data - p = self.Random() - q = p[:m] ** x - return q.prod() - - def LogLikelihood(self, data): - """Computes the log likelihood of the data. - - Selects a random vector of probabilities from this distribution. - - Returns: float log probability - """ - m = len(data) - if self.n < m: - return float('-inf') - - x = self.Random() - y = np.log(x[:m]) * data - return y.sum() - - def MarginalBeta(self, i): - """Computes the marginal distribution of the ith element. - - See http://en.wikipedia.org/wiki/Dirichlet_distribution - #Marginal_distributions - - i: int - - Returns: Beta object - """ - alpha0 = self.params.sum() - alpha = self.params[i] - return Beta(alpha, alpha0 - alpha) - - def PredictivePmf(self, xs, label=None): - """Makes a predictive distribution. - - xs: values to go into the Pmf - - Returns: Pmf that maps from x to the mean prevalence of x - """ - alpha0 = self.params.sum() - ps = self.params / alpha0 - return Pmf(zip(xs, ps), label=label) - - -def BinomialCoef(n, k): - """Compute the binomial coefficient "n choose k". - - n: number of trials - k: number of successes - - Returns: float - """ - return scipy.misc.comb(n, k) - - -def LogBinomialCoef(n, k): - """Computes the log of the binomial coefficient. - - http://math.stackexchange.com/questions/64716/ - approximating-the-logarithm-of-the-binomial-coefficient - - n: number of trials - k: number of successes - - Returns: float - """ - return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k) - - -def NormalProbability(ys, jitter=0): - """Generates data for a normal probability plot. - - ys: sequence of values - jitter: float magnitude of jitter added to the ys - - returns: numpy arrays xs, ys - """ - n = len(ys) - xs = np.random.normal(0, 1, n) - xs.sort() - - if jitter: - ys = Jitter(ys, jitter) - else: - ys = np.array(ys) - ys.sort() - - return xs, ys - - -def Jitter(values, jitter=0.5): - """Jitters the values by adding a uniform variate in (-jitter, jitter). - - values: sequence - jitter: scalar magnitude of jitter - - returns: new numpy array - """ - n = len(values) - return np.random.normal(0, jitter, n) + values - - -def NormalProbabilityPlot(sample, fit_color='0.8', **options): - """Makes a normal probability plot with a fitted line. - - sample: sequence of numbers - fit_color: color string for the fitted line - options: passed along to Plot - """ - xs, ys = NormalProbability(sample) - mean, var = MeanVar(sample) - std = math.sqrt(var) - - fit = FitLine(xs, mean, std) - thinkplot.Plot(*fit, color=fit_color, label='model') - - xs, ys = NormalProbability(sample) - thinkplot.Plot(xs, ys, **options) - - -def Mean(xs): - """Computes mean. - - xs: sequence of values - - returns: float mean - """ - return np.mean(xs) - - -def Var(xs, mu=None, ddof=0): - """Computes variance. - - xs: sequence of values - mu: option known mean - ddof: delta degrees of freedom - - returns: float - """ - xs = np.asarray(xs) - - if mu is None: - mu = xs.mean() - - ds = xs - mu - return np.dot(ds, ds) / (len(xs) - ddof) - - -def Std(xs, mu=None, ddof=0): - """Computes standard deviation. - - xs: sequence of values - mu: option known mean - ddof: delta degrees of freedom - - returns: float - """ - var = Var(xs, mu, ddof) - return math.sqrt(var) - - -def MeanVar(xs, ddof=0): - """Computes mean and variance. - - Based on http://stackoverflow.com/questions/19391149/ - numpy-mean-and-variance-from-single-function - - xs: sequence of values - ddof: delta degrees of freedom - - returns: pair of float, mean and var - """ - xs = np.asarray(xs) - mean = xs.mean() - s2 = Var(xs, mean, ddof) - return mean, s2 - - -def Trim(t, p=0.01): - """Trims the largest and smallest elements of t. - - Args: - t: sequence of numbers - p: fraction of values to trim off each end - - Returns: - sequence of values - """ - n = int(p * len(t)) - t = sorted(t)[n:-n] - return t - - -def TrimmedMean(t, p=0.01): - """Computes the trimmed mean of a sequence of numbers. - - Args: - t: sequence of numbers - p: fraction of values to trim off each end - - Returns: - float - """ - t = Trim(t, p) - return Mean(t) - - -def TrimmedMeanVar(t, p=0.01): - """Computes the trimmed mean and variance of a sequence of numbers. - - Side effect: sorts the list. - - Args: - t: sequence of numbers - p: fraction of values to trim off each end - - Returns: - float - """ - t = Trim(t, p) - mu, var = MeanVar(t) - return mu, var - - -def CohenEffectSize(group1, group2): - """Compute Cohen's d. - - group1: Series or NumPy array - group2: Series or NumPy array - - returns: float - """ - diff = group1.mean() - group2.mean() - - n1, n2 = len(group1), len(group2) - var1 = group1.var() - var2 = group2.var() - - pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) - d = diff / math.sqrt(pooled_var) - return d - - -def Cov(xs, ys, meanx=None, meany=None): - """Computes Cov(X, Y). - - Args: - xs: sequence of values - ys: sequence of values - meanx: optional float mean of xs - meany: optional float mean of ys - - Returns: - Cov(X, Y) - """ - xs = np.asarray(xs) - ys = np.asarray(ys) - - if meanx is None: - meanx = np.mean(xs) - if meany is None: - meany = np.mean(ys) - - cov = np.dot(xs-meanx, ys-meany) / len(xs) - return cov - - -def Corr(xs, ys): - """Computes Corr(X, Y). - - Args: - xs: sequence of values - ys: sequence of values - - Returns: - Corr(X, Y) - """ - xs = np.asarray(xs) - ys = np.asarray(ys) - - meanx, varx = MeanVar(xs) - meany, vary = MeanVar(ys) - - corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary) - - return corr - - -def SerialCorr(series, lag=1): - """Computes the serial correlation of a series. - - series: Series - lag: integer number of intervals to shift - - returns: float correlation - """ - xs = series[lag:] - ys = series.shift(lag)[lag:] - corr = Corr(xs, ys) - return corr - - -def SpearmanCorr(xs, ys): - """Computes Spearman's rank correlation. - - Args: - xs: sequence of values - ys: sequence of values - - Returns: - float Spearman's correlation - """ - xranks = pandas.Series(xs).rank() - yranks = pandas.Series(ys).rank() - return Corr(xranks, yranks) - - -def MapToRanks(t): - """Returns a list of ranks corresponding to the elements in t. - - Args: - t: sequence of numbers - - Returns: - list of integer ranks, starting at 1 - """ - # pair up each value with its index - pairs = enumerate(t) - - # sort by value - sorted_pairs = sorted(pairs, key=itemgetter(1)) - - # pair up each pair with its rank - ranked = enumerate(sorted_pairs) - - # sort by index - resorted = sorted(ranked, key=lambda trip: trip[1][0]) - - # extract the ranks - ranks = [trip[0]+1 for trip in resorted] - return ranks - - -def LeastSquares(xs, ys): - """Computes a linear least squares fit for ys as a function of xs. - - Args: - xs: sequence of values - ys: sequence of values - - Returns: - tuple of (intercept, slope) - """ - meanx, varx = MeanVar(xs) - meany = Mean(ys) - - slope = Cov(xs, ys, meanx, meany) / varx - inter = meany - slope * meanx - - return inter, slope - - -def FitLine(xs, inter, slope): - """Fits a line to the given data. - - xs: sequence of x - - returns: tuple of numpy arrays (sorted xs, fit ys) - """ - fit_xs = np.sort(xs) - fit_ys = inter + slope * fit_xs - return fit_xs, fit_ys - - -def Residuals(xs, ys, inter, slope): - """Computes residuals for a linear fit with parameters inter and slope. - - Args: - xs: independent variable - ys: dependent variable - inter: float intercept - slope: float slope - - Returns: - list of residuals - """ - xs = np.asarray(xs) - ys = np.asarray(ys) - res = ys - (inter + slope * xs) - return res - - -def CoefDetermination(ys, res): - """Computes the coefficient of determination (R^2) for given residuals. - - Args: - ys: dependent variable - res: residuals - - Returns: - float coefficient of determination - """ - return 1 - Var(res) / Var(ys) - - -def CorrelatedGenerator(rho): - """Generates standard normal variates with serial correlation. - - rho: target coefficient of correlation - - Returns: iterable - """ - x = random.gauss(0, 1) - yield x - - sigma = math.sqrt(1 - rho**2) - while True: - x = random.gauss(x * rho, sigma) - yield x - - -def CorrelatedNormalGenerator(mu, sigma, rho): - """Generates normal variates with serial correlation. - - mu: mean of variate - sigma: standard deviation of variate - rho: target coefficient of correlation - - Returns: iterable - """ - for x in CorrelatedGenerator(rho): - yield x * sigma + mu - - -def RawMoment(xs, k): - """Computes the kth raw moment of xs. - """ - return sum(x**k for x in xs) / len(xs) - - -def CentralMoment(xs, k): - """Computes the kth central moment of xs. - """ - mean = RawMoment(xs, 1) - return sum((x - mean)**k for x in xs) / len(xs) - - -def StandardizedMoment(xs, k): - """Computes the kth standardized moment of xs. - """ - var = CentralMoment(xs, 2) - std = math.sqrt(var) - return CentralMoment(xs, k) / std**k - - -def Skewness(xs): - """Computes skewness. - """ - return StandardizedMoment(xs, 3) - - -def Median(xs): - """Computes the median (50th percentile) of a sequence. - - xs: sequence or anything else that can initialize a Cdf - - returns: float - """ - cdf = Cdf(xs) - return cdf.Value(0.5) - - -def IQR(xs): - """Computes the interquartile of a sequence. - - xs: sequence or anything else that can initialize a Cdf - - returns: pair of floats - """ - cdf = Cdf(xs) - return cdf.Value(0.25), cdf.Value(0.75) - - -def PearsonMedianSkewness(xs): - """Computes the Pearson median skewness. - """ - median = Median(xs) - mean = RawMoment(xs, 1) - var = CentralMoment(xs, 2) - std = math.sqrt(var) - gp = 3 * (mean - median) / std - return gp - - -class FixedWidthVariables(object): - """Represents a set of variables in a fixed width file.""" - - def __init__(self, variables, index_base=0): - """Initializes. - - variables: DataFrame - index_base: are the indices 0 or 1 based? - - Attributes: - colspecs: list of (start, end) index tuples - names: list of string variable names - """ - self.variables = variables - - # note: by default, subtract 1 from colspecs - self.colspecs = variables[['start', 'end']] - index_base - - # convert colspecs to a list of pair of int - self.colspecs = self.colspecs.astype(np.int).values.tolist() - self.names = variables['name'] - - def ReadFixedWidth(self, filename, **options): - """Reads a fixed width ASCII file. - - filename: string filename - - returns: DataFrame - """ - df = pandas.read_fwf(filename, - colspecs=self.colspecs, - names=self.names, - **options) - return df - - -def ReadStataDct(dct_file, **options): - """Reads a Stata dictionary file. - - dct_file: string filename - options: dict of options passed to open() - - returns: FixedWidthVariables object - """ - type_map = dict(byte=int, int=int, long=int, float=float, - double=float, numeric=float) - - var_info = [] - with open(dct_file, **options) as f: - for line in f: - match = re.search( r'_column\(([^)]*)\)', line) - if not match: - continue - start = int(match.group(1)) - t = line.split() - vtype, name, fstring = t[1:4] - name = name.lower() - if vtype.startswith('str'): - vtype = str - else: - vtype = type_map[vtype] - long_desc = ' '.join(t[4:]).strip('"') - var_info.append((start, vtype, name, fstring, long_desc)) - - columns = ['start', 'type', 'name', 'fstring', 'desc'] - variables = pandas.DataFrame(var_info, columns=columns) - - # fill in the end column by shifting the start column - variables['end'] = variables.start.shift(-1) - variables.loc[len(variables)-1, 'end'] = -1 - - dct = FixedWidthVariables(variables, index_base=1) - return dct - - -def Resample(xs, n=None): - """Draw a sample from xs with the same length as xs. - - xs: sequence - n: sample size (default: len(xs)) - - returns: NumPy array - """ - if n is None: - n = len(xs) - return np.random.choice(xs, n, replace=True) - - -def SampleRows(df, nrows, replace=False): - """Choose a sample of rows from a DataFrame. - - df: DataFrame - nrows: number of rows - replace: whether to sample with replacement - - returns: DataDf - """ - indices = np.random.choice(df.index, nrows, replace=replace) - sample = df.loc[indices] - return sample - - -def ResampleRows(df): - """Resamples rows from a DataFrame. - - df: DataFrame - - returns: DataFrame - """ - return SampleRows(df, len(df), replace=True) - - -def ResampleRowsWeighted(df, column='finalwgt'): - """Resamples a DataFrame using probabilities proportional to given column. - - df: DataFrame - column: string column name to use as weights - - returns: DataFrame - """ - weights = df[column].copy() - weights /= sum(weights) - indices = np.random.choice(df.index, len(df), replace=True, p=weights) - sample = df.loc[indices] - return sample - - -def PercentileRow(array, p): - """Selects the row from a sorted array that maps to percentile p. - - p: float 0--100 - - returns: NumPy array (one row) - """ - rows, cols = array.shape - index = int(rows * p / 100) - return array[index,] - - -def PercentileRows(ys_seq, percents): - """Given a collection of lines, selects percentiles along vertical axis. - - For example, if ys_seq contains simulation results like ys as a - function of time, and percents contains (5, 95), the result would - be a 90% CI for each vertical slice of the simulation results. - - ys_seq: sequence of lines (y values) - percents: list of percentiles (0-100) to select - - returns: list of NumPy arrays, one for each percentile - """ - nrows = len(ys_seq) - ncols = len(ys_seq[0]) - array = np.zeros((nrows, ncols)) - - for i, ys in enumerate(ys_seq): - array[i,] = ys - - array = np.sort(array, axis=0) - - rows = [PercentileRow(array, p) for p in percents] - return rows - - -def Smooth(xs, sigma=2, **options): - """Smooths a NumPy array with a Gaussian filter. - - xs: sequence - sigma: standard deviation of the filter - """ - return ndimage.filters.gaussian_filter1d(xs, sigma, **options) - - -class HypothesisTest(object): - """Represents a hypothesis test.""" - - def __init__(self, data): - """Initializes. - - data: data in whatever form is relevant - """ - self.data = data - self.MakeModel() - self.actual = self.TestStatistic(data) - self.test_stats = None - self.test_cdf = None - - def PValue(self, iters=1000): - """Computes the distribution of the test statistic and p-value. - - iters: number of iterations - - returns: float p-value - """ - self.test_stats = [self.TestStatistic(self.RunModel()) - for _ in range(iters)] - self.test_cdf = Cdf(self.test_stats) - - count = sum(1 for x in self.test_stats if x >= self.actual) - return count / iters - - def MaxTestStat(self): - """Returns the largest test statistic seen during simulations. - """ - return max(self.test_stats) - - def PlotCdf(self, label=None): - """Draws a Cdf with vertical lines at the observed test stat. - """ - def VertLine(x): - """Draws a vertical line at x.""" - thinkplot.Plot([x, x], [0, 1], color='0.8') - - VertLine(self.actual) - thinkplot.Cdf(self.test_cdf, label=label) - - def TestStatistic(self, data): - """Computes the test statistic. - - data: data in whatever form is relevant - """ - raise UnimplementedMethodException() - - def MakeModel(self): - """Build a model of the null hypothesis. - """ - pass - - def RunModel(self): - """Run the model of the null hypothesis. - - returns: simulated data - """ - raise UnimplementedMethodException() - - -def main(): - pass - - -if __name__ == '__main__': - main()