diff --git a/.coveragerc b/.coveragerc index 3f381eb..468c02e 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,2 +1,2 @@ [run] -omit = sparsity/test/*, */__init__.py \ No newline at end of file +omit = sparsity/test/*, */__init__.py, */_version.py \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e678b57 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +sparsity/_version.py export-subst diff --git a/.gitignore b/.gitignore index ed66943..da4c630 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ build/ *.so traildb_sparse.c __pycache__ +*.egg-info diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..54be32b --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include versioneer.py +include sparsity/_version.py diff --git a/circle.yml b/circle.yml index c86a9a2..8cfa0ac 100644 --- a/circle.yml +++ b/circle.yml @@ -11,10 +11,11 @@ dependencies: - pip install numpy cython 2>&1 - pip install pytest pytest-cov - pip install -v scipy pandas - - pip install dask[dataframe] + - pip install dask[dataframe] moto test: override: - - pip install -e . - - py.test --cov sparsity --cov-report xml sparsity/test + - pip install . 
+ - python -c 'import sparsity' + - py.test --cov sparsity --cov-report xml sparsity post: - bash <(curl -s https://codecov.io/bash) \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..275e8e5 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[versioneer] +VCS=git +style=pep440 +versionfile_source=sparsity/_version.py +versionfile_build=sparsity/_version.py +tag_prefix=v diff --git a/setup.py b/setup.py index be9ee18..7ffa576 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +import os +import versioneer from distutils.core import setup, Extension from setuptools import find_packages @@ -15,17 +17,26 @@ ext_modules = cythonize([ext]) except (ImportError, OSError): ext_modules = None + +packages = find_packages() +packages.remove('sparsity.test') + setup( name='sparsity', - version='0.5.1', + version=versioneer.get_version(), ext_modules = ext_modules, author='Alan Hoeng', author_email='alan.f.hoeng@gmail.com', - packages=find_packages(), + packages=packages, + cmdclass=versioneer.get_cmdclass(), install_requires=[ - 'pandas>=0.19.2', + 'pandas>=0.19.0', 'scipy>=0.18.1', - 'numpy>=1.12.0' + 'numpy>=1.12.0', + 's3fs>=0.1.0' ], + test_requires=[ + 'moto' + ], zip_safe=False ) \ No newline at end of file diff --git a/sparsity/__init__.py b/sparsity/__init__.py index 7983873..280cc31 100644 --- a/sparsity/__init__.py +++ b/sparsity/__init__.py @@ -1 +1,4 @@ from sparsity.sparse_frame import SparseFrame, sparse_one_hot +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions diff --git a/sparsity/_version.py b/sparsity/_version.py new file mode 100644 index 0000000..0f6e225 --- /dev/null +++ b/sparsity/_version.py @@ -0,0 +1,520 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). 
Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "None" + cfg.versionfile_source = "sparsity/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert 
isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. 
When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. 
We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/sparsity/dask/core.py b/sparsity/dask/core.py index 617e5df..7d7c07b 100644 --- a/sparsity/dask/core.py +++ b/sparsity/dask/core.py @@ -1,19 +1,27 @@ -from scipy import sparse - import dask +import dask.dataframe as dd +import numpy as np import pandas as pd from dask import threaded from dask.base import normalize_token, tokenize -from dask.dataframe.utils import make_meta as dd_make_meta, _nonempty_index +from dask.dataframe import methods +from dask.dataframe.core import (Scalar, Series, _emulate, _extract_meta, + _Frame, _maybe_from_pandas, apply, funcname, + no_default, partial, partial_by_order) +from dask.dataframe.utils import make_meta as dd_make_meta +from dask.dataframe.utils import _nonempty_index from dask.delayed import Delayed from dask.optimize import cull -from toolz import merge +from scipy import sparse +from toolz import merge, remove import sparsity as sp from sparsity.dask.indexing import _LocIndexer def _make_meta(inp): + if isinstance(inp, sp.SparseFrame) and inp.empty: + return inp if isinstance(inp, sp.SparseFrame): return inp.iloc[:0] else: @@ -65,12 +73,32 @@ def npartitions(self): def _meta_nonempty(self): return _meta_nonempty(self._meta) + @property + def columns(self): + return self._meta.columns + + @property + def index(self): + return self._meta.index + def map_partitions(self, func, 
meta, *args, **kwargs): return map_partitions(func, self, meta, *args, **kwargs) def to_delayed(self): return [Delayed(k, self.dask) for k in self._keys()] + def assign(self, **kwargs): + for k, v in kwargs.items(): + if not (isinstance(v, (Series, Scalar, pd.Series)) or + np.isscalar(v)): + raise TypeError("Column assignment doesn't support type " + "{0}".format(type(v).__name__)) + pairs = list(sum(kwargs.items(), ())) + + # Figure out columns of the output + df2 = self._meta.assign(**_extract_meta(kwargs)) + return elemwise(methods.assign, self, *pairs, meta=df2) + def _keys(self): return [(self._name, i) for i in range(self.npartitions)] @@ -110,6 +138,79 @@ def __repr__(self): ) +def is_broadcastable(dfs, s): + """ + This Series is broadcastable against another dataframe in the sequence + """ + return (isinstance(s, Series) and + s.npartitions == 1 and + s.known_divisions and + any(s.divisions == (min(df.columns), max(df.columns)) + for df in dfs if isinstance(df, (SparseFrame, dd.DataFrame)))) + + +def elemwise(op, *args, **kwargs): + """ Elementwise operation for dask.Sparseframes + + Parameters + ---------- + op: function + Function that takes as first parameter the underlying df + args: + Contains Dataframes + kwargs: + Contains meta. + """ + meta = kwargs.pop('meta', no_default) + + _name = funcname(op) + '-' + tokenize(op, kwargs, *args) + + # if pd.Series or pd.DataFrame change to dd.DataFrame + args = _maybe_from_pandas(args) + + # Align DataFrame blocks if divisions are different. 
+ from .multi import _maybe_align_partitions # to avoid cyclical import + args = _maybe_align_partitions(args) + + # extract all dask instances + dasks = [arg for arg in args if isinstance(arg, (SparseFrame, _Frame, + Scalar))] + # extract all dask frames + dfs = [df for df in dasks if isinstance(df, (_Frame, SparseFrame))] + + # We take divisions from the first dask frame + divisions = dfs[0].divisions + + _is_broadcastable = partial(is_broadcastable, dfs) + dfs = list(remove(_is_broadcastable, dfs)) + n = len(divisions) - 1 + + other = [(i, arg) for i, arg in enumerate(args) + if not isinstance(arg, (_Frame, Scalar, SparseFrame))] + + # Get dsks graph tuple keys and adjust the key length of Scalar + keys = [d._keys() * n if isinstance(d, Scalar) or _is_broadcastable(d) + else d._keys() for d in dasks] + + if other: + dsk = {(_name, i): + (apply, partial_by_order, list(frs), + {'function': op, 'other': other}) + for i, frs in enumerate(zip(*keys))} + else: + dsk = {(_name, i): (op,) + frs for i, frs in enumerate(zip(*keys))} + dsk = merge(dsk, *[d.dask for d in dasks]) + + if meta is no_default: + if len(dfs) >= 2 and len(dasks) != len(dfs): + # should not occur in current funcs + msg = 'elemwise with 2 or more DataFrames and Scalar is not supported' + raise NotImplementedError(msg) + meta = _emulate(op, *args, **kwargs) + + return SparseFrame(dsk, _name, meta, divisions) + + def map_partitions(func, ddf, meta, **kwargs): dsk = {} name = func.__name__ diff --git a/sparsity/dask/multi.py b/sparsity/dask/multi.py new file mode 100644 index 0000000..fde6e04 --- /dev/null +++ b/sparsity/dask/multi.py @@ -0,0 +1,80 @@ +from sparsity.dask.core import SparseFrame +from functools import partial +from dask.dataframe.core import is_broadcastable, _Frame +from toolz import unique, merge_sorted + + +def align_partitions(*dfs): + """ Mutually partition and align DataFrame blocks + + This serves as precursor to multi-dataframe operations like join, concat, + or merge. 
+ + Parameters + ---------- + dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar + Sequence of dataframes to be aligned on their index + + Returns + ------- + dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar + These must have consistent divisions with each other + divisions: tuple + Full divisions sequence of the entire result + result: list + A list of lists of keys that show which data exist on which + divisions + """ + _is_broadcastable = partial(is_broadcastable, dfs) + dfs1 = [df for df in dfs + if isinstance(df, (_Frame, SparseFrame)) and + not _is_broadcastable(df)] + if len(dfs) == 0: + raise ValueError("dfs contains no DataFrame and Series") + if not all(df.known_divisions for df in dfs1): + raise ValueError("Not all divisions are known, can't align " + "partitions. Please use `set_index` or " + "`set_partition` to set the index.") + + divisions = list(unique(merge_sorted(*[df.divisions for df in dfs1]))) + dfs2 = [df.repartition(divisions, force=True) + if isinstance(df, (_Frame, SparseFrame)) else df for df in dfs] + + result = list() + inds = [0 for df in dfs] + for d in divisions[:-1]: + L = list() + for i, df in enumerate(dfs2): + if isinstance(df, (_Frame, SparseFrame)): + j = inds[i] + divs = df.divisions + if j < len(divs) - 1 and divs[j] == d: + L.append((df._name, inds[i])) + inds[i] += 1 + else: + L.append(None) + else: # Scalar has no divisions + L.append(None) + result.append(L) + return dfs2, tuple(divisions), result + + +def _maybe_align_partitions(args): + """Align DataFrame blocks if divisions are different. + + Note that if all divisions are unknown, but have equal npartitions, then + they will be passed through unchanged. 
This is different than + `align_partitions`, which will fail if divisions aren't all known""" + _is_broadcastable = partial(is_broadcastable, args) + dfs = [df for df in args + if isinstance(df, (_Frame, SparseFrame)) and + not _is_broadcastable(df)] + if not dfs: + return args + + divisions = dfs[0].divisions + if not all(df.divisions == divisions for df in dfs): + dfs2 = iter(align_partitions(*dfs)[0]) + return [a if not isinstance(a, (_Frame, SparseFrame)) + else next(dfs2) for a in args] + return args \ No newline at end of file diff --git a/sparsity/dask/reshape.py b/sparsity/dask/reshape.py index 9c5a5ef..bfb686c 100644 --- a/sparsity/dask/reshape.py +++ b/sparsity/dask/reshape.py @@ -1,41 +1,84 @@ +import warnings +from collections import OrderedDict + +import numpy as np + import sparsity as sp from sparsity import sparse_one_hot from sparsity.dask import SparseFrame -import pandas as pd -import numpy as np -def one_hot_encode(ddf, column, - categories, index_col): + +def one_hot_encode(ddf, column=None, categories=None, index_col=None, + order=None, prefixes=False): """ - Sparse one hot encoding of dask.DataFrame + Sparse one hot encoding of dask.DataFrame. - Convert a dask.DataFrame into a series of SparseFrames. By one hot - encoding a single column + Convert a dask.DataFrame into a series of SparseFrames by one-hot + encoding specified columns. Parameters ---------- ddf: dask.DataFrame e.g. the clickstream - column: str - column name to one hot encode in with SparseFrame - categories: iterable - possible category values - index_col: str, iterable + categories: dict + Maps column name -> iterable of possible category values. + See description of `order`. + index_col: str | iterable which columns to use as index + order: iterable + Specify order in which one-hot encoded columns should be aligned. 
+ + If `order = [col_name1, col_name2]` + and `categories = {col_name1: ['A', 'B'], col_name2: ['C', 'D']}`, + then the resulting SparseFrame will have columns + `['A', 'B', 'C', 'D']`. + + If you don't specify order, then output columns' order depends on + iteration over `categories` dictionary. You can pass `categories` + as an OrderedDict instead of providing `order` explicitly. + prefixes: bool + If False, column names will be the same as categories, + so that new columns will be named like: + [cat11, cat12, cat21, cat22, ...]. + + If True, original column name followed by an underscore will be added + in front of each category name, so that new columns will be named like: + [col1_cat11, col1_cat12, col2_cat21, col2_cat22, ...]. + column: DEPRECATED + Kept only for backward compatibility. Returns ------- - sparse_one_hot: dask.Series + sparse_one_hot: sparsity.dask.SparseFrame """ + if column is not None: + warnings.warn( + '`column` argument of sparsity.dask.reshape.one_hot_encode ' + 'function is deprecated.' 
+ ) + if order is not None: + raise ValueError('`order` and `column` arguments cannot be used ' + 'together.') + categories = {column: categories} + idx_meta = ddf._meta.reset_index().set_index(index_col).index[:0] \ if index_col else ddf._meta.index - meta = sp.SparseFrame(np.array([]), columns=categories, - index=idx_meta) + + if order is not None: + categories = OrderedDict([(column, categories[column]) + for column in order]) + + columns = sparse_one_hot(ddf._meta, + categories=categories, + index_col=index_col, + prefixes=prefixes).columns + meta = sp.SparseFrame(np.array([]), columns=columns, + index=idx_meta) dsf = ddf.map_partitions(sparse_one_hot, - column=column, categories=categories, index_col=index_col, + prefixes=prefixes, meta=object) - return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions) \ No newline at end of file + return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions) diff --git a/sparsity/io.py b/sparsity/io.py index 35d8fb7..83d07d3 100644 --- a/sparsity/io.py +++ b/sparsity/io.py @@ -1,4 +1,8 @@ +from io import BytesIO + import numpy as np +import pandas as pd +from s3fs import S3FileSystem from scipy import sparse try: @@ -23,19 +27,43 @@ def traildb_to_coo(db, fieldname): return uuids, timestamps, cols,\ sparse.coo_matrix((np.ones(num_events), (r_idx, c_idx))) + def to_npz(sf, filename): data = _csr_to_dict(sf.data) + data['metadata'] = \ + {'multiindex': True if isinstance(sf.index, pd.MultiIndex) else False} data['frame_index'] = sf.index.values data['frame_columns'] = sf.columns.values - np.savez(filename, **data) + if not filename.endswith('.npz'): + filename += '.npz' + if not filename.startswith('s3://'): + fp = open(filename, 'wb') + np.savez(fp, **data) + else: + _save_npz_s3(data, filename) + + +def _save_npz_s3(data, filename): + buffer = BytesIO() + np.savez(buffer, **data) + buffer.seek(0) + fs = S3FileSystem() + fp = fs.open(filename, 'wb') + fp.write(buffer.read()) + def read_npz(filename): - loader = 
np.load(filename) + open_f = open if not filename.startswith('s3://') \ + else S3FileSystem().open + fp = open_f(filename, 'rb') + + loader = np.load(fp) csr_mat = _load_csr(loader) - idx = loader['frame_index'] + idx = _load_idx_from_npz(loader) cols = loader['frame_columns'] return (csr_mat, idx, cols) + def _csr_to_dict(array): return dict(data = array.data ,indices=array.indices, indptr =array.indptr, shape=array.shape) @@ -44,4 +72,24 @@ def _load_csr(loader): return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), - shape=loader['shape']) \ No newline at end of file + shape=loader['shape']) + + +def _load_idx_from_npz(loader): + idx = loader['frame_index'] + try: + if loader['metadata'][()]['multiindex']: + idx = pd.MultiIndex.from_tuples(idx) + except KeyError: + if all(map(lambda x: isinstance(x, tuple), idx)): + idx = pd.MultiIndex.from_tuples(idx) + return idx + + +def _just_read_array(path): + if path.endswith('hdf') or path.endswith('hdf5'): + return pd.read_hdf(path, '/df').values + elif path.endswith('csv'): + return pd.read_csv(path).values + elif path.endswith('pickle'): + return pd.read_pickle(path).values \ No newline at end of file diff --git a/sparsity/sparse_frame.py b/sparsity/sparse_frame.py index d9479e9..2576aaa 100644 --- a/sparsity/sparse_frame.py +++ b/sparsity/sparse_frame.py @@ -1,16 +1,20 @@ # coding=utf-8 import traceback -from functools import partial - -import pandas as pd -import numpy as np import uuid -from functools import reduce +import warnings +from collections import OrderedDict +from functools import partial, reduce -from pandas.core.common import _default_index +import numpy as np +import pandas as pd from pandas.api import types -from pandas.indexes.base import _ensure_index -from sparsity.io import to_npz, read_npz +from pandas.core.common import _default_index + +try: + from pandas.indexes.base import _ensure_index +except ImportError: + from pandas.core.indexes.base import _ensure_index +from 
sparsity.io import to_npz, read_npz, _just_read_array from scipy import sparse try: @@ -23,7 +27,7 @@ def _is_empty(data): try: - if data.nnz == 0: + if any(map(lambda x: x== 0, data.shape)): return True else: return False @@ -77,6 +81,7 @@ def __init__(self, data, index=None, columns=None, **kwargs): "\nThe error described above occurred while " "converting data to sparse matrix.") else: + self.empty = True if _is_empty(data) else False self._init_csr(data) # register indexers @@ -110,15 +115,22 @@ def todense(self, pandas=True): if self.shape[0] == 1 or self.shape[1] == 1: dense = dense.reshape(-1) - if pandas == True: + + if pandas: if self.empty: dense = pd.DataFrame([], columns=self.columns, index=self._index[:0]) - elif len(dense.shape) == 1: + elif len(dense.shape) == 1 and \ + self.data.shape[1] == 1: dense = pd.Series(dense, index=self.index, name=self.columns[0]) + elif len(dense.shape) == 1 and \ + self.data.shape[1] > 1: + dense = pd.DataFrame(dense.reshape(1, -1), index=self.index, + columns=self.columns) else: - dense = pd.DataFrame(dense, index=self.index, + idx = np.broadcast_to(self.index, dense.shape[0]) + dense = pd.DataFrame(dense, index=idx, columns=self.columns) return dense @@ -146,9 +158,6 @@ def sum(self, *args, **kwargs): def mean(self, *args, **kwargs): return self.data.mean(*args, **kwargs) - def std(self, *args, **kwargs): - return self.data.std(*args, **kwargs) - def max(self, *args, **kwargs): return self.data.max(*args, **kwargs) @@ -159,6 +168,27 @@ def copy(self, *args, **kwargs): return SparseFrame(self.data.copy(*args, **kwargs), self.index.copy(*args, **kwargs), self.columns.copy(*args, **kwargs)) + + def multiply(self, other, axis='columns'): + """ + To multiply row-wise 'other' should be of shape: (self.shape[0], 1) + To multiply col-wise 'other should be of shape: (1, self.shape[1]) + """ + try: + other = other.toarray() + except AttributeError: + pass + + if axis in [0, 'index']: + other = np.asarray(other).reshape(1, -1) + 
else: + other = np.asarray(other).reshape(-1, 1) + + data = self.data.multiply(other) + assert data.shape == self.data.shape, \ + "Data shapes miss-match: {}, {}".format(data.shape,self.data.shape) + return SparseFrame(data, self.index, self.columns) + def nnz(self): return self.data.nnz @@ -173,6 +203,15 @@ def take(self, idx, axis=0, **kwargs): index=self.index, columns=self.columns[idx]) + def _take(self, *args, **kwargs): + """ + This function is to mimic pandas api (0.21.0) + and support indexing. + + See https://github.com/pandas-dev/pandas/commit/458c1dc81b7e6f90180b06179ac91d9ed868cb05 + """ + return self.take(*args, **kwargs) + def _xs(self, key, *args, **kwargs): """Used for label based indexing.""" loc = self.index.get_loc(key) @@ -192,13 +231,23 @@ def data(self): return self._data return self._data[:-1,:] - # backwards comptability + # backwards compatibility def groupby(self, by=None, level=0): return self.groupby_sum(by, level) + def groupby_agg(self, by=None, level=None, agg_func=None): + by = self._get_groupby_col(by, level) + groups = pd.Index(np.arange(self.shape[0])).groupby(by) + res = sparse.csr_matrix((len(groups), self.shape[1])) + new_idx = [] + for i, (name, indizes) in enumerate(groups.items()): + new_idx.append(self.index.values[indizes[0]]) + res[i] = agg_func(self.data[indizes.values,:]) + return SparseFrame(res, index=new_idx) + def groupby_sum(self, by=None, level=0): """ - Sparse groupby sum aggregation. + Optimized sparse groupby sum aggregation. Simple operation using sparse matrix multiplication. Expects result to be sparse aswell. @@ -212,23 +261,37 @@ def groupby_sum(self, by=None, level=0): Returns ------- - df: sparcity.SparseFrame + df: sparsity.SparseFrame Grouped by and summed SparseFrame. 
""" - if by is not None and by is not "index": - assert len(by) == self.data.shape[0] - by = np.array(by) + by = self._get_groupby_col(by, level) + group_idx = by.argsort() + gm = _create_group_matrix(by[group_idx]) + grouped_data = self._data[group_idx, :].T.dot(gm).T + return SparseFrame(grouped_data, index=np.unique(by), columns=self._columns) + + + def _get_groupby_col(self, by, level): + if by is None and level is None: + raise ValueError("You have to supply one of 'by' and 'level'") + if by is not None: + try: + if by in self._columns: + by = self[by].toarray() + except TypeError: + assert len(by) == self.data.shape[0] + by = np.array(by) else: if level and isinstance(self._index, pd.MultiIndex): by = self.index.get_level_values(level).values - elif level: - raise ValueError("Connot use level in a non MultiIndex Frame") + elif level == 0: + by = np.asarray(self._index) + elif level > 0: + raise ValueError( + "Connot use level > 0 in a non MultiIndex Frame") else: by = self.index.values - group_idx = by.argsort() - gm = _create_group_matrix(by[group_idx]) - grouped_data = self._data[group_idx, :].T.dot(gm).T - return SparseFrame(grouped_data, index=np.unique(by), columns=self._columns) + return by def join(self, other, axis=1, how='outer', level=None): """ @@ -251,7 +314,7 @@ def join(self, other, axis=1, how='outer', level=None): """ if isinstance(self._index, pd.MultiIndex)\ or isinstance(other._index, pd.MultiIndex): - raise NotImplementedError() + raise NotImplementedError('MultiIndex not supported.') if not isinstance(other, SparseFrame): other = SparseFrame(other) if axis not in set([0, 1]): @@ -316,7 +379,14 @@ def sort_index(self): index = self._index[passive_sort_idx] return SparseFrame(data, index=index) - def add(self, other, how='outer'): + def fillna(self, value): + """Replace NaN values in explicitly stored data with `value`.""" + _data = self._data.copy() + _data.data[np.isnan(self._data.data)] = value + return SparseFrame(data=_data[:-1, :], + 
index=self.index, columns=self.columns) + + def add(self, other, how='outer', **kwargs): """ Aligned addition. Adds two tables by aligning them first. @@ -451,6 +521,18 @@ def _single_assign(self, key, value): new_cols, new_data = self._add_col(key, value) return SparseFrame(new_data, index=self.index, columns=new_cols) + def drop(self, labels, axis=0): + """Drop label(s) from given axis. Currently works only for columns. + """ + if not isinstance(labels, (list, tuple, set)): + labels = [labels] + if axis == 1: + mask = np.logical_not(self.columns.isin(labels)) + sf = self[self.columns[mask].tolist()] + else: + raise NotImplementedError + return sf + def drop_duplicate_idx(self, **kwargs): """Drop rows with duplicated index.""" mask = ~self.index.duplicated(**kwargs) @@ -464,7 +546,7 @@ def __getitem__(self, item): for key in item: idx.append(self.columns.get_loc(key)) return SparseFrame(self.data[:,idx], index=self.index, - columns=[item]) + columns=item) def dropna(self): """Drop nans from index.""" @@ -564,19 +646,50 @@ def _create_group_matrix(group_idx, dtype='f8'): dtype=dtype).tocsr() -def sparse_one_hot(df, column, categories, dtype='f8', index_col=None): +def sparse_one_hot(df, column=None, categories=None, dtype='f8', + index_col=None, order=None, prefixes=False): """ - One-hot encode a single column of a pandas.DataFrame. + One-hot encode specified columns of a pandas.DataFrame. Returns a SparseFrame. + + See the documentation of :func:`sparsity.dask.reshape.one_hot_encode`. """ - cols, csr = _one_hot_series_csr(categories, dtype, df[column]) + if column is not None: + warnings.warn( + '`column` argument of sparsity.sparse_frame.sparse_one_hot ' + 'function is deprecated.' 
+ ) + if order is not None: + raise ValueError('`order` and `column` arguments cannot be used ' + 'together.') + categories = {column: categories} + + if order is not None: + categories = OrderedDict([(column, categories[column]) + for column in order]) + + new_cols = [] + csrs = [] + for column, column_cat in categories.items(): + if isinstance(column_cat, str): + column_cat = _just_read_array(column_cat) + cols, csr = _one_hot_series_csr(column_cat, dtype, df[column]) + if prefixes: + cols = list(map(lambda x: '{}_{}'.format(column, x), cols)) + new_cols.extend(cols) + csrs.append(csr) + if len(set(new_cols)) < len(new_cols): + raise ValueError('Different columns have same categories. This would ' + 'result in duplicated column names. ' + 'Set `prefix` to True to manage this situation.') + new_data = sparse.hstack(csrs, format='csr') if not isinstance(index_col, list): new_index = df[index_col] if index_col else df.index else: df = df.reset_index() new_index = pd.MultiIndex.from_arrays(df[index_col].values.T) - return SparseFrame(csr, index=new_index, columns=cols) + return SparseFrame(new_data, index=new_index, columns=new_cols) def _one_hot_series_csr(categories, dtype, oh_col): @@ -598,4 +711,4 @@ def _one_hot_series_csr(categories, dtype, oh_col): data = sparse.coo_matrix((data, (row_indices, col_indices)), shape=(n_samples, n_features), dtype=dtype).tocsr() - return cat.categories.values, data \ No newline at end of file + return cat.categories.values, data diff --git a/sparsity/test/__init__.py b/sparsity/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sparsity/test/conftest.py b/sparsity/test/conftest.py index 165ac2b..9752d7f 100644 --- a/sparsity/test/conftest.py +++ b/sparsity/test/conftest.py @@ -1,9 +1,11 @@ import os -import pytest +import shutil +import tempfile +from contextlib import contextmanager import numpy as np import pandas as pd - +import pytest import sparsity @@ -16,7 +18,19 @@ def testdb(): def clickstream(): 
df = pd.DataFrame(dict( page_id=np.random.choice(list('ABCDE'), size=100), + other_categorical=np.random.choice(list('FGHIJ'), size=100), id=np.random.choice([1,2,3,4,5,6,7,8,9], size=100) ), index=pd.date_range("2016-01-01", periods=100)) - return df \ No newline at end of file + return df + + +@contextmanager +def tmpdir(dir=None): + dirname = tempfile.mkdtemp(dir=dir) + + try: + yield dirname + finally: + if os.path.exists(dirname): + shutil.rmtree(dirname, ignore_errors=True) diff --git a/sparsity/test/test_dask_sparse_frame.py b/sparsity/test/test_dask_sparse_frame.py index 4487307..efb167d 100644 --- a/sparsity/test/test_dask_sparse_frame.py +++ b/sparsity/test/test_dask_sparse_frame.py @@ -1,33 +1,26 @@ -import shutil -import tempfile +import datetime as dt import os -from contextlib import contextmanager import dask +import dask.dataframe as dd +import numpy as np +import pandas as pd import pytest - import sparsity as sp import sparsity.dask as dsp -import pandas as pd -import numpy as np -import dask.dataframe as dd - +from dask.async import get_sync +from sparsity import sparse_one_hot from sparsity.dask.reshape import one_hot_encode -dask.context.set_options(get=dask.async.get_sync) - - -@contextmanager -def tmpdir(dir=None): - dirname = tempfile.mkdtemp(dir=dir) +from .conftest import tmpdir - try: - yield dirname - finally: - if os.path.exists(dirname): - shutil.rmtree(dirname, ignore_errors=True) +dask.context.set_options(get=dask.async.get_sync) +@pytest.fixture +def dsf(): + return dsp.from_pandas(pd.DataFrame(np.random.rand(10,2)), + npartitions=3) def test_from_pandas(): dsf = dsp.from_pandas(pd.DataFrame(np.random.rand(10,2)), @@ -59,11 +52,39 @@ def test_loc(iindexer, correct_shape): df = pd.DataFrame(np.random.rand(10, 2), index=list('ABCDEFGHIJ')) dsf = dsp.from_pandas(df, npartitions=2) - res = dsf.loc[iindexer].compute() + fut = dsf.loc[iindexer] + assert fut._meta.empty + res = fut.compute() assert isinstance(res, sp.SparseFrame) assert 
res.shape == correct_shape +def test_dask_loc(clickstream): + sf = dd.from_pandas(clickstream, npartitions=10) \ + .map_partitions( + sparse_one_hot, + categories={'page_id': list('ABCDE')}, + meta=list + ) + + res = sf.loc['2016-01-15':'2016-02-15'] + res = sp.SparseFrame.concat(res.compute(get=get_sync).tolist()) + assert res.index.date.max() == dt.date(2016, 2, 15) + assert res.index.date.min() == dt.date(2016, 1, 15) + + +def test_dask_multi_index_loc(clickstream): + sf = dd.from_pandas(clickstream, npartitions=10) \ + .map_partitions( + sparse_one_hot, + index_col=['index', 'id'], + categories={'page_id': list('ABCDE')}, + meta=list + ) + res = sf.loc['2016-01-15':'2016-02-15'] + res = sp.SparseFrame.vstack(res.compute(get=get_sync).tolist()) + assert res.index.get_level_values(0).date.min() == dt.date(2016, 1, 15) + assert res.index.get_level_values(0).date.max() == dt.date(2016, 2, 15) def test_repr(): dsf = dsp.from_pandas(pd.DataFrame(np.random.rand(10, 2)), @@ -75,16 +96,90 @@ def test_repr(): assert isinstance(dsf.__repr__(), str) -def test_one_hot(clickstream): +def test_one_hot_legacy(clickstream): ddf = dd.from_pandas(clickstream, npartitions=10) - dsf = one_hot_encode(ddf, column='page_id', - categories=list('ABCDE'), - index_col=['index', 'id']) + dsf = one_hot_encode(ddf, 'page_id', list('ABCDE'), ['index', 'id']) + assert dsf._meta.empty sf = dsf.compute() assert sf.shape == (100, 5) assert isinstance(sf.index, pd.MultiIndex) +def test_one_hot_no_order(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + index_col=['index', 'id']) + assert dsf._meta.empty + assert sorted(dsf.columns) == list('ABCDEFGHIJ') + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert sorted(sf.columns) == list('ABCDEFGHIJ') + + +def test_one_hot_prefixes(clickstream): + ddf = 
dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + index_col=['index', 'id'], + prefixes=True) + correct_columns = list(map(lambda x: 'page_id_' + x, list('ABCDE'))) \ + + list(map(lambda x: 'other_categorical_' + x, list('FGHIJ'))) + assert dsf._meta.empty + assert sorted(dsf.columns) == sorted(correct_columns) + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert sorted(sf.columns) == sorted(correct_columns) + + +def test_one_hot_order1(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + order=['page_id', 'other_categorical'], + index_col=['index', 'id']) + assert dsf._meta.empty + assert all(dsf.columns == list('ABCDEFGHIJ')) + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert all(sf.columns == list('ABCDEFGHIJ')) + + +def test_one_hot_order2(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + order=['other_categorical', 'page_id'], + index_col=['index', 'id']) + assert dsf._meta.empty + assert all(dsf.columns == list('FGHIJABCDE')) + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert all(sf.columns == list('FGHIJABCDE')) + + +def test_one_hot_disk_categories(clickstream): + with tmpdir() as tmp: + cat_path = os.path.join(tmp, 'cat.pickle') + pd.Series(list('ABCDE')).to_pickle(cat_path) + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': cat_path}, + index_col=['index', 'id']) + assert dsf._meta.empty + sf = dsf.compute() + assert sf.shape == (100, 5) + assert isinstance(sf.index, pd.MultiIndex) + + def 
test_read_npz(): sf = sp.SparseFrame(np.identity(100)) with tmpdir() as tmp: @@ -95,4 +190,17 @@ def test_read_npz(): dsf = dsp.read_npz(os.path.join(tmp, '*.npz')) sf = dsf.compute() - assert np.all(sf.data.toarray() == np.identity(100)) \ No newline at end of file + assert np.all(sf.data.toarray() == np.identity(100)) + + +def test_assign_column(): + s = pd.Series(np.arange(10)) + ds = dd.from_pandas(s, npartitions=2) + + f = pd.DataFrame(np.random.rand(10, 2), columns=['a', 'b']) + dsf = dsp.from_pandas(f, npartitions=2) + + dsf = dsf.assign(new=ds) + assert dsf._meta.empty + sf = dsf.compute() + assert np.all(sf.todense() == f.assign(new=s)) diff --git a/sparsity/test/test_sparse_frame.py b/sparsity/test/test_sparse_frame.py index 8e196cc..61e150c 100644 --- a/sparsity/test/test_sparse_frame.py +++ b/sparsity/test/test_sparse_frame.py @@ -1,15 +1,19 @@ # coding=utf-8 -import os import datetime as dt -import pandas as pd +import os + +#import dask.dataframe as dd +from contextlib import contextmanager -import dask.dataframe as dd import numpy as np +import pandas as pd import pytest -from dask.async import get_sync +from moto import mock_s3 from scipy import sparse - from sparsity import SparseFrame, sparse_one_hot +from sparsity.io import _csr_to_dict + +from .conftest import tmpdir try: import traildb @@ -17,6 +21,41 @@ traildb = False +@contextmanager +def mock_s3_fs(bucket, data=None): + """Mocks an s3 bucket + + Parameters + ---------- + bucket: str + bucket name + data: dict + dictionary with paths relative to bucket and + bytestrings as values. Will mock data in bucket + if supplied. 
+ + Returns + ------- + """ + try: + m = mock_s3() + m.start() + import boto3 + import s3fs + client = boto3.client('s3', region_name='eu-west-1') + client.create_bucket(Bucket=bucket) + if data is not None: + data = data.copy() + for key, value in data.items(): + client.put_object(Bucket=bucket, Key=key, Body=value) + yield + finally: + if data is not None: + for key in data.keys(): + client.delete_object(Bucket=bucket, Key=key) + m.stop() + + # 2017 starts with a sunday @pytest.fixture() def sampledata(): @@ -24,6 +63,8 @@ def gendata(n): sample_data = pd.DataFrame( dict(date=pd.date_range("2017-01-01", periods=n))) sample_data["weekday"] = sample_data.date.dt.weekday_name + sample_data["weekday_abbr"] = sample_data.weekday.apply( + lambda x: x[:3]) sample_data["id"] = np.tile(np.arange(7), len(sample_data) // 7 + 1)[ :len(sample_data)] return sample_data @@ -31,6 +72,26 @@ def gendata(n): return gendata +@pytest.fixture() +def weekdays(): + return ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday'] + + +@pytest.fixture() +def weekdays_abbr(weekdays): + return list(map(lambda x: x[:3], weekdays)) + + +@pytest.fixture() +def groupby_frame(): + shuffle_idx = np.random.permutation(np.arange(100)) + index = np.tile(np.arange(10), 10) + data = np.vstack([np.identity(10) for _ in range(10)]) + t = SparseFrame(data[shuffle_idx, :], index=index[shuffle_idx]) + return t + + @pytest.fixture() def sf_midx(): midx = pd.MultiIndex.from_arrays( @@ -41,17 +102,24 @@ def sf_midx(): sf = SparseFrame(np.identity(5), index=midx, columns=cols) return sf +@pytest.fixture() +def sf_midx_int(): + midx = pd.MultiIndex.from_arrays( + [np.concatenate([np.ones(4), np.zeros(1)]), + pd.date_range("2016-10-01", periods=5)] + ) + cols = list('ABCDE') + sf = SparseFrame(np.identity(5), index=midx, columns=cols) + return sf + def test_empty_init(): sf = SparseFrame(np.array([]), index=[], columns=['A', 'B']) assert sf.data.shape == (0, 2) -def test_groupby(): - 
shuffle_idx = np.random.permutation(np.arange(100)) - index = np.tile(np.arange(10), 10) - data = np.vstack([np.identity(10) for _ in range(10)]) - t = SparseFrame(data[shuffle_idx, :], index=index[shuffle_idx]) +def test_groupby(groupby_frame): + t = groupby_frame res = t.groupby_sum().data.todense() assert np.all(res == (np.identity(10) * 10)) @@ -171,7 +239,7 @@ def test_loc(): np.identity(5)[:3]) -def test_loc_multi_index(sf_midx): +def test_loc_multi_index(sf_midx, sf_midx_int): assert sf_midx.loc['2016-10-01'].data[0, 0] == 1 @@ -187,6 +255,9 @@ def test_loc_multi_index(sf_midx): assert np.all(sf_midx.loc[dt_slice].data.todense() == np.identity(5)[:3]) + assert np.all(sf_midx_int.loc[1].todense() == sf_midx.data[:4,:]) + assert np.all(sf_midx_int.loc[0].todense() == sf_midx.data[4, :]) + def test_set_index(sf_midx): sf = sf_midx.set_index(level=1) @@ -211,6 +282,26 @@ def test_set_index(sf_midx): # assert np.all(sf.loc[[4, 5]].data.todense() == np.identity(5)[[3, 4]]) +def test_save_load_multiindex(sf_midx): + with tmpdir() as tmp: + # test new + path = os.path.join(tmp, 'sf.npz') + sf_midx.to_npz(path) + res = SparseFrame.read_npz(path) + assert isinstance(res.index, pd.MultiIndex) + + # test backwards compatibility + def _to_npz_legacy(sf, filename): + data = _csr_to_dict(sf.data) + data['frame_index'] = sf.index.values + data['frame_columns'] = sf.columns.values + np.savez(filename, **data) + + _to_npz_legacy(sf_midx, path) + res = SparseFrame.read_npz(path) + assert isinstance(res.index, pd.MultiIndex) + + def test_new_column_assign_array(): sf = SparseFrame(np.identity(5)) sf[6] = np.ones(5) @@ -342,7 +433,19 @@ def test_add_no_overlap(complex_example): assert np.all(res.data.todense() == correct) -def test_csr_one_hot_series(sampledata): +def test_csr_one_hot_series_disk_categories(sampledata): + with tmpdir() as tmp: + categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', + 'Thursday', 'Friday', 'Saturday'] + cat_path = os.path.join(tmp, 
'bla.pickle') + pd.Series(categories).to_pickle(cat_path) + sparse_frame = sparse_one_hot(sampledata(49), + categories={'weekday': cat_path}) + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() + assert np.all(res == np.identity(7) * 7) + + +def test_csr_one_hot_series_legacy(sampledata): categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'] sparse_frame = sparse_one_hot(sampledata(49), 'weekday', categories) @@ -350,10 +453,87 @@ def test_csr_one_hot_series(sampledata): assert np.all(res == np.identity(7) * 7) +def test_csr_one_hot_series(sampledata, weekdays, weekdays_abbr): + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = sparse_one_hot(sampledata(49), categories=categories, + order=['weekday', 'weekday_abbr']) + + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() + assert np.all(res == correct) + assert all(sparse_frame.columns == (weekdays + weekdays_abbr)) + + +def test_csr_one_hot_series_other_order(sampledata, weekdays, weekdays_abbr): + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = sparse_one_hot(sampledata(49), categories=categories, + order=['weekday_abbr', 'weekday']) + + assert all(sparse_frame.columns == (weekdays_abbr + weekdays)) + + +def test_csr_one_hot_series_no_order(sampledata, weekdays, weekdays_abbr): + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = sparse_one_hot(sampledata(49), categories=categories) + + assert sorted(sparse_frame.columns) == sorted(weekdays_abbr + weekdays) + + +def test_csr_one_hot_series_prefixes(sampledata, weekdays, weekdays_abbr): + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = sparse_one_hot(sampledata(49), categories=categories, + 
order=['weekday', 'weekday_abbr'], + prefixes=True) + + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() + assert np.all(res == correct) + correct_columns = list(map(lambda x: 'weekday_' + x, weekdays)) \ + + list(map(lambda x: 'weekday_abbr_' + x, weekdays_abbr)) + assert all(sparse_frame.columns == correct_columns) + + +def test_csr_one_hot_series_same_categories(weekdays): + sample_data = pd.DataFrame( + dict(date=pd.date_range("2017-01-01", periods=7))) + sample_data["weekday"] = sample_data.date.dt.weekday_name + sample_data["weekday2"] = sample_data.date.dt.weekday_name + + categories = {'weekday': weekdays, + 'weekday2': weekdays} + + with pytest.raises(ValueError): + sparse_one_hot(sample_data, categories=categories, + order=['weekday', 'weekday2']) + + sparse_frame = sparse_one_hot(sample_data, categories=categories, + order=['weekday', 'weekday2'], + prefixes=True) + + correct_columns = list(map(lambda x: 'weekday_' + x, weekdays)) \ + + list(map(lambda x: 'weekday2_' + x, weekdays)) + assert all(sparse_frame.columns == correct_columns) + + def test_csr_one_hot_series_too_much_categories(sampledata): categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Yesterday', 'Saturday', 'Birthday'] - sparse_frame = sparse_one_hot(sampledata(49), 'weekday', categories) + sparse_frame = sparse_one_hot(sampledata(49), + categories={'weekday': categories}) res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() correct = np.identity(7) * 7 @@ -367,7 +547,7 @@ def test_csr_one_hot_series_too_little_categories(sampledata): categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] with pytest.raises(ValueError): - sparse_one_hot(sampledata(49), 'weekday', categories) + sparse_one_hot(sampledata(49), categories={'weekday': categories}) @pytest.mark.skipif(traildb is False, reason="TrailDB not installed") @@ -393,6 +573,16 @@ def test_npz_io(complex_example): os.remove('/tmp/sparse.npz') 
+def test_npz_io_s3(complex_example): + with mock_s3_fs('sparsity'): + sf, second, third = complex_example + sf.to_npz('s3://sparsity/sparse.npz') + loaded = SparseFrame.read_npz('s3://sparsity/sparse.npz') + assert np.all(loaded.data.todense() == sf.data.todense()) + assert np.all(loaded.index == sf.index) + assert np.all(loaded.columns == sf.columns) + + def test_getitem(): sf = SparseFrame(np.identity(10), columns=list('abcdefghij')) assert sf['a'].data.todense()[0] == 1 @@ -424,12 +614,10 @@ def test_vstack_multi_index(clickstream): df_0 = clickstream.iloc[:len(clickstream) // 2] df_1 = clickstream.iloc[len(clickstream) // 2:] sf_0 = sparse_one_hot(df_0, - categories=list('ABCDE'), - column='page_id', + categories={'page_id': list('ABCDE')}, index_col=['index', 'id']) sf_1 = sparse_one_hot(df_1, - categories=list('ABCDE'), - column='page_id', + categories={'page_id': list('ABCDE')}, index_col=['index', 'id']) res = SparseFrame.vstack([sf_0, sf_1]) assert isinstance(res.index, pd.MultiIndex) @@ -443,36 +631,6 @@ def test_boolean_indexing(): assert res.index.tolist() == [3, 4] -def test_dask_loc(clickstream): - sf = dd.from_pandas(clickstream, npartitions=10) \ - .map_partitions( - sparse_one_hot, - column='page_id', - categories=list('ABCDE'), - meta=list - ) - - res = sf.loc['2016-01-15':'2016-02-15'] - res = SparseFrame.concat(res.compute(get=get_sync).tolist()) - assert res.index.date.max() == dt.date(2016, 2, 15) - assert res.index.date.min() == dt.date(2016, 1, 15) - - -def test_dask_multi_index_loc(clickstream): - sf = dd.from_pandas(clickstream, npartitions=10) \ - .map_partitions( - sparse_one_hot, - column='page_id', - index_col=['index', 'id'], - categories=list('ABCDE'), - meta=list - ) - res = sf.loc['2016-01-15':'2016-02-15'] - res = SparseFrame.vstack(res.compute(get=get_sync).tolist()) - assert res.index.get_level_values(0).date.min() == dt.date(2016, 1, 15) - assert res.index.get_level_values(0).date.max() == dt.date(2016, 2, 15) - - def 
test_rename(): old_names = list('ABCDE') func = lambda x: x + '_new' @@ -523,6 +681,20 @@ def test_repr(): assert isinstance(res, str) +def test_groupby_agg(groupby_frame): + res = groupby_frame.groupby_agg( + level=0, + agg_func=lambda x: x.sum(axis=0) + ).data.todense() + assert np.all(res == (np.identity(10) * 10)) + + res = groupby_frame.groupby_agg( + level=0, + agg_func=lambda x: x.mean(axis=0) + ).data.todense() + assert np.all(res.round() == np.identity(10)) + + def test_init_with_pandas(): df = pd.DataFrame(np.identity(5), index=[ @@ -544,3 +716,68 @@ def test_init_with_pandas(): df['A'] = 'bla' with pytest.raises(TypeError): sf = SparseFrame(df) + + +def test_multiply_rowwise(): + # Row wise multiplication with different types + sf = SparseFrame(np.ones((5, 5))) + other = np.arange(5) + msg = "Row wise multiplication failed" + + # nd.array + other = other.reshape(1, -1) + res = sf.multiply(other, axis=0) + assert np.all(res.sum(axis=0) == 5 * other), msg + + # SparseFrame + _other = SparseFrame(other) + res = sf.multiply(_other, axis=0) + assert np.all(res.sum(axis=0) == 5 * other), msg + + # csr_matrix + _other = _other.data + res = sf.multiply(_other, axis=0) + assert np.all(res.sum(axis=0) == 5 * other), msg + + +def test_multiply_colwise(): + # Column wise multiplication with different types + sf = SparseFrame(np.ones((5, 5))) + other = np.arange(5) + msg = "Column wise multiplication failed" + + # nd.array + other = other.reshape(-1, 1) + res = sf.multiply(other, axis=1) + assert np.all(res.sum(axis=1) == 5 * other), msg + + # SparseFrame + _other = SparseFrame(other) + res = sf.multiply(_other, axis=1) + assert np.all(res.sum(axis=1) == 5 * other), msg + + # csr_matrix + _other = _other.data + _other.toarray() + res = sf.multiply(_other, axis=1) + assert np.all(res.sum(axis=1) == 5 * other), msg + + +def test_drop_single_label(): + old_names = list('ABCDE') + sf = SparseFrame(np.identity(5), columns=old_names) + sf = sf.drop('A', axis=1) + + correct 
= np.identity(5)[:, 1:] + assert sf.columns.tolist() == list('BCDE') + np.testing.assert_array_equal(sf.data.todense(), correct) + + +def test_drop_multiple_labels(): + old_names = list('ABCDE') + sf = SparseFrame(np.identity(5), columns=old_names) + sf = sf.drop(['A', 'C'], axis=1) + + correct = np.identity(5)[:, [1, 3, 4]] + assert sf.columns.tolist() == list('BDE') + np.testing.assert_array_equal(sf.data.todense(), correct) diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000..64fea1c --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1822 @@ + +# Version: 0.18 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. 
+ + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes. 
+ +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. 
+ +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. 
For example "0.11+2.g1076c97.dirty" indicates that the
+tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and
+that this commit is two revisions ("+2") beyond the "0.11" tag. For released
+software (exactly equal to a known tag), the identifier will only contain the
+stripped tag, e.g. "0.11".
+
+Other styles are available. See [details.md](details.md) in the Versioneer
+source tree for descriptions.
+
+## Debugging
+
+Versioneer tries to avoid fatal errors: if something goes wrong, it will tend
+to return a version of "0+unknown". To investigate the problem, run `setup.py
+version`, which will run the version-lookup code in a verbose mode, and will
+display the full contents of `get_versions()` (including the `error` string,
+which may help identify what went wrong).
+
+## Known Limitations
+
+Some situations are known to cause problems for Versioneer. This details the
+most significant ones. More can be found on Github
+[issues page](https://github.com/warner/python-versioneer/issues).
+
+### Subprojects
+
+Versioneer has limited support for source trees in which `setup.py` is not in
+the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are
+two common reasons why `setup.py` might not be in the root:
+
+* Source trees which contain multiple subprojects, such as
+  [Buildbot](https://github.com/buildbot/buildbot), which contains both
+  "master" and "slave" subprojects, each with their own `setup.py`,
+  `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
+  distributions (and upload multiple independently-installable tarballs).
+* Source trees whose main purpose is to contain a C library, but which also
+  provide bindings to Python (and perhaps other languages) in subdirectories.
+
+Versioneer will look for `.git` in parent directories, and most operations
+should get the right version string. 
However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. 
+ +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications when then +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. 
The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. 
+ me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . + setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, 
args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = 
stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. 
However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse 
describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+ +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. 
+ """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. + + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + 
print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. 
+ # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. 
You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + 
except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)